In [2]:
import pandas as pd
import statsmodels.api as sm

MODEL_PATH = "currentOlsSolution.pkl"
ACTIVATION_PATH = "../../data/activation_data.csv"
TRAIN_PATH = "../../data/training_data.csv"

ols_model = sm.load(MODEL_PATH)
activation_df = pd.read_csv(ACTIVATION_PATH)
train_df = pd.read_csv(TRAIN_PATH)

In [8]:
TARGET = "salary_usd_normalized"

DROP_COLS = [
    "job_id",
    "job_title",
    "skills_required",
    "posting_date",
    "salary_usd"
]

# Rebuild training features (reference)
X_train_ref = train_df.drop(columns=DROP_COLS + [TARGET])
X_train_ref = pd.get_dummies(X_train_ref, drop_first=True)

# Rebuild activation features
X_activation = activation_df.drop(columns=DROP_COLS, errors="ignore")
X_activation = pd.get_dummies(X_activation, drop_first=True)

# Align activation columns to training columns
X_activation = X_activation.reindex(columns=X_train_ref.columns, fill_value=0)

# Add constant EXACTLY once
X_activation = sm.add_constant(X_activation, has_constant="add")

# Ensure numeric dtype
X_activation = X_activation.astype(float)

print("Activation shape:", X_activation.shape)
print("Model expects:", len(ols_model.params), "features")


Activation shape: (1, 11)
Model expects: 11 features


In [9]:
y_pred_norm = ols_model.predict(X_activation)

salary_min = train_df["salary_usd"].min()
salary_max = train_df["salary_usd"].max()

y_pred_usd = y_pred_norm * (salary_max - salary_min) + salary_min

print(f"OLS Predicted Salary (USD): ${y_pred_usd.iloc[0]:,.2f}")

OLS Predicted Salary (USD): $150,517.97


In [10]:
salary_min = train_df["salary_usd"].min()
salary_max = train_df["salary_usd"].max()

y_pred_usd = y_pred_norm * (salary_max - salary_min) + salary_min

print(f"OLS Predicted Salary (USD): ${float(y_pred_usd.iloc[0]):,.2f}")

OLS Predicted Salary (USD): $150,517.97


Create this table once (code or manual):

Model	RMSE	RÂ²
ANN	lower	higher
OLS	higher	lower