In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
import shap

In [2]:
df = pd.read_csv("./data/model_data.csv")
df = df.drop(columns=["ID", "WS"])
df_forest = df.dropna()
y_col = "salary_perc"

In [3]:
model = RandomForestRegressor()

In [4]:
param_dist = {
    "n_estimators": [5, 10, 20, 30, 40, 50],
    "max_depth": [2, 5, 15, 20, 25, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4, 10],
    "bootstrap": [True, False]
}

In [5]:
random_search = RandomizedSearchCV(model, param_dist, cv=5, verbose=2, n_jobs=-1, scoring="neg_mean_absolute_error")

In [None]:
random_search.fit(df_forest.drop(columns=[y_col]), df_forest[y_col])

In [None]:
results_random = pd.DataFrame(random_search.cv_results_)
results_random.sort_values(by="rank_test_score", inplace=True)
results_random

In [8]:
grid_search = GridSearchCV(model, param_dist, cv=5, verbose=2, n_jobs=-1, scoring="neg_mean_absolute_error")

In [None]:
grid_search.fit(df_forest.drop(columns=[y_col]), df_forest[y_col])

In [None]:
# for prediction:
results_grid = pd.DataFrame(grid_search.cv_results_)
results_grid.sort_values(by="rank_test_score", inplace=True)
results_grid

In [12]:
print(f"Best MAE from random search: {random_search.best_score_}")
print(f"Best params from random search: {random_search.best_params_}")

Best MAE from random search: -0.03150639659081355
Best params from random search: {'n_estimators': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': True}


In [13]:
print(f"Best MAE from grid search: {grid_search.best_score_}")
print(f"Best params from grid search: {grid_search.best_params_}")

Best MAE from grid search: -0.030548575751998364
Best params from grid search: {'bootstrap': True, 'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}


In [14]:
df_shap = df.dropna()
y = df["salary_perc"]
X = df.drop(columns="salary_perc")

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = RandomForestRegressor()