In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold, train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

pca = pd.read_csv("PCA_features.csv")
mf = pd.read_csv("manual_features.csv")
res = pd.read_csv("response.csv")
data = pd.merge(pca, mf, left_index=True, right_index=True)

y = res['avg_salary']

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

base_regressor = DecisionTreeRegressor()

adaboost_regressor = AdaBoostRegressor(base_regressor, random_state=42)

param_dist = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(adaboost_regressor, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=kf, random_state=42)
random_search.fit(X_train, y_train)

best_adaboost_model = random_search.best_estimator_

cv_rmse_scores = np.sqrt(-cross_val_score(best_adaboost_model, data, y, cv=kf, scoring='neg_mean_squared_error'))
cv_mae_scores = -cross_val_score(best_adaboost_model, data, y, cv=kf, scoring='neg_mean_absolute_error')

predictions = best_adaboost_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)

print(f"Best Hyperparameters: {random_search.best_params_}")
print(f"Cross-Validation RMSE Scores: {cv_rmse_scores}")
print(f"Mean Cross-Validation RMSE: {cv_rmse_scores.mean()}")
print(f"Cross-Validation MAE Scores: {cv_mae_scores}")
print(f"Mean Cross-Validation MAE: {cv_mae_scores.mean()}")
print(f"RMSE on Test Set: {rmse}")
print(f"MAE on Test Set: {mae}")

Best Hyperparameters: {'n_estimators': 150, 'learning_rate': 0.01}
Cross-Validation RMSE Scores: [17.96357425 15.08376611 20.39806562 23.56725569 17.23819612  9.26523581
 25.69611917 12.95978708 18.13174385 17.2411356 ]
Mean Cross-Validation RMSE: 17.754487929770907
Cross-Validation MAE Scores: [ 8.35333333  6.21333333  9.16216216  9.25        7.2027027   4.60810811
 10.45945946  4.60135135  8.81756757  8.13513514]
Mean Cross-Validation MAE: 7.6803153153153145
RMSE on Test Set: 16.71338530406535
MAE on Test Set: 7.197986577181208
