In [2]:
import dask
import dask.threaded
import numpy as np
import pandas as pd
from dask import delayed
from scipy.stats import randint
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from xgboost import XGBRegressor

# Read the three candidate feature matrices (read in the order PCA, manual, LASSO)
pca, mf, lasso = (
    pd.read_csv(csv_path)
    for csv_path in ("PCA_features.csv", "manual_features.csv", "LASSO_features.csv")
)

# Read the response table and pull out the regression target column
res = pd.read_csv("response.csv")
y = res['avg_salary']

# (label, DataFrame) pairs iterated when launching the hyperparameter searches
feature_datasets = [
    ("PCA", pca),
    ("Manual Features", mf),
    ("LASSO", lasso),
]

# Filled in later with one result dict per dataset label
results = dict()

# Define a delayed function for the randomized search
@delayed
def perform_randomized_search(dataset_name, data, y, test_size=0.2, n_splits=10,
                              n_iter=100, random_state=42):
    """Tune ``n_estimators`` of an XGBoost regressor on one feature set and score it.

    The function is wrapped in ``dask.delayed``, so calling it only builds a
    task; the work happens when the task is computed.

    Parameters
    ----------
    dataset_name : str
        Label for the feature set; printed for progress and echoed in the result.
    data : feature matrix accepted by ``train_test_split`` / ``XGBRegressor.fit``
        Feature columns, row-aligned with ``y``.
    y : target values, row-aligned with ``data``
    test_size : float, default 0.2
        Fraction of rows held out for the final test-set metrics.
    n_splits : int, default 10
        Number of shuffled K-fold splits used by both the search and the
        cross-validation scoring.
    n_iter : int, default 100
        Number of ``n_estimators`` candidates sampled by the randomized search.
    random_state : int, default 42
        Seed shared by the split, the K-fold shuffling, the search and the model.

    Returns
    -------
    dict
        Keys: ``"dataset_name"``, ``"Best Hyperparameters"``,
        ``"Cross-Validation RMSE Scores"``, ``"Mean Cross-Validation RMSE"``,
        ``"Cross-Validation MAE Scores"``, ``"Mean Cross-Validation MAE"``,
        ``"RMSE on Test Set"``, ``"MAE on Test Set"``.
    """
    print(dataset_name)

    # Hold out a test split; the hyperparameter search only ever sees the train part
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=test_size, random_state=random_state
    )

    xgb_regressor = XGBRegressor(random_state=random_state)

    # Only n_estimators is tuned; scipy's randint excludes the upper bound,
    # so candidates are drawn from 1..500
    param_dist = {
        'n_estimators': randint(1, 501),
    }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Randomized search over n_estimators, selecting by (negated) MSE
    random_search = RandomizedSearchCV(xgb_regressor, param_distributions=param_dist, n_iter=n_iter,
                                       scoring='neg_mean_squared_error', cv=kf, random_state=random_state)

    random_search.fit(X_train, y_train)

    # Best configuration, already refit on the whole training split by the search
    best_xgb_model = random_search.best_estimator_

    print("search complete")

    # Score both metrics in a single cross-validation pass so each fold is
    # fitted once instead of once per metric. The folds and the (cloned) model
    # are the same as with two cross_val_score calls, so the per-fold scores
    # are unchanged.
    # NOTE(review): this scoring runs on the full `data`/`y`, i.e. it also
    # includes the rows held out above as the test split.
    cv_scores = cross_validate(
        best_xgb_model, data, y, cv=kf,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'),
    )
    # sklearn reports errors negated ("higher is better"); flip the sign back
    cv_rmse_scores = np.sqrt(-cv_scores['test_neg_mean_squared_error'])
    cv_mae_scores = -cv_scores['test_neg_mean_absolute_error']

    # Evaluate the refit best model on the held-out test split
    predictions = best_xgb_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    # Return results
    return {
        "dataset_name": dataset_name,
        "Best Hyperparameters": random_search.best_params_,
        "Cross-Validation RMSE Scores": cv_rmse_scores,
        "Mean Cross-Validation RMSE": cv_rmse_scores.mean(),
        "Cross-Validation MAE Scores": cv_mae_scores,
        "Mean Cross-Validation MAE": cv_mae_scores.mean(),
        "RMSE on Test Set": rmse,
        "MAE on Test Set": mae
    }

# Build one lazy search task per (label, feature frame) pair; nothing runs yet
delayed_tasks = [
    perform_randomized_search(label, frame, y)
    for label, frame in feature_datasets
]

# Execute all tasks on dask's threaded scheduler; returns a tuple of result dicts
results_list = dask.compute(*delayed_tasks, scheduler='threads')

# Index each result dict by the dataset label it carries
for result in results_list:
    label_key = result["dataset_name"]
    results[label_key] = result

Manual Features
PCA
LASSO
search complete
search complete
search complete


In [4]:
# Dump the raw tuple of per-dataset result dicts returned by dask.compute
print(results_list)

({'dataset_name': 'PCA', 'Best Hyperparameters': {'n_estimators': 72}, 'Cross-Validation RMSE Scores': array([20.62066905, 17.58010469, 18.0952271 , 23.32811401, 17.61999003,
       11.62401388, 25.7467744 , 15.54170888, 20.81377756, 21.19633134]), 'Mean Cross-Validation RMSE': 19.21667109435179, 'Cross-Validation MAE Scores': array([ 9.39599948,  7.62551503,  8.11597783,  9.07770983,  8.5073378 ,
        5.05743823, 11.26819286,  6.78481664, 11.36541387, 10.64159922]), 'Mean Cross-Validation MAE': 8.784000078922995, 'RMSE on Test Set': 19.320333630669825, 'MAE on Test Set': 9.25872958906545}, {'dataset_name': 'Manual Features', 'Best Hyperparameters': {'n_estimators': 88}, 'Cross-Validation RMSE Scores': array([17.64179262, 16.14933994, 22.35414787, 22.9720448 , 17.42032474,
       11.32339084, 21.54105668, 14.72262454, 21.40249777, 17.27745164]), 'Mean Cross-Validation RMSE': 18.28046714267173, 'Cross-Validation MAE Scores': array([ 9.11335108,  7.37094236, 12.54637538,  9.33997912, 

In [5]:
import numpy as np
from xgboost import XGBRegressor

# Define a function to print the top n features by importance
def print_top_features(importances, feature_names, top_n=10):
    """Print the highest-scoring features, one ``name: score`` line each.

    Parameters
    ----------
    importances : iterable of numbers
        Importance score for each feature, in the same order as ``feature_names``.
    feature_names : iterable
        Names paired positionally with ``importances`` (pairing stops at the
        shorter of the two iterables).
    top_n : int, default 10
        Maximum number of lines to print.

    Scores are shown with four decimal places, largest first; features with
    equal scores keep their original relative order.
    """
    ranked = sorted(
        zip(feature_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, score in ranked[:top_n]:
        print(f"{name}: {score:.4f}")

# Take each dataset's tuned hyperparameters from the randomized-search results
# instead of retyping the numbers by hand, so this cell cannot drift out of sync
# when the search is re-run. Keys are the display names used below; the search
# stored the LASSO results under the label "LASSO".
hyperparameters = {
    'PCA': results['PCA']['Best Hyperparameters'],
    'Manual Features': results['Manual Features']['Best Hyperparameters'],
    'LASSO Features': results['LASSO']['Best Hyperparameters']
}

# Iterate through each dataset
for dataset_name, data in [("PCA", pca), ("Manual Features", mf), ("LASSO Features", lasso)]:
    # Same split (test_size and seed) as used in the search, so the model is
    # fitted on the same training rows; the test part is not needed here
    X_train, _, y_train, _ = train_test_split(data, y, test_size=0.2, random_state=42)

    # Create an XGBRegressor with the tuned number of trees
    xgb_regressor = XGBRegressor(
        n_estimators=hyperparameters[dataset_name]['n_estimators'],
        random_state=42
    )

    # Fit the XGBoost model
    xgb_regressor.fit(X_train, y_train)

    # Print the top 10 features by importance
    print(f"Top 10 Features Ranked by Importance for {dataset_name}:")
    print_top_features(xgb_regressor.feature_importances_, data.columns, top_n=10)
    print("\n")


Top 10 Features Ranked by Importance for PCA:
is_senior: 0.1692
Principal_Component_3: 0.1256
python_yn: 0.1036
Principal_Component_17: 0.0548
Principal_Component_2: 0.0480
Principal_Component_9: 0.0468
employer_provided: 0.0448
Principal_Component_15: 0.0313
Principal_Component_14: 0.0287
Principal_Component_16: 0.0275


Top 10 Features Ranked by Importance for Manual Features:
hourly: 0.2804
job_title_analyst: 0.1121
job_title_director: 0.0940
Sector_Oil, Gas, Energy & Utilities: 0.0642
Ownership_Nonprofit Organization: 0.0568
employer_provided: 0.0563
is_senior: 0.0545
python_yn: 0.0445
Sector_Health Care: 0.0322
Sector_Finance: 0.0239


Top 10 Features Ranked by Importance for LASSO Features:
Industry_Health Care Services & Hospitals: 0.1292
Headquarters_Mountain View, CA: 0.1227
Job Title_Data Science Manager: 0.0775
job_simp_analyst: 0.0735
job_simp_director: 0.0655
job_state_CA: 0.0400
Company Name_Takeda Pharmaceuticals
3.7: 0.0331
is_senior: 0.0306
Job Title_Director II, Data 