In [1]:
import dask
import dask.threaded
import numpy as np
import pandas as pd
from dask import delayed
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold, train_test_split
from sklearn.model_selection import cross_validate

# Read the three candidate feature tables and the response table, in that order
pca, mf, lasso, res = map(
    pd.read_csv,
    ("PCA_features.csv", "manual_features.csv", "LASSO_features.csv", "response.csv"),
)

# Response variable: the 'avg_salary' column of the response table
y = res['avg_salary']

# (label, feature table) pairs, one per feature set to evaluate
feature_datasets = list(zip(("PCA", "Manual Features", "LASSO"), (pca, mf, lasso)))

# Result dictionaries keyed by dataset label; starts empty
results = dict()

# Define a delayed function for the randomized search
@delayed
def perform_randomized_search(dataset_name, data, y):
    """Tune, cross-validate and test a random forest on one feature set.

    Because of the ``@delayed`` decorator, calling this function only builds
    a lazy dask task; the dictionary described below is produced when that
    task is computed.

    Parameters
    ----------
    dataset_name : str
        Label for the feature set; printed for progress and echoed back in
        the result under ``"dataset_name"``.
    data : feature table passed to ``train_test_split`` / the forest
    y : response values, split alongside ``data``

    Returns
    -------
    dict
        The best ``n_estimators`` found, the per-fold and mean
        cross-validation RMSE and MAE, and the RMSE and MAE on the
        held-out 20% test split.
    """
    print(dataset_name)

    # Hold out 20% of the rows; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.2, random_state=42
    )

    rf_regressor = RandomForestRegressor(random_state=42)

    # Only the number of trees is tuned: integers 1..500 (upper bound exclusive)
    param_dist = {
        'n_estimators': randint(1, 501),
    }

    # The same shuffled 10-fold splitter drives the search and the CV report
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Sample 100 candidates, ranked by (negated) mean squared error
    random_search = RandomizedSearchCV(
        rf_regressor,
        param_distributions=param_dist,
        n_iter=100,
        scoring='neg_mean_squared_error',
        cv=kf,
        random_state=42,
    )
    random_search.fit(X_train, y_train)

    best_rf_model = random_search.best_estimator_

    print("search complete")

    # Score both metrics in ONE cross-validation pass. The previous version
    # ran cross_val_score twice, fitting every fold's forest twice; with the
    # estimator and splitter both seeded, the scores are unchanged.
    # NOTE(review): the CV metrics are computed on the full (data, y), i.e.
    # they include the held-out test rows -- confirm this is intended.
    cv_results = cross_validate(
        best_rf_model,
        data,
        y,
        cv=kf,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'),
    )
    # sklearn reports negated errors so that "greater is better"; flip the sign
    cv_rmse_scores = np.sqrt(-cv_results['test_neg_mean_squared_error'])
    cv_mae_scores = -cv_results['test_neg_mean_absolute_error']

    # Evaluate the tuned model on the held-out split
    predictions = best_rf_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    # Return results
    return {
        "dataset_name": dataset_name,
        "Best Hyperparameters": random_search.best_params_,
        "Cross-Validation RMSE Scores": cv_rmse_scores,
        "Mean Cross-Validation RMSE": cv_rmse_scores.mean(),
        "Cross-Validation MAE Scores": cv_mae_scores,
        "Mean Cross-Validation MAE": cv_mae_scores.mean(),
        "RMSE on Test Set": rmse,
        "MAE on Test Set": mae
    }

# Build one lazy task per (label, feature table) pair; nothing is executed yet
delayed_tasks = [
    perform_randomized_search(label, features, y)
    for label, features in feature_datasets
]

# Run all tasks on dask's threaded scheduler; returns one result per task
results_list = dask.compute(*delayed_tasks, scheduler='threads')

# Index each result dictionary by the dataset label it carries
for result in results_list:
    results.update({result["dataset_name"]: result})


PCA
Manual Features
LASSO
search complete
search complete
search complete


In [4]:
# Dump the raw tuple of per-dataset result dictionaries returned by dask.compute
print(results_list)

({'dataset_name': 'PCA', 'Best Hyperparameters': {'n_estimators': 92}, 'Cross-Validation RMSE Scores': array([19.61316201, 20.21878848, 21.07849338, 23.4958912 , 18.13905775,
       13.99408958, 23.86541004, 19.30725082, 20.23802092, 20.99772217]), 'Mean Cross-Validation RMSE': 20.094788636129525, 'Cross-Validation MAE Scores': array([12.88797101, 12.7015942 , 12.56720035, 13.12264982, 12.05552291,
        9.73545828, 14.1586369 , 11.27511751, 13.71276439, 13.7363396 ]), 'Mean Cross-Validation MAE': 12.595325499412457, 'RMSE on Test Set': 20.07725366017056, 'MAE on Test Set': 12.710388094543331}, {'dataset_name': 'Manual Features', 'Best Hyperparameters': {'n_estimators': 202}, 'Cross-Validation RMSE Scores': array([17.71855795, 18.65275495, 22.66868323, 23.79715823, 18.01544619,
       13.38410002, 23.84151786, 14.93435742, 20.57037174, 16.76614498]), 'Mean Cross-Validation RMSE': 19.03490925674156, 'Cross-Validation MAE Scores': array([11.22171617, 10.88023102, 14.50876371, 13.268464

In [5]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Define a function to print the top n features by importance
def print_top_features(importances, feature_names, top_n=10):
    """Print the ``top_n`` highest-scoring features, one ``name: score`` per line.

    Each entry of ``feature_names`` is paired with the entry of
    ``importances`` at the same position. The pairs are ordered by
    descending importance (ties keep their input order) and the leading
    ``top_n`` are printed with the importance shown to four decimals.
    """
    ranked = sorted(
        zip(feature_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, score in ranked[:top_n]:
        print("{}: {:.4f}".format(name, score))

# Map each display label used in this cell to the key under which the
# randomized search stored its result (only the LASSO label differs).
result_keys = {
    'PCA': 'PCA',
    'Manual Features': 'Manual Features',
    'LASSO Features': 'LASSO',
}

# Define hyperparameters for each dataset.
# Read from `results` rather than hand-copied from the printed output
# (previously hard-coded as 92 / 202 / 54), so this cell cannot drift out
# of sync when the search is re-run.
hyperparameters = {
    label: {'n_estimators': results[key]["Best Hyperparameters"]['n_estimators']}
    for label, key in result_keys.items()
}

# Iterate through each dataset
for dataset_name, data in [("PCA", pca), ("Manual Features", mf), ("LASSO Features", lasso)]:
    # Same 80/20 split and seed as the search, so the forest is fitted on the
    # same training rows; the test portions are not needed here.
    X_train, _, y_train, _ = train_test_split(data, y, test_size=0.2, random_state=42)

    # Create a RandomForestRegressor with the tuned number of trees
    random_forest_regressor = RandomForestRegressor(
        n_estimators=hyperparameters[dataset_name]['n_estimators'],
        random_state=42
    )
    
    # Fit the Random Forest model
    random_forest_regressor.fit(X_train, y_train)

    # Print the top 10 features by importance
    print(f"Top 10 Features Ranked by Importance for {dataset_name}:")
    print_top_features(random_forest_regressor.feature_importances_, data.columns, top_n=10)
    print("\n")


Top 10 Features Ranked by Importance for PCA:
Principal_Component_3: 0.1599
is_senior: 0.1012
Principal_Component_9: 0.0774
Principal_Component_17: 0.0565
python_yn: 0.0558
Principal_Component_2: 0.0512
Principal_Component_12: 0.0398
Principal_Component_10: 0.0395
Principal_Component_18: 0.0355
Principal_Component_14: 0.0345


Top 10 Features Ranked by Importance for Manual Features:
job_title_analyst: 0.1297
hourly: 0.1263
is_senior: 0.1077
HQ_Longitude: 0.0675
Longitude: 0.0637
job_title_director: 0.0496
desc_len: 0.0492
Log_Population: 0.0462
Rating: 0.0445
age: 0.0383


Top 10 Features Ranked by Importance for LASSO Features:
is_senior: 0.1270
job_simp_analyst: 0.1167
Rating: 0.1024
job_state_CA: 0.0775
python_yn: 0.0597
job_simp_director: 0.0431
Sector_Health Care: 0.0310
Industry_Health Care Services & Hospitals: 0.0248
Job Title_Director II, Data Science - GRM Actuarial: 0.0217
Industry_Enterprise Software & Network Solutions: 0.0209


