In [22]:
import dask
import dask.threaded
import numpy as np
import pandas as pd
from dask import delayed
from scipy.stats import uniform, randint
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    KFold,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor

# Read the three candidate feature tables (in the order PCA, manual, LASSO)
pca, mf, lasso = (
    pd.read_csv(csv_path)
    for csv_path in ("PCA_features.csv", "manual_features.csv", "LASSO_features.csv")
)

# Read the response table and pull out the column used as the regression target
res = pd.read_csv("response.csv")
y = res['avg_salary']

# (label, feature frame) pairs; the label becomes the key in `results`
feature_datasets = [
    ("PCA", pca),
    ("Manual Features", mf),
    ("LASSO", lasso),
]

# Empty mapping that later code fills with one result dict per dataset label
results = dict()

# Define a delayed function for the randomized search
@delayed
def perform_randomized_search(dataset_name, data, y):
    """Tune an AdaBoost regressor on one feature set and collect error metrics.

    Because of the ``@delayed`` decorator, calling this function only builds a
    lazy dask task; the body runs when the task is computed.

    Steps performed by the body:
      1. Hold out 20% of the rows as a test set (``random_state=42``).
      2. Run a 100-iteration ``RandomizedSearchCV`` over ``n_estimators`` and
         ``learning_rate`` using 10-fold shuffled CV on the training split,
         scored by negative mean squared error.
      3. Cross-validate the best estimator with the same 10 folds to obtain
         per-fold RMSE and MAE.
      4. Score the best estimator on the held-out test set.

    Parameters
    ----------
    dataset_name : str
        Label for this feature set; printed for progress and echoed back in
        the returned dict under ``"dataset_name"``.
    data : feature table accepted by ``train_test_split`` / scikit-learn
        One row per observation.
    y : response values aligned row-for-row with ``data``.

    Returns
    -------
    dict
        Keys: ``"dataset_name"``, ``"Best Hyperparameters"``,
        ``"Cross-Validation RMSE Scores"``, ``"Mean Cross-Validation RMSE"``,
        ``"Cross-Validation MAE Scores"``, ``"Mean Cross-Validation MAE"``,
        ``"RMSE on Test Set"``, ``"MAE on Test Set"``.
    """
    # Emit the whole line (text + newline) in a single write: several of these
    # tasks run concurrently in threads, and a plain print() writes the text
    # and the newline separately, which lets lines from different tasks run
    # together in the cell output.
    print(f"{dataset_name}\n", end="")

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

    # Initialize the base regressor and AdaBoostRegressor
    base_regressor = DecisionTreeRegressor()
    adaboost_regressor = AdaBoostRegressor(base_regressor, random_state=42)

    # Define hyperparameter search space:
    # n_estimators is an integer in [1, 500], learning_rate is drawn from [0, 1]
    param_dist = {
        'n_estimators': randint(1, 501),
        'learning_rate': uniform(0, 1),
    }

    # The same fold generator is used for the search and for the final CV scores
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Perform RandomizedSearchCV to find the best hyperparameters
    random_search = RandomizedSearchCV(adaboost_regressor, param_distributions=param_dist, n_iter=100,
                                       scoring='neg_mean_squared_error', cv=kf, random_state=42)

    random_search.fit(X_train, y_train)

    # best_estimator_ has been refit on the training split with the winning parameters
    best_adaboost_model = random_search.best_estimator_

    print("search complete\n", end="")

    # Calculate cross-validation RMSE and MAE scores in ONE pass: cross_validate
    # fits the model once per fold and evaluates both scorers on that fit,
    # instead of refitting every fold a second time for the second metric.
    # NOTE(review): this cross-validates on the full `data`/`y`, i.e. including
    # the rows held out as the test set above, while the hyperparameters were
    # tuned on the training split only -- confirm this is intended.
    cv_scores = cross_validate(
        best_adaboost_model, data, y, cv=kf,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'),
    )
    # The scorers are negated errors, so flip the sign back before reporting
    cv_rmse_scores = np.sqrt(-cv_scores['test_neg_mean_squared_error'])
    cv_mae_scores = -cv_scores['test_neg_mean_absolute_error']

    # Make predictions on the test set
    predictions = best_adaboost_model.predict(X_test)

    # Calculate RMSE and MAE on the test set
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    # Return results
    return {
        "dataset_name": dataset_name,
        "Best Hyperparameters": random_search.best_params_,
        "Cross-Validation RMSE Scores": cv_rmse_scores,
        "Mean Cross-Validation RMSE": cv_rmse_scores.mean(),
        "Cross-Validation MAE Scores": cv_mae_scores,
        "Mean Cross-Validation MAE": cv_mae_scores.mean(),
        "RMSE on Test Set": rmse,
        "MAE on Test Set": mae
    }

# Build one lazy task per (label, feature frame) pair; nothing executes yet
delayed_tasks = [
    perform_randomized_search(label, frame, y)
    for label, frame in feature_datasets
]

# Execute all tasks with dask's threaded scheduler; one result dict per task
results_list = dask.compute(*delayed_tasks, scheduler='threads')

# Index each result dict by the dataset label it carries
for result in results_list:
    results.update({result["dataset_name"]: result})






PCAManual Features
LASSO

search complete
search complete
search complete


In [23]:
def _as_printable(value):
    """Return a NumPy array as a plain Python list; return anything else unchanged.

    Only ``np.ndarray`` has ``.tolist()``; a built-in ``list`` does not, so it
    must be passed through as-is rather than converted.
    """
    return value.tolist() if isinstance(value, np.ndarray) else value


# Print every stored result, one section per dataset.
# Nested dicts (e.g. the best hyperparameters) are printed as an indented
# sub-list; arrays are shown as plain lists so all elements are visible.
for dataset_name, dataset_dict in results.items():
    print(f"Dataset: {dataset_name}")
    for key, value in dataset_dict.items():
        if isinstance(value, dict):
            print(f"{key}:")
            for sub_key, sub_value in value.items():
                print(f"  {sub_key}: {_as_printable(sub_value)}")
        else:
            print(f"{key}: {_as_printable(value)}")
    print("\n")


Dataset: PCA
dataset_name: PCA
Best Hyperparameters:
  learning_rate: 0.8036720768991145
  n_estimators: 44
Cross-Validation RMSE Scores: [18.810901094843917, 19.79292802998081, 21.305452290371527, 21.837467801922458, 16.356356727749777, 9.288360923685548, 20.737581761075646, 16.264702288715295, 19.45837553287474, 18.338686769570838]
Mean Cross-Validation RMSE: 18.21908132207906
Cross-Validation MAE Scores: [8.42, 8.093333333333334, 9.304054054054054, 7.547297297297297, 7.4391891891891895, 3.9932432432432434, 8.864864864864865, 6.162162162162162, 10.04054054054054, 8.641891891891891]
Mean Cross-Validation MAE: 7.850657657657658
RMSE on Test Set: 19.824650779357754
MAE on Test Set: 9.10738255033557


Dataset: Manual Features
dataset_name: Manual Features
Best Hyperparameters:
  learning_rate: 0.9717120953891037
  n_estimators: 131
Cross-Validation RMSE Scores: [16.202263216394595, 18.11868280716528, 19.671027536057295, 23.68936070920469, 14.752576504242013, 10.813530063417424, 20.798908

In [25]:
# Helper that prints the highest-ranked features of a fitted model
def print_top_features(importances, feature_names, top_n=10):
    """Print the ``top_n`` features with the largest importance, one per line.

    Parameters
    ----------
    importances : iterable of numbers
        Importance score for each feature, in the same order as ``feature_names``.
    feature_names : iterable
        Feature labels paired positionally with ``importances``.
    top_n : int, default 10
        How many of the highest-scoring features to print.

    Each line has the form ``<name>: <score>`` with the score formatted to four
    decimal places. Features with equal scores keep their input order.
    """
    ranked = sorted(
        zip(feature_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, score in ranked[:top_n]:
        print(f"{name}: {score:.4f}")

# Unfitted decision tree handed to every AdaBoost model below as its base learner
base_regressor = DecisionTreeRegressor()

# Fixed AdaBoost settings per dataset label.
# NOTE(review): the PCA and Manual Features values match the search output
# printed earlier in this notebook; the LASSO values are presumably copied from
# the same run -- confirm, since they are not re-read from `results`.
hyperparameters = {
    'PCA': {'n_estimators': 44, 'learning_rate': 0.8036720768991145},
    'Manual Features': {'n_estimators': 131, 'learning_rate': 0.9717120953891037},
    'LASSO Features': {'n_estimators': 346, 'learning_rate': 0.7106628896857874}
}

# Walk the labels (dict insertion order) alongside their feature frames
for dataset_name, data in zip(hyperparameters, (pca, mf, lasso)):
    # Same 80/20 split and seed as the search; only the training part is needed here
    X_train, _, y_train, _ = train_test_split(data, y, test_size=0.2, random_state=42)

    # Build the model from this dataset's stored settings and fit it in one step
    # (fit() returns the estimator itself)
    ada_boost_regressor = AdaBoostRegressor(
        base_regressor,
        random_state=42,
        **hyperparameters[dataset_name],
    ).fit(X_train, y_train)

    # Report the ten columns the fitted ensemble weights most heavily
    print(f"Top 10 Features Ranked by Importance for {dataset_name}:")
    print_top_features(ada_boost_regressor.feature_importances_, data.columns, top_n=10)
    print("\n")

Top 10 Features Ranked by Importance for PCA:
Principal_Component_3: 0.1243
is_senior: 0.0843
Principal_Component_9: 0.0800
Principal_Component_17: 0.0600
Principal_Component_16: 0.0528
Principal_Component_10: 0.0439
Principal_Component_12: 0.0421
Principal_Component_11: 0.0365
Principal_Component_15: 0.0363
python_yn: 0.0355


Top 10 Features Ranked by Importance for Manual Features:
Longitude: 0.0827
is_senior: 0.0779
job_title_analyst: 0.0702
hourly: 0.0673
desc_len: 0.0656
job_title_director: 0.0600
Rating: 0.0582
HQ_Longitude: 0.0580
age: 0.0507
Log_Population: 0.0477


Top 10 Features Ranked by Importance for LASSO Features:
Rating: 0.0967
python_yn: 0.0802
job_state_CA: 0.0693
is_senior: 0.0680
job_simp_director: 0.0605
job_simp_analyst: 0.0453
Job Title_Data Science Manager: 0.0399
Location_San Francisco, CA: 0.0296
Revenue_$5 to $10 million (USD): 0.0212
Competitors_-1: 0.0210


