In [1]:
# Utilities
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Models
from sklearn.linear_model import GammaRegressor, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor


# Tuning
from sklearn.model_selection import GridSearchCV

# Multioutput
from sklearn.multioutput import MultiOutputRegressor as MOR

# Scoring
from sklearn.metrics import mean_absolute_error

# Model Persistence
from joblib import dump, load

In [2]:
# Data loading
df = pd.read_csv('../../train_with_zip_pop_weather825000.csv')

features = ["Hour", "Weekend", "Month", "radius_in_miles", "population", 
            "population_density", "land_area_in_sqmi", "water_area_in_sqmi", 
            "housing_units", "occupied_housing_units", "median_home_value", 
            "median_household_income", "temp", "dwpt", "rhum", "prcp", 
            "wdir", "wspd", "pres", "coco"]

percentiles = ['p20', 'p40', 'p50', 'p60', 'p80']
targets = [f"TotalTimeStopped_{percentile}" for percentile in percentiles]

X = df[features]
y = df[targets]

X_selection, y_selection = X.sample(n=10000), y.sample(n=10000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [6]:
# Preprocessing
preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),    # All features are numerical
        ("decomposition", PCA())
    ]
)

# Model fitting
def fit_model(model, X, y, grid_search=True):
    print(f"Fitting {model['regr']}")
    pipe = Pipeline(
        steps=[
            ('preprocess', preprocessor),
            ('regression', MOR(model['regr']))
        ]
    )
    param_grid = model['params']
    
    if grid_search:
        gs = GridSearchCV(estimator=pipe,
                          param_grid=param_grid,
                          cv=3,
                          scoring="neg_mean_absolute_error",
                          n_jobs=-1,
                          verbose=2
                         )

        gs.fit(X, y)        
        return gs
    else:
        pipe.fit(X, y)
        return pipe

In [5]:
# Models and desired parameters
models = {
    'sgd': {
        'regr': SGDRegressor(),
        'params': {
            'preprocess__decomposition__n_components': [2, 3, 5],
            'regression__estimator__penalty': ["l2", "l1"],
            'regression__estimator__alpha': [.0001, .0005, .001],
            'regression__estimator__learning_rate': ["optimal"],
            'regression__estimator__max_iter': [10000]
        }
    },
    'rfr': {
        'regr': RandomForestRegressor(),
        'params': {
            'preprocess__decomposition__n_components': [2, 3, 5],
            'regression__estimator__n_estimators': [50, 100, 250],
            'regression__estimator__min_samples_leaf': [1, 5, 25]     
        }
    },
    'gbr': {
        'regr': GradientBoostingRegressor(),
        'params': {
            'preprocess__decomposition__n_components': [2, 3, 5],
            'regression__estimator__learning_rate': [50, 100, 250],
            'regression__estimator__n_estimators': [50, 100, 250] 
        }
    },
    'abr': {
        'regr': AdaBoostRegressor(),
        'params': {
            'preprocess__decomposition__n_components': [2, 3, 5],
            'regression__estimator__learning_rate': [50, 100, 250],
            'regression__estimator__n_estimators': [25, 50, 100]
        }
    }
}

In [7]:
# Model selection and hyperparameter tuning
gs_models = {}
for model in models:
    gs_models[model] = fit_model(models[model], X_selection, y_selection)
    print(f"\nBest score for {models[model]['regr']}: {gs_models[model].best_score_}") 
    print(f"Best parameters for {models[model]['regr']}: {gs_models[model].best_params_}")    
    print('#############################################################')

Fitting SGDRegressor()
Fitting 3 folds for each of 18 candidates, totalling 54 fits

Best score for SGDRegressor(): -1.2879217563110025e+19
Best parameters for SGDRegressor(): {'preprocess__decomposition__n_components': 3, 'regression__estimator__alpha': 0.001, 'regression__estimator__learning_rate': 'optimal', 'regression__estimator__max_iter': 10000, 'regression__estimator__penalty': 'l2'}
#############################################################
Fitting RandomForestRegressor()
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best score for RandomForestRegressor(): -11.461904528346073
Best parameters for RandomForestRegressor(): {'preprocess__decomposition__n_components': 2, 'regression__estimator__min_samples_leaf': 25, 'regression__estimator__n_estimators': 250}
#############################################################
Fitting GradientBoostingRegressor()
Fitting 3 folds for each of 27 candidates, totalling 81 fits


 -9.12898503e+195              nan -4.40781424e+119 -9.81934722e+234
              nan -2.34514844e+084 -2.46455961e+163              nan
 -4.38608366e+099 -4.95639742e+192              nan -4.68014884e+119
 -3.93760465e+226              nan -2.14885769e+084 -2.01375823e+161
              nan -4.01881752e+099 -1.11112522e+186              nan
 -4.28819820e+119 -3.18199256e+226              nan]
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights



Best score for GradientBoostingRegressor(): -2.148857687731374e+84
Best parameters for GradientBoostingRegressor(): {'preprocess__decomposition__n_components': 5, 'regression__estimator__learning_rate': 50, 'regression__estimator__n_estimators': 50}
#############################################################
Fitting AdaBoostRegressor()
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best score for AdaBoostRegressor(): -12.479259173667819
Best parameters for AdaBoostRegressor(): {'preprocess__decomposition__n_components': 2, 'regression__estimator__learning_rate': 100, 'regression__estimator__n_estimators': 100}
#############################################################


In [8]:
'''
Full training, use the same abbreviation used in models dictionary above
for top-level key and maintanin the dictionary structure as shown below
'''
chosen_models = {
    'abr': {
        'regr': None,
        'params': None,
        'model': None,
        'mae': None
    },
    'rfr': {
        'regr': None,
        'params': None,
        'model': None,
        'mae': None
    }
}


for model in chosen_models:
    chosen_models[model]['regr'] = models[model]['regr']
    chosen_models[model]['params'] = gs_models[model].best_params_.copy()
    
    chosen_models[model]['model'] = fit_model(chosen_models[model], X_train, y_train, grid_search=False)
    preds = chosen_models[model]['model'].predict(X_test)
    chosen_models[model]['mae'] = mean_absolute_error(y_test, preds)
    
    dump(chosen_models[model], f'./model_cache/{model}.joblib') 
    mae = chosen_models[model]['mae']
    print(f'{model} score: {mae}')

Fitting AdaBoostRegressor()
Fitting RandomForestRegressor()


In [23]:
# Loading, using stored models
for model in chosen_models:
    load_model = load(f'./model_cache/{model}.joblib')
    preds = load_model['model'].predict(X_test)
    print(mean_absolute_error(y_test, preds))

31.076917667779583
11.487551477587072
