# Age prediction from MRI features

In [74]:
# libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from utils import visualize, create_dataset_age, create_dataset_mri, cv
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Create dataset

In [3]:
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target)
# remove the diagnosis colums
data.drop(columns=['DX_01_Cat', 'DX_01', 'DX_01_Sub'], inplace=True)

In [46]:
test_indices = pd.read_csv('data/test_indices.csv')
# Separate test and train set
data_test = pd.merge(data, test_indices, on='ID', how='inner')
data_train = data.loc[~data['ID'].isin(list((set(test_indices['ID']))))]

In [50]:
# labels and features
y_train = data_train[target]
X_train = data_train.drop([target, 'ID'], axis=1)
y_test = data_test[target]
X_test = data_test.drop([target, 'ID'], axis=1)

In [52]:
column_names = X_train.columns

## Fix a baseline (mean age)

In [58]:
# Baseline
mean = np.mean(y_train)
print('mean_age = {}'.format(mean))
baseline_MSE = sum((mean - y_train)**2)/len(y_train)
print('baseline_MSE = {}'.format(baseline_MSE))

mean_age = 10.769986650924025
baseline_MSE = 13.613808757061083


# Explore different regressors and feature selection procedures

## SVR with no feature selection

In [59]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('regression', SVR())])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.5, 1],
            'regression__epsilon' : [0.1, 0.5, 0.7],
            'regression__kernel' : ['rbf']}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.6s finished


Mean cross-validated score of the best estimator: 6.709482422552265
Selected hyperparameters: {'regression__C': 0.5, 'regression__epsilon': 0.7, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 6.709482422552265
True test error: 5.445450380644605


## SelectKBest + SVR


In [60]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [1],
            'regression__epsilon' : [0.1, 0.5, 0.7],
            'regression__kernel' : ['rbf'],
            'feat_select__score_func' : [f_regression],
            'feat_select__k' : [10, 20, 40, 80, 95]}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  52 out of  75 | elapsed:    0.8s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    1.1s finished


Mean cross-validated score of the best estimator: 7.7755761723861525
Selected hyperparameters: {'feat_select__k': 10, 'feat_select__score_func': <function f_regression at 0x7fa759aba730>, 'regression__C': 1, 'regression__epsilon': 0.1, 'regression__kernel': 'rbf'}
Split: 1Split: 2Split: 3Split: 4Split: 5
Average expected test MSE: 7.775576172386151
True test error: 6.210321452613395


## ExtraTrees + SVR

In [61]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.3, 1, 1.5],
            'regression__epsilon' : [0.05, 0.1, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100],}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   33.7s finished


Mean cross-validated score of the best estimator: 6.1947240734490405
Selected hyperparameters: {'feat_select__estimator__n_estimators': 100, 'regression__C': 0.3, 'regression__epsilon': 0.05, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 6.120492336319738
True test error: 5.138094886215391


In [62]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.3, 1, 1.5],
            'regression__epsilon' : [0.05, 0.1, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100]}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   36.7s finished


Mean cross-validated score of the best estimator: 6.2497726989661135
Selected hyperparameters: {'feat_select__estimator__n_estimators': 100, 'regression__C': 0.3, 'regression__epsilon': 0.1, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 6.1740307610545235
True test error: 5.176076343417024


## RandomForest

In [63]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('regression', RandomForestRegressor())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__max_depth' : [2, 3, 6, 12],
            'regression__min_samples_leaf' : [1, 3, 5, 8]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    6.6s finished


Mean cross-validated score of the best estimator: 8.617530716446783
Selected hyperparameters: {'regression__max_depth': 2, 'regression__min_samples_leaf': 1}
Split: 5
Average expected test MSE: 8.468681272169247
True test error: 7.220620939182452


In [64]:
k = 10
weights = model.best_estimator_.steps[1][1].feature_importances_
kth_weight = np.sort(weights)[::-1][k]
k_most_relevant = column_names[weights>= kth_weight]
k_most_relevant

Index(['lh_G.S_frontomargin_thickness', 'lh_G.S_occipital_inf_thickness',
       'lh_G.S_paracentral_thickness', 'lh_G.S_subcentral_thickness',
       'lh_G.S_transv_frontopol_thickness', 'lh_G.S_cingul.Ant_thickness',
       'lh_G.S_cingul.Mid.Ant_thickness', 'lh_G.S_cingul.Mid.Post_thickness',
       'lh_G_cingul.Post.dorsal_thickness',
       'lh_G_cingul.Post.ventral_thickness',
       ...
       'rh_rostralmiddlefrontal_volume', 'rh_superiorfrontal_volume',
       'rh_superiorparietal_volume', 'rh_superiortemporal_volume',
       'rh_supramarginal_volume', 'rh_frontalpole_volume',
       'rh_temporalpole_volume', 'rh_transversetemporal_volume',
       'rh_insula_volume', 'GlobalCorticalThickness'],
      dtype='object', length=369)

## ExtraTrees + XGBoost

In [65]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [80, 100, 120],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4, 5, 6],
              'regression__booster__alpha' : [0.05, 0.2, 0.5]
              #'regression__booster__max_depth' : [3, 4, 5, 6, 7]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:  1.8min finished


Mean cross-validated score of the best estimator: 5.526072228192527
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 1, 'feat_select__estimator__n_estimators': 80, 'regression__booster__alpha': 0.05}

Average expected test MSE: 5.5032408918160165
True test error: 4.597963756967613


## Lasso

In [66]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('regression', Lasso())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__alpha' : [1, 1.2, 1.4, 1.7, 1.9, 2, 2.3, 2.5, 3, 3.5, 4, 6],
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Mean cross-validated score of the best estimator: 13.739976826962371
Selected hyperparameters: {'regression__alpha': 1.9}
Split: 1Split: 2Split: 3Split: 4Split: 5
Average expected test MSE: 13.739976826962373
True test error: 14.3209324643211


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.7s finished


## ExtraTrees + Lasso

In [67]:
pipe = Pipeline([ ('scaling', StandardScaler()),
                 ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', Lasso())
])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [80, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4],
              'feat_select__threshold' : [-np.inf],
              'feat_select__max_features' : [10, 20, 30, 70, 90, 110],
              'regression__alpha' : [0.5, 1, 1.5, 2, 2.5, 3],
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  8.8min finished


Mean cross-validated score of the best estimator: 13.739976826962371
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 1, 'feat_select__estimator__n_estimators': 80, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'regression__alpha': 2}
Split: 5
Average expected test MSE: 13.739976826962373
True test error: 14.3209324643211


## SelectKBEST + XGBoost

In [68]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__score_func' : [f_regression],
            'feat_select__k' : [10, 30, 60, 100, 120, 160],
              'regression__booster__alpha' : [0, 0.05, 0.1, 0.2, 0.6, 1],
              'regression__booster__max_depth' : [3, 4, 5, 6, 7]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 601 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   57.9s finished


Mean cross-validated score of the best estimator: 8.285576482110569
Selected hyperparameters: {'feat_select__k': 10, 'feat_select__score_func': <function f_regression at 0x7fa759aba730>, 'regression__booster__alpha': 0, 'regression__booster__max_depth': 3}

Average expected test MSE: 8.285576482110567
True test error: 6.416581002334373


# MRI + DTI

In [69]:
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target, DTI = True)
# for the moment, remove the diagnosis colums
try:
    data.drop(columns=['DX_01_Cat'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01_Sub'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01'], inplace=True)
except:
    pass

# labels and features
y = data[target]
X = data.drop([target, 'ID'], axis=1)
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## SelectKBest + SVR

In [70]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
            'regression__C' : [0.2, 0.5, 0.8, 1, 1.5, 2.5],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__score_func' : [f_regression, mutual_info_regression],
            'feat_select__k' : [10, 30, 80, 100, 120, 160]}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 524 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 774 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1124 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1574 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2124 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 2774 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 3524 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 14.2min finished


Mean cross-validated score of the best estimator: 8.887435232673859
Selected hyperparameters: {'feat_select__k': 160, 'feat_select__score_func': <function mutual_info_regression at 0x7fa759a52730>, 'imputation__strategy': 'mean', 'regression__C': 0.2, 'regression__epsilon': 0.05, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 8.878170274415526
True test error: 8.785654315666354


## ExtraTrees + SVR

In [71]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'regression__C' : [0.3, 0.5, 0.8, 1, 1.5, 2],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100],
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.8min finished


Mean cross-validated score of the best estimator: 6.911194891050914
Selected hyperparameters: {'feat_select__estimator__n_estimators': 100, 'imputation__strategy': 'mean', 'regression__C': 0.3, 'regression__epsilon': 0.1, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 6.858542234247179
True test error: 6.398964606842202


## ExtraTrees + XGBoost

In [72]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'feat_select__estimator__n_estimators' : [80, 100, 120],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4, 5, 6],
              'regression__booster__alpha' : [0.05, 0.2, 0.5]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed:  3.4min finished


Mean cross-validated score of the best estimator: 5.969363604014561
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 6, 'feat_select__estimator__n_estimators': 120, 'imputation__strategy': 'mean', 'regression__booster__alpha': 0.5}

Average expected test MSE: 5.798785211840278
True test error: 5.9898162720166725


## RandomForest

In [73]:
pipe = Pipeline([('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', RandomForestRegressor())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'feat_select__estimator__n_estimators' : [50, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 5, 8],
            'regression__max_depth' : [2, 3, 6, 8],
            'regression__min_samples_leaf' : [1, 3, 5, 8]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  5.2min finished


Mean cross-validated score of the best estimator: 8.399436322812138
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 8, 'feat_select__estimator__n_estimators': 100, 'imputation__strategy': 'most_frequent', 'regression__max_depth': 2, 'regression__min_samples_leaf': 3}
Split: 5
Average expected test MSE: 8.397745416653773
True test error: 8.253370360974069
