# Age prediction from MRI features

In [31]:
# libraries
import matplotlib.pyplot as plt
import numpy as np
from utils import visualize, create_dataset_age, create_dataset_mri, cv
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso

In [32]:
import warnings
warnings.filterwarnings('ignore')

## Create dataset

In [33]:
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target)
# for the moment, remove the diagnosis colums
try:
    data.drop(columns=['DX_01_Cat'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01_Sub'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01'], inplace=True)
except:
    pass

In [34]:
# Drop the rows with some NaNs
# print(data.shape)
# data.dropna(axis=0, inplace=True)
# print(data.shape)

In [35]:
# labels and features
y = data[target]
X = data.drop([target, 'ID'], axis=1)

## Fix a baseline (mean age)

In [17]:
# Baseline
mean = np.mean(y)
print('mean_age = {}'.format(mean))
baseline_MSE = sum((mean - y)**2)/len(y)
print('baseline_MSE = {}'.format(baseline_MSE))

mean_age = 10.719744772251309
baseline_MSE = 13.723839851203406


# Explore different regressors and feature selection procedures

In [36]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## SVR with no feature selection

In [19]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('regression', SVR())])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.2, 0.3, 0.5, 0.8, 1],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5, 0.7],
            'regression__kernel' : ['rbf', 'poly']}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   27.7s finished


Mean cross-validated score of the best estimator: 40.86264295031016
Selected hyperparameters: {'regression__C': 1, 'regression__epsilon': 0.1, 'regression__kernel': 'poly'}
Split: 6
Average expected test MSE: 43.25064774716446
True test error: 17.239127508259834


## SelectKBest + SVR


In [44]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.5, 0.8],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__score_func' : [f_regression, mutual_info_regression],
            'feat_select__k' : [10, 20, 40, 80, 95]}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   36.9s


Mean cross-validated score of the best estimator: 7.625894965403726
Selected hyperparameters: {'feat_select__k': 10, 'feat_select__score_func': <function f_regression at 0x7fb656a33730>, 'regression__C': 0.8, 'regression__epsilon': 0.1, 'regression__kernel': 'rbf'}
Split: 1Split: 2Split: 3Split: 4Split: 5
Average expected test MSE: 7.625894965403726
True test error: 7.719118076865477


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.6min finished


## ExtraTrees + SVR

In [39]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.3, 0.5, 0.8, 1, 1.5],
            'regression__epsilon' : [0.01, 0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100, 130, 180],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [10, 20, 40, 140, 230]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 375 candidates, totalling 1875 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 1875 out of 1875 | elapsed: 25.7min finished


Mean cross-validated score of the best estimator: 7.304901016006843
Selected hyperparameters: {'feat_select__estimator__n_estimators': 130, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'regression__C': 1.5, 'regression__epsilon': 0.1, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 6.959375581824299
True test error: 7.75742927858555


## ExtraTrees + RandomForest

In [22]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', RandomForestRegressor())
])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [50, 80, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 5, 8],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [40, 140, 180, 200, 230],
            'regression__max_depth' : [2, 3, 6, 12],
            'regression__min_samples_leaf' : [1, 3, 5, 8]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed: 26.0min finished


Mean cross-validated score of the best estimator: 8.459687844612185
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 1, 'feat_select__estimator__n_estimators': 100, 'feat_select__max_features': 40, 'feat_select__threshold': -inf, 'regression__max_depth': 2, 'regression__min_samples_leaf': 8}
Split: 5
Average expected test MSE: 7.966993533796078
True test error: 8.200068968458565


## ExtraTrees + XGBoost

In [37]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [80, 100, 120],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4, 5, 6],
              'feat_select__threshold' : [-np.inf],
              'feat_select__max_features' : [10, 20, 60, 70],
              'regression__booster__alpha' : [0.05, 0.2, 0.5]
              #'regression__booster__max_depth' : [3, 4, 5, 6, 7]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  5.8min finished


Mean cross-validated score of the best estimator: 7.699949371158411
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 4, 'feat_select__estimator__n_estimators': 100, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'regression__booster__alpha': 0.5}

Average expected test MSE: 7.4207154873361585
True test error: 8.326421100468648


## Lasso

In [24]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('regression', Lasso())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__alpha' : [1, 1.2, 1.4, 1.7, 1.9, 2, 2.3, 2.5, 3, 3.5, 4, 6],
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Mean cross-validated score of the best estimator: 13.702507670908801
Selected hyperparameters: {'regression__alpha': 2}
Split: 1Split: 2Split: 3Split: 4Split: 5
Average expected test MSE: 13.702507670908798
True test error: 13.869415734497567


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished


## ExtraTrees + Lasso

In [25]:
pipe = Pipeline([ ('scaling', StandardScaler()),
                 ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', Lasso())
])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [80, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4],
              'feat_select__threshold' : [-np.inf],
              'feat_select__max_features' : [10, 20, 30, 70, 90, 110],
              'regression__alpha' : [0.5, 1, 1.5, 2, 2.5, 3],
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  7.6min finished


Mean cross-validated score of the best estimator: 13.702507670908801
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 1, 'feat_select__estimator__n_estimators': 80, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'regression__alpha': 2}
Split: 5
Average expected test MSE: 13.702507670908798
True test error: 13.869415734497567


## SelectKBEST + XGBoost

In [26]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__score_func' : [f_regression],
            'feat_select__k' : [10, 30, 60, 100, 120, 160],
              'regression__booster__alpha' : [0, 0.05, 0.1, 0.2, 0.6, 1],
              'regression__booster__max_depth' : [3, 4, 5, 6, 7]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 603 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   50.5s finished


Mean cross-validated score of the best estimator: 8.141871573760335
Selected hyperparameters: {'feat_select__k': 10, 'feat_select__score_func': <function f_regression at 0x7fb656a33730>, 'regression__booster__alpha': 0, 'regression__booster__max_depth': 3}

Average expected test MSE: 8.141871573760339
True test error: 8.18585807438065


## CDE

## LGBM

# MRI + DTI

In [45]:
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target, DTI = True)
# for the moment, remove the diagnosis colums
try:
    data.drop(columns=['DX_01_Cat'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01_Sub'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01'], inplace=True)
except:
    pass

# labels and features
y = data[target]
X = data.drop([target, 'ID'], axis=1)
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## SelectKBest + SVR

In [46]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
            'regression__C' : [0.2, 0.5, 0.8, 1, 1.5, 2.5],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__score_func' : [f_regression, mutual_info_regression],
            'feat_select__k' : [10, 30, 80, 100, 120, 160]}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 524 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 774 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1124 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1574 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2124 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 2774 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 3524 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 13.9min finished


Mean cross-validated score of the best estimator: 8.875603040249226
Selected hyperparameters: {'feat_select__k': 160, 'feat_select__score_func': <function mutual_info_regression at 0x7fb656a4d730>, 'imputation__strategy': 'median', 'regression__C': 0.2, 'regression__epsilon': 0.05, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 8.851096243661612
True test error: 8.842062973417255


## ExtraTrees + SVR

In [47]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'regression__C' : [0.3, 0.5, 0.8, 1, 1.5, 2],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100, 130],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [10, 30, 80, 100, 130]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 30.6min finished


Mean cross-validated score of the best estimator: 7.630540341157975
Selected hyperparameters: {'feat_select__estimator__n_estimators': 100, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'imputation__strategy': 'median', 'regression__C': 0.3, 'regression__epsilon': 0.05, 'regression__kernel': 'rbf'}
Split: 5
Average expected test MSE: 7.378802196203989
True test error: 7.147595633706364


## ExtraTrees + XGBoost

In [48]:
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'feat_select__estimator__n_estimators' : [80, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 4, 5, 6, 8],
              'feat_select__threshold' : [-np.inf],
              'feat_select__max_features' : [10, 20, 50, 70, 100, 140],
              'regression__booster__alpha' : [0, 0.05, 0.1, 0.2],
              'regression__booster__max_depth' : [3, 4, 5, 6, 7]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

Fitting 5 folds for each of 4320 candidates, totalling 21600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 36.2min
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 42.4min
[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 46.2min
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 11226 tasks      

Mean cross-validated score of the best estimator: 8.166805203413324
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 2, 'feat_select__estimator__n_estimators': 80, 'feat_select__max_features': 10, 'feat_select__threshold': -inf, 'imputation__strategy': 'median', 'regression__booster__alpha': 0.2, 'regression__booster__max_depth': 6}

Average expected test MSE: 7.970171129707256
True test error: 7.526156153724901


## ExtraTrees + RandomForest

In [None]:
pipe = Pipeline([('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', RandomForestRegressor())
])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['mean', 'median', 'most_frequent'],
              'feat_select__estimator__n_estimators' : [50, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 5, 8],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [100],
            'regression__max_depth' : [2, 3, 6, 8],
            'regression__min_samples_leaf' : [1, 3, 5, 8]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))