# Age prediction from MRI features

In [50]:
# libraries
import matplotlib.pyplot as plt
import numpy as np
from utils import visualize, create_dataset_age, create_dataset_mri, cv
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import xgboost as xgb

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Create dataset

In [4]:
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target)
# for the moment, remove the diagnosis colums
try:
    data.drop(columns=['DX_01_Cat'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01_Sub'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01'], inplace=True)
except:
    pass

In [5]:
# Drop the rows with some NaNs
print(data.shape)
data.dropna(axis=0, inplace=True)
print(data.shape)
column_names = data.columns[2:]

(1146, 371)
(1146, 371)


In [7]:
# labels and features
y = data[target]
X = data.drop([target, 'ID'], axis=1)

## Fix a baseline (mean age)

In [8]:
# Baseline
mean = np.mean(y)
print('mean_age = {}'.format(mean))
baseline_MSE = sum((mean - y)**2)/len(y)
print('baseline_MSE = {}'.format(baseline_MSE))

mean_age = 10.719744772251309
baseline_MSE = 13.723839851203406


# Explore different regressors and feature selection procedures

In [9]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## SelectKBest + SVR


In [10]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectKBest()),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.5, 0.8, 1, 1.5, 2],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__score_func' : [f_regression],
            'feat_select__k' : [80, 100, 120, 140, 160]}

In [11]:
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1)
# inner CV (model selection)
model = model.fit(X_train, y_train)

In [13]:
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )

Mean cross-validated score of the best estimator: 5.791717460813175
Selected hyperparameters: {'feat_select__k': 80, 'feat_select__score_func': <function f_regression at 0x7f7a0be2aea0>, 'regression__C': 0.5, 'regression__epsilon': 0.5, 'regression__kernel': 'rbf'}


In [14]:
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=6)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))

Split: 1Split: 2
Average expected test MSE: 6.700180382778985


In [15]:
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

True test error: 7.052733583368861


## ExtraTrees + SVR

In [20]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.5, 0.8, 1, 1.5, 2],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100, 130],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [80, 100]
}

In [21]:
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1)
# inner CV (model selection)
model = model.fit(X_train, y_train)

In [23]:
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )

Mean cross-validated score of the best estimator: 5.27372739879798
Selected hyperparameters: {'feat_select__estimator__n_estimators': 100, 'feat_select__max_features': 100, 'feat_select__threshold': -inf, 'regression__C': 0.5, 'regression__epsilon': 0.3, 'regression__kernel': 'rbf'}


In [24]:
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=2)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))

Split: 2
Average expected test MSE: 5.907352805602215


In [25]:
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

True test error: 6.395435888429998


## ExtraTrees + RandomForest

In [41]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', RandomForestRegressor())
])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [50, 100],
              'feat_select__estimator__min_samples_leaf' : [1, 5, 8],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [100],
            'regression__max_depth' : [3, 6, 12],
            'regression__min_samples_leaf' : [1, 3, 5, 8]
}


In [42]:
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.0min finished


In [43]:
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )

Mean cross-validated score of the best estimator: 7.580203309009245
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 1, 'feat_select__estimator__n_estimators': 100, 'feat_select__max_features': 100, 'feat_select__threshold': -inf, 'regression__max_depth': 3, 'regression__min_samples_leaf': 3}


In [44]:
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))

Split: 5
Average expected test MSE: 7.114703042649384


In [45]:
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

True test error: 7.623688421484941


## ExtraTrees + XGBoost

In [74]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'feat_select__estimator__n_estimators' : [65],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 5, 8],
              'feat_select__threshold' : [-np.inf],
              'feat_select__max_features' : [20, 30, 50, 60, 70],
}


In [75]:
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False, n_jobs=-1, verbose = 1)
# inner CV (model selection)
model = model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   51.4s finished




In [79]:
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )

Mean cross-validated score of the best estimator: 6.169314573457627
Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 8, 'feat_select__estimator__n_estimators': 65, 'feat_select__max_features': 20, 'feat_select__threshold': -inf}


In [80]:
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=5)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))


Average expected test MSE: 6.182130998847278


In [81]:
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

True test error: 6.827270787403901
