
# Age prediction on healthy patients

In [1]:
import warnings
warnings.filterwarnings('ignore')
from utils import create_dataset_mri, cv_for_cde, create_dataset_eeg
from cde.density_estimator import MixtureDensityNetwork
import numpy as np
import tensorflow as tf
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from tensorflow.python.keras.activations import tanh
from sklearn.impute import SimpleImputer
import pandas as pd
# libraries
import matplotlib.pyplot as plt
import numpy as np
from utils import visualize, create_dataset_age, create_dataset_eeg, cv, create_dataset_mri
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import seaborn as sns
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
import xgboost as xgb
import pandas as pd
from sklearn.linear_model import Lasso


Instructions for updating:
Use the retry module or similar alternatives.


# MRI+EEG

In [2]:

target = 'Age'
data_mri = create_dataset_mri()
data_eeg = create_dataset_eeg(clusters = True)
data_eeg = data_eeg.rename(columns={'id': 'ID'})
# Mixed dataset eeg + mri
data = pd.merge(data_mri, data_eeg, on=['ID', 'Age', 'DX_01', 'DX_01_Cat', 'DX_01_Sub'], how='inner')
data = data[data['DX_01']=='No Diagnosis Given']
test = data.sample(frac = 0.5)
id_test = test['ID']
test = data[data['ID'].isin(id_test)]
train = data[~data['ID'].isin(id_test)]
train.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01', 'ID'], inplace=True)
test.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01', 'ID'], inplace=True)
train = np.array(train)
test = np.array(test)
y_train = train[:, 0]
y_test = test[:, 0]
X_train = train[:,1:]
X_test = test[:,1:]
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

In [3]:
# Impute missing values
imp = SimpleImputer(strategy = 'median')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [4]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))
# Define the model
model = MixtureDensityNetwork('MDNe', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=100, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

100/100 [100%] ██████████████████████████████ Elapsed: 3s | loss: 90.498
mean log-loss train: 1.5877
Test MSE: 6.237250908371777


In [5]:
(mse, r2, mae) = cv_for_cde(X_train, y_train.flatten(), 'mixed', h, n_splits=5, want_r2 = True, want_mae = True, hidden_sizes = (16,16))
print('mse: {}'.format(np.mean(mse)))
print('r2: {}'.format(np.mean(r2)))
print('mae: {}'.format(np.mean(mae)))

1000/1000 [100%] ██████████████████████████████ Elapsed: 10s | loss: 57.455
mean log-loss train: 1.2768
MSE: 4.650842368031215
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 71.046
mean log-loss train: 1.5788
MSE: 12.450846516802828
1000/1000 [100%] ██████████████████████████████ Elapsed: 10s | loss: 59.991
mean log-loss train: 1.3041
MSE: 3.9370605093811544
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 62.055
mean log-loss train: 1.3490
MSE: 6.3780274109236395
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 56.059
mean log-loss train: 1.2187
MSE: 8.207860183240546
mse: 7.124927397675878
r2: 0.28564651519818246
mae: 2.1317628992387254


In [6]:
y_train = y_train.reshape(y_train.shape[0])
y_test = y_test.reshape(y_test.shape[0])
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['median'],
              'feat_select__estimator__n_estimators' : [100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 5, 8],
              'regression__booster__alpha' : [0.1],
              'regression__booster__max_depth' : [3, 5, 6]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring =  'neg_mean_squared_error', cv=5,
                   iid=False, n_jobs=-1, verbose = 1)

# Nested CV (model evaluation)
(estimated_test_error, r2, mae) = cv(model, data=X_train, labels=y_train, n_splits=5, want_r2 = True, want_mae = True)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
print('\nAverage expected test r2: {}'.format(np.mean(r2)))
print('\nAverage expected test mae: {}'.format(np.mean(mae)))
# effective test MSE
model_fitted = model.fit(X_train, y_train)
# see what has been chosen    
print('Selected hyperparameters: {}'.format(model_fitted.best_params_) )
y_pred = model_fitted.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))
print('Test r2: {}'.format(r2_score(y_true= y_test, y_pred = y_pred)))
print('Test mae: {}'.format(mean_absolute_error(y_true= y_test, y_pred = y_pred)))

Split: 1Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.0s finished


Split: 2Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.9s finished


Split: 3Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Split: 4Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Split: 5Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.5s finished



Average expected test MSE: 6.861382178406624

Average expected test r2: 0.23772676350714902

Average expected test mae: 2.0367252960452458
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.9s finished


Selected hyperparameters: {'feat_select__estimator__min_samples_leaf': 5, 'feat_select__estimator__n_estimators': 100, 'imputation__strategy': 'median', 'regression__booster__alpha': 0.1, 'regression__booster__max_depth': 5}
True test error: 7.0381527226557985
Test r2: 0.46049074990444305
Test mae: 2.0276507730633964


# MRI only

In [7]:
target = 'Age'
data = create_dataset_mri()
data = data[data['DX_01']=='No Diagnosis Given']
test = data[data['ID'].isin(id_test)]
train = data[~data['ID'].isin(id_test)]
train.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01', 'ID'], inplace=True)
test.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01', 'ID'], inplace=True)
train = np.array(train)
test = np.array(test)
y_train = train[:, 0]
y_test = test[:, 0]
X_train = train[:,1:]
X_test = test[:,1:]
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

In [8]:
target = 'Age'
data = create_dataset_mri()
data = data[data['DX_01']=='No Diagnosis Given']

data.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01', 'ID'], inplace=True)


In [9]:
data = np.array(data)
y = data[:, 0]
X = data[:,1:]
y = y.reshape((-1,1))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=8)

In [10]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))
# Define the model
model = MixtureDensityNetwork('MDNmri', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=100, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

100/100 [100%] ██████████████████████████████ Elapsed: 5s | loss: 112.638
mean log-loss train: 1.5221
Test MSE: 4.311100126053298


In [11]:
(mse, r2, mae) = cv_for_cde(X_train, y_train.flatten(), 'mixed99', h, n_splits=5, want_r2 = True, want_mae = True, hidden_sizes = (16,16))
print('mse: {}'.format(np.mean(mse)))
print('r2: {}'.format(np.mean(r2)))
print('mae: {}'.format(np.mean(mae)))

1000/1000 [100%] ██████████████████████████████ Elapsed: 12s | loss: 85.471
mean log-loss train: 1.4487
MSE: 6.0214861031148645
1000/1000 [100%] ██████████████████████████████ Elapsed: 13s | loss: 87.405
mean log-loss train: 1.4814
MSE: 3.3603505652523964
1000/1000 [100%] ██████████████████████████████ Elapsed: 15s | loss: 81.520
mean log-loss train: 1.3817
MSE: 7.147658304742312
1000/1000 [100%] ██████████████████████████████ Elapsed: 14s | loss: 75.673
mean log-loss train: 1.2826
MSE: 5.539315721604884
1000/1000 [100%] ██████████████████████████████ Elapsed: 16s | loss: 86.652
mean log-loss train: 1.4442
MSE: 2.5198218287145084
mse: 4.917726504685793
r2: 0.6041770882057382
mae: 1.764228646330014


In [None]:
y_train = y_train.reshape(y_train.shape[0])
y_test = y_test.reshape(y_test.shape[0])
pipe = Pipeline([ ('imputation', SimpleImputer()), ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor())),
  ('regression', xgb.XGBRegressor())])
# Prepare sets of parameters for gridsearch
parameters = {'imputation__strategy' : ['median'],
              'feat_select__estimator__n_estimators' : [100],
              'feat_select__estimator__min_samples_leaf' : [1, 2, 5, 8],
              'regression__booster__alpha' : [0.1],
              'regression__booster__max_depth' : [3, 5, 6]
}
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring =  'neg_mean_squared_error', cv=5,
                   iid=False, n_jobs=-1, verbose = 1)

# Nested CV (model evaluation)
(estimated_test_error, r2, mae) = cv(model, data=X_train, labels=y_train, n_splits=5, want_r2 = True, want_mae = True)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))
print('\nAverage expected test r2: {}'.format(np.mean(r2)))
print('\nAverage expected test mae: {}'.format(np.mean(mae)))
# effective test MSE
model_fitted = model.fit(X_train, y_train)
# see what has been chosen    
print('Selected hyperparameters: {}'.format(model_fitted.best_params_) )
y_pred = model_fitted.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))
print('Test r2: {}'.format(r2_score(y_true= y_test, y_pred = y_pred)))
print('Test mae: {}'.format(mean_absolute_error(y_true= y_test, y_pred = y_pred)))

Split: 1Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
