# Age prediction from eeg features

In [19]:
# libraries
import matplotlib.pyplot as plt
import numpy as np
from utils import visualize, create_dataset_age, create_dataset_eeg, cv
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
import seaborn as sns
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

## Create dataset

In [20]:
target = 'Age'
# use average and clusters eeg features
# consider all the patients
data = create_dataset_eeg(SCORE = target,  clusters = True)
# for the moment, remove the diagnosis colums
try:
    data.drop(columns=['DX_01_Cat'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01_Sub'], inplace=True)
except:
    pass
try:
    data.drop(columns=['DX_01'], inplace=True)
except:
    pass
# remove the quality-rating column, so we have only numeric data
data.drop(columns=['quality_rating'], inplace = True)

  if (await self.run_code(code, result,  async_=asy)):


In [21]:
# Drop the rows with some NaNs
print(data.shape)
data.dropna(axis=0, inplace=True)
print(data.shape)
column_names = data.columns[2:]

(787, 350)
(476, 350)


In [22]:
# labels and features
y = data[target]
X = data.drop([target, 'id'], axis=1)

## Fix a baseline (mean age)

In [23]:
# Baseline
mean = np.mean(y)
print('mean_age = {}'.format(mean))
baseline_MSE = sum((mean - y)**2)/len(y)
print('baseline_MSE = {}'.format(baseline_MSE))

mean_age = 10.64585212184874
baseline_MSE = 12.214027694284637


# Explore different regressors and feature selection procedures

In [24]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## ExtraTrees + SVR

In [25]:
pipe = Pipeline([ ('scaling', StandardScaler()),
  ('feat_select', SelectFromModel(ExtraTreesRegressor(100))),
  ('regression', SVR())
])
# Prepare sets of parameters for gridsearch
parameters = {'regression__C' : [0.5, 0.8, 1, 1.5, 2],
            'regression__epsilon' : [0.05, 0.1, 0.3, 0.5],
            'regression__kernel' : ['rbf'],
            'feat_select__estimator__n_estimators' : [100, 130],
            'feat_select__threshold' : [-np.inf],
            'feat_select__max_features' : [80, 100]
}

In [None]:
model = GridSearchCV(estimator=pipe, param_grid=parameters, scoring = make_scorer(mean_squared_error), cv=5,
                   iid=False)
# inner CV (model selection)
model = model.fit(X_train, y_train)

In [None]:
# see what has been chosen
print('Mean cross-validated score of the best estimator: {}'.format(model.best_score_)  )      
print('Selected hyperparameters: {}'.format(model.best_params_) )

In [None]:
# outer CV (model evaluation)
estimated_test_error = cv(model.best_estimator_, data=X_train, labels=y_train, n_splits=2)
print('\nAverage expected test MSE: {}'.format(np.mean(estimated_test_error)))

In [18]:
# effective test MSE
y_pred = model.best_estimator_.predict(X_test)
print('True test error: {}'.format(mean_squared_error(y_pred, y_test)))

True test error: 7.092298281442232
