In [1]:
from os import path
import os
from glob import glob
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, KFold, StratifiedKFold
from sklearn.feature_selection import RFECV, SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from functools import reduce
import pickle
import itertools
from itertools import chain
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as mc
import matplotlib.pyplot as plt

In [2]:
base_path='/ifshome/bwade/NARSAD/Aim_1/'
data=pd.read_csv(base_path+'data/compiled_datasets/y-hdrs6_x-yeo17-diffusion-dem.csv')

In [3]:
def format_xy(dat, arm, demographics=False):
    data_reduced=dat[dat['arm']==arm]
    y=data_reduced['outcome']
    if demographics==False:
        x=data_reduced.drop(['Unnamed: 0', 'screen_id','age', 'sex', 'arm', 'outcome'], axis=1)
    else:
        x=data_reduced.drop(['Unnamed: 0', 'screen_id', 'outcome'], axis=1)
    return x, y

In [4]:
X,y=format_xy(dat=data, arm='k', demographics=False)

## Nested cross validation and tuning

In [5]:
imputer=SimpleImputer(missing_values=np.nan, strategy='median')

rf_mod=RandomForestRegressor(n_estimators=500)
rf_pipeline=Pipeline([('imputation', imputer), ('selection', SelectKBest(f_regression, k=30)), ('random_forest', rf_mod)])

gb_mod=GradientBoostingRegressor()
gb_pipeline=Pipeline([('imputation', imputer), ('selection', SelectKBest(f_regression, k=30)), ('gb_regressor', gb_mod)])

el_mod=ElasticNetCV(cv=10)
el_pipeline=Pipeline([('imputation', imputer), ('selection', SelectKBest(f_regression, k=30)), ('elastic_net', el_mod)])

svm_mod=SVR(kernel='linear')
svm_pipeline=Pipeline([('imputation', imputer), ('scale', StandardScaler()), ('selection', SelectKBest(f_regression, k=30)), ('sv_regressor', svm_mod)])

pipelines=[rf_pipeline, gb_pipeline, el_pipeline, svm_pipeline]
pipe_dict={0: 'RF', 1: 'GB', 2: 'EL', 3: 'SVM'}

In [6]:
rf_grid={'random_forest__n_estimators': [100, 500, 1000],
        'random_forest__max_depth': [2, 4, 6],
        'selection__k': [10, 20, 30]}

gb_grid={'gb_regressor__n_estimators': [25, 50, 100],
        'gb_regressor__learning_rate': [0.05, 0.1, 0.3],
        'gb_regressor__max_depth': [2, 4, 6],
        'gb_regressor__min_samples_split': [2, 4],
        'gb_regressor__min_samples_leaf': [1],
        'selection__k': [10, 20, 30]}

svr_grid={'sv_regressor__C': [0.01, 0.1, 1, 10],
         'selection__k': [10, 20, 30]}

el_grid={'elastic_net__n_alphas': [50, 100],
         'elastic_net__l1_ratio': [1, 0.5],
         'elastic_net__tol':[1e-2, 1e-3],
         'selection__k': [10, 20, 30]}

parameter_grid_list=[rf_grid, gb_grid, el_grid, svr_grid]

inner_cv = KFold(n_splits=10, shuffle=False, random_state=0)
outer_cv = KFold(n_splits=10, shuffle=False, random_state=0)

# clf = GridSearchCV(estimator=gb_pipeline, param_grid=gb_grid, cv=inner_cv)
# nested_score = cross_val_predict(clf, X=X, y=y, cv=outer_cv)
# print(r2_score(y_true=y, y_pred=nested_score))

In [7]:
predicted_dict={}

for i, model in enumerate(pipelines):
    print('Processing {} model...'.format(pipe_dict[i]))
    clf=GridSearchCV(estimator=model, param_grid=parameter_grid_list[i], cv=inner_cv)
    predicted=cross_val_predict(clf, X=X, y=y, cv=outer_cv)
    predicted_dict[pipe_dict[i]]=predicted
    print('{} R2: {:2f}; MSE: {:2f} \n'.format(pipe_dict[i], r2_score(y_true=y, y_pred=predicted), mean_squared_error(y_true=y, y_pred=predicted)))

Processing RF model...




RF R2: 0.146719; MSE: 13.600348 

Processing GB model...




GB R2: 0.236867; MSE: 12.163487 

Processing EL model...




EL R2: 0.108454; MSE: 14.210247 

Processing SVM model...




SVM R2: 0.038559; MSE: 15.324298 





In [None]:
mod_type='RF'
sns.regplot(x=y, y=predicted_dict[mod_type])
print(r2_score(y_true=y, y_pred=predicted_dict[mod_type]))
print(mean_absolute_error(y_true=y, y_pred=predicted_dict[mod_type]))
print(np.corrcoef(x=predicted_dict[mod_type], y=y)[0][1])

In [None]:
scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'explained_variance']
clf=GridSearchCV(estimator=gb_pipeline, param_grid=parameter_grid_list[1], cv=inner_cv)
fitted=cross_validate(clf, X, y, cv=outer_cv, scoring=scoring, return_estimator=True)

In [None]:
print(np.mean(fitted['test_neg_mean_squared_error']))
sns.boxplot(fitted['test_neg_mean_squared_error'])

1. Loop over treatment arms
2. Expand grid search space
3. Add method for cross-treatment predictions