In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_kmo
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.svm import SVR, LinearSVR
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, KFold, StratifiedKFold
from sklearn.feature_selection import RFECV, SelectKBest, f_regression, SelectFromModel
from sklearn.pipeline import Pipeline
from functools import reduce
import pickle
import itertools
from itertools import chain
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as mc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the imaging feature table and the two HDRS tables (end of treatment, follow-up).
img = pd.read_csv('../data/baseline_rsfc-yeo17_diffusion.csv', index_col=[0])
dem = pd.read_csv('../data/ketamine_endtreatment_hdrs.csv', index_col=[0])
dem_followup = pd.read_csv('../data/ketamine_followup_hdrs.csv', index_col=[0])

# Attach the imaging columns to each HDRS table through the shared 'screen_id' column.
df = pd.merge(dem, img, on='screen_id')
df_followup = pd.merge(dem_followup, img, on='screen_id')
df.shape

(60, 448)

## Factor analyze HDRS data
#### Compute KMO

In [3]:
def get_kmo(data):
    """Print the KMO measures for the 'baseline' columns of `data`.

    Every column whose name contains 'baseline' is passed to ``calculate_kmo``;
    the first element of its result is printed as 'iMSA' and the second as
    'Overall MSA'. Nothing is returned.
    """
    is_baseline = data.columns.to_series().str.contains('baseline')
    baseline_items = data[data.columns[is_baseline]]
    kmo_result = calculate_kmo(baseline_items)
    per_item_msa = kmo_result[0]
    overall_msa = kmo_result[1]
    print('iMSA: {} \n'.format(per_item_msa))
    print('Overall MSA: {}'.format(overall_msa))


get_kmo(df)

iMSA: [0.62502512 0.57444776 0.46678683 0.66906508 0.48038619 0.30801207
 0.48403207 0.64859315 0.43608313 0.5628611  0.55868638 0.63590734
 0.48166926 0.49856915 0.59764058 0.50340844 0.53068733] 

Overall MSA: 0.5366922316301639




KMO scores are low overall: the overall MSA (about 0.54) is below the 0.6 usually expected for a viable factor solution. I will therefore go ahead with the two-factor solution identified here but interpret it with caution, and/or add factors from other studies.

In [4]:
def get_factor_items(data, n_factors, threshold):
    """List, per factor, the 'change' columns whose loading magnitude exceeds `threshold`.

    A ``FactorAnalyzer`` with oblimin rotation is fitted on the columns of `data`
    whose names contain 'baseline'. For each of the `n_factors` factors, the
    boolean mask ``abs(loading) > threshold`` is used to pick column names from
    the columns whose names contain 'change'.

    Returns a list with one column Index per factor.
    """
    name_series = data.columns.to_series()
    baseline_items = data[data.columns[name_series.str.contains('baseline')]]
    change_columns = data.columns[name_series.str.contains('change')]

    analyzer = FactorAnalyzer(n_factors=n_factors, rotation='oblimin', is_corr_matrix=False)
    analyzer.fit(baseline_items)

    # Boolean mask with one column per factor.
    salient = np.abs(analyzer.loadings_) > threshold

    # NOTE(review): the mask is derived from the baseline columns but applied to the
    # change columns by position -- assumes both sets list the items in the same
    # order; confirm against the input file.
    items = []
    for factor in range(n_factors):
        items.append(change_columns[salient[:, factor]])
    return items


fitems = get_factor_items(df, n_factors=2, threshold=.3)
print(fitems[0], fitems[1], sep='\n')

Index(['hamd_depressed_mood_change', 'hamd_guilt_change',
       'hamd_suicide_change', 'hamd_activities_change',
       'hamd_retardation_change', 'hamd_somsxs_gastro_change'],
      dtype='object')
Index(['hamd_insomnia_early_change', 'hamd_insomnia_middle_change',
       'hamd_anxiety_psychic_change', 'hamd_anxiety_somatic_change',
       'hamd_somsxs_general_change', 'hamd_hypochondriasis_change'],
      dtype='object')


In [5]:
def compute_factor_scores(data, items):
    """Sum the columns belonging to each factor into one score per row.

    Parameters
    ----------
    data : DataFrame
        Frame holding the item columns.
    items : sequence
        One collection of column labels per factor.

    Returns
    -------
    DataFrame
        One column per entry of `items`, named 'Factor_1', 'Factor_2', ... in
        that order; each column is the row-wise sum of that factor's columns.
    """
    scores = {
        'Factor_{}'.format(position): data[columns].sum(axis=1)
        for position, columns in enumerate(items, start=1)
    }
    return pd.DataFrame.from_dict(scores)

# Score both merged frames with the same factor item sets.
factor_df, factor_df_followup = (
    compute_factor_scores(frame, fitems) for frame in (df, df_followup)
)

In [6]:
# Change-score columns summed into the 'hdrs_6_change' outcome.
HDRS6_CHANGE_ITEMS=['hamd_depressed_mood_change', 'hamd_guilt_change', 'hamd_activities_change',
                    'hamd_retardation_change', 'hamd_anxiety_psychic_change', 'hamd_somsxs_general_change']


def build_outcome_table(data, factor_scores):
    """Assemble the outcome table for one merged frame.

    Parameters
    ----------
    data : DataFrame
        Frame containing the item-level change columns.
    factor_scores : DataFrame
        Factor score columns sharing `data`'s index.

    Returns
    -------
    DataFrame
        Columns 'hdrs_17_change' (row-wise sum of every column whose name
        contains 'change'), 'hdrs_6_change' (row-wise sum of
        ``HDRS6_CHANGE_ITEMS``), followed by the columns of `factor_scores`,
        joined on the index.
    """
    # NOTE(review): 'hdrs_17_change' picks columns by substring -- assumes only the
    # item-level change scores contain 'change' in their name; confirm.
    change_columns=data.columns[data.columns.to_series().str.contains('change')]
    totals=pd.DataFrame({
        'hdrs_17_change': data[change_columns].sum(axis=1),
        'hdrs_6_change': data[HDRS6_CHANGE_ITEMS].sum(axis=1),
    })
    return totals.merge(factor_scores, right_index=True, left_index=True)


# Build both outcome tables through the same helper so they cannot drift apart.
hdrs=build_outcome_table(df, factor_df)
hdrs_followup=build_outcome_table(df_followup, factor_df_followup)
hdrs.head()

Unnamed: 0,hdrs_17_change,hdrs_6_change,Factor_1,Factor_2
0,-17,-8,-8,-8
1,-6,-5,-7,3
2,-21,-14,-9,-10
3,-12,-6,-7,-4
4,-5,-4,-5,0


In [7]:
# Predictors: the columns of `img` taken from the follow-up frame, minus 'screen_id'.
X = df_followup[img.columns].drop(columns=['screen_id'])

## Fit Models

In [8]:
# Template imputer (median fill of NaN); each pipeline receives its own clone.
imputer=SimpleImputer(missing_values=np.nan, strategy='median')


def _build_pipeline(step_name, estimator):
    """Return an impute -> scale -> select -> estimator pipeline.

    Parameters
    ----------
    step_name : str
        Name of the final step; the parameter grids address the estimator's
        hyper-parameters through this prefix.
    estimator : estimator object
        Final regressor of the pipeline.

    The imputer is cloned per pipeline: ``Pipeline.fit`` fits its steps in
    place, so sharing a single instance would let a direct fit of one pipeline
    overwrite the imputer state used by the others.
    """
    return Pipeline([
        ('imputation', sklearn.base.clone(imputer)),
        ('scale', StandardScaler()),
        ('selection', SelectKBest(f_regression)),
        (step_name, estimator),
    ])


rf_mod=RandomForestRegressor(n_jobs=10, random_state=0)
rf_pipeline=_build_pipeline('random_forest', rf_mod)

gb_mod=GradientBoostingRegressor(random_state=0)
gb_pipeline=_build_pipeline('gb_regressor', gb_mod)

svm_mod=SVR(kernel='linear')
svm_pipeline=_build_pipeline('sv_regressor', svm_mod)

# `pipe_dict` maps a position in `pipelines` to the label used in the printed results.
pipelines=[rf_pipeline, gb_pipeline, svm_pipeline]
pipe_dict={0: 'RF', 1: 'GB', 2: 'SVM'}

In [9]:
# Hyper-parameter grids. Keys use the '<pipeline step>__<parameter>' convention, so the
# prefixes must match the step names given when the pipelines were built.
rf_grid={'random_forest__n_estimators': [100, 500, 1000],
        'random_forest__max_depth': [2, 4, 6],
        'selection__k': [10, 20, 30]}

gb_grid={'gb_regressor__n_estimators': [25, 50, 100],
        'gb_regressor__learning_rate': [0.05, 0.1, 0.3],
        'gb_regressor__max_depth': [2, 4, 6],
        'gb_regressor__min_samples_split': [2, 4],
        'gb_regressor__min_samples_leaf': [1],
        'selection__k': [10, 20, 30]}

svr_grid={'sv_regressor__C': [0.01, 0.1, 1, 10],
         'selection__k': [10, 20, 30]}

# Same order as `pipelines`: RF, GB, SVM.
parameter_grid_list=[rf_grid, gb_grid, svr_grid]

# Splitters for nested cross-validation: `inner_cv` for the grid search, `outer_cv`
# for the out-of-fold predictions.
# `random_state` is intentionally not passed: it has no effect when shuffle=False, and
# scikit-learn >= 0.24 raises ValueError for that combination. The unshuffled
# splits are unchanged.
inner_cv = KFold(n_splits=10, shuffle=False)
outer_cv = KFold(n_splits=10, shuffle=False)

In [10]:
# Nested cross-validation: for every outcome and every model, tune the pipeline with
# GridSearchCV on `inner_cv` and collect out-of-fold predictions on `outer_cv`.
# Result layout: results_by_outcome[outcome][model label] -> array of predictions.
results_by_outcome={}
# Outcome names are taken from `hdrs_followup`, the same frame the targets are read from.
for outcome in hdrs_followup.columns:
    y_current=hdrs_followup[outcome]
    predicted_dict={}
    # Pair each pipeline with its own grid instead of indexing two parallel lists.
    for i, (model, param_grid) in enumerate(zip(pipelines, parameter_grid_list)):
        model_name=pipe_dict[i]
        print('Processing {} model for outcome {}...'.format(model_name, outcome))
        clf=GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv)
        predicted=cross_val_predict(clf, X=X, y=y_current, cv=outer_cv)
        predicted_dict[model_name]=predicted
        # Scores are computed once over the pooled out-of-fold predictions.
        r2=r2_score(y_true=y_current, y_pred=predicted)
        mse=mean_squared_error(y_true=y_current, y_pred=predicted)
        # NOTE(review): '{:2f}' is a minimum width of 2 with the default six decimals;
        # use '{:.2f}' if two decimals were intended.
        print('{} R2: {:2f}; MSE: {:2f} \n'.format(model_name, r2, mse))
    results_by_outcome[outcome]=predicted_dict

Processing RF model for outcome hdrs_17_change...
RF R2: -0.328800; MSE: 56.505426 

Processing GB model for outcome hdrs_17_change...
GB R2: -0.556631; MSE: 66.193630 

Processing SVM model for outcome hdrs_17_change...
SVM R2: -0.107103; MSE: 47.078066 

Processing RF model for outcome hdrs_6_change...
RF R2: -0.291869; MSE: 17.341145 

Processing GB model for outcome hdrs_6_change...
GB R2: -0.373380; MSE: 18.435287 

Processing SVM model for outcome hdrs_6_change...
SVM R2: -0.187898; MSE: 15.945507 

Processing RF model for outcome Factor_1...
RF R2: -0.223546; MSE: 16.519986 

Processing GB model for outcome Factor_1...
GB R2: -0.354867; MSE: 18.293052 

Processing SVM model for outcome Factor_1...
SVM R2: -0.197327; MSE: 16.165986 

Processing RF model for outcome Factor_2...
RF R2: 0.012350; MSE: 8.935555 

Processing GB model for outcome Factor_2...
GB R2: -0.070339; MSE: 9.683668 

Processing SVM model for outcome Factor_2...
SVM R2: 0.072339; MSE: 8.392814 



In [11]:
# Sanity check: (rows, columns) of the feature matrix X.
X.shape

(51, 411)

In [12]:
# Print each outcome column's position and name; preview the column at position 3.
for position, column in enumerate(hdrs.columns):
    print(position, column)
    if position != 3:
        continue
    print(hdrs[column].head())

0 hdrs_17_change
1 hdrs_6_change
2 Factor_1
3 Factor_2
0    -8
1     3
2   -10
3    -4
4     0
Name: Factor_2, dtype: int64
