In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import auc, roc_curve
from sklearn.svm import SVC

In [None]:
# Load the predicted pericardial fat areas and keep only participants whose
# segmentation passed quality control (predicted Dice similarity coefficient).
pericardialDataFile = os.path.join('data','UKB_pericardial_fat_predictions.csv')

# Minimum predicted DSC for a segmentation to be kept.
DSC_QC_THRESHOLD = 0.6

if not os.path.isfile(pericardialDataFile):
    # Fail here with a clear message: previously this branch only printed a hint and
    # the cell then died on an undefined `pericardialArea` (or, worse, silently
    # re-used a stale frame left in the kernel by an earlier run).
    raise FileNotFoundError(
        pericardialDataFile + ' not found - you should probably run predict_UKB_fat_areas.ipynb first'
    )

# Rows with any missing value are dropped before re-indexing by participant id.
pericardialArea = pd.read_csv(pericardialDataFile,index_col=0).dropna().set_index('f.eid')

#filter for those with good segmentation performance
BadQC = pericardialArea['predicted DSC'] < DSC_QC_THRESHOLD
GoodQC = pericardialArea['predicted DSC'] >= DSC_QC_THRESHOLD

print('Areas calculated for ' + str(pericardialArea.shape[0]) +' participants. ' + str(BadQC.sum()) + ' removed by QC, leaving ' + str(GoodQC.sum()))

pericardialArea = pericardialArea.loc[GoodQC,:]

In [None]:
# Helpers for pulling additional per-participant fields from disk.
# The folder is expected to be (a symlink to) the decoded per-field tables.
DATAFOLDER = os.path.join('data','by_udi_decoded') #assume a correct symlink has been inserted here

def get_variable(udi):
    '''Load one decoded field table.

    udi: numeric field id; the file read is <DATAFOLDER>/f.<udi>_decoded.tab.
    Returns a pandas DataFrame parsed from the tab-separated file and indexed
    by its 'f.eid' column.
    '''
    table_path = os.path.join(DATAFOLDER, f'f.{udi}_decoded.tab')
    table = pd.read_csv(table_path, sep='\t', index_col='f.eid', low_memory=False)
    return table

In [None]:
# education = get_variable(6138)

# def get_max_education(row):
    
#     if 'College or University degree' in row.values:
#         return 'University'
#     elif 'NVQ or HND or HNC or equivalent' in row.values:
#         return 'NVQ'
#     elif 'A levels/AS levels or equivalent' in row.values:
#         return 'A-levels'
#     elif 'O levels/GCSEs or equivalent' in row.values or 'CSEs or equivalent' in row.values:
#         return 'GCSE'
#     elif 'None of the above' in row.values:
#         return 'None'
    
# maxEducation = education.apply(get_max_education,axis=1).rename('Max Education')

So, let's fit some logistic regression (LR) models for diabetes and look at whether PAT area gives extra predictive value.

In [None]:
# Build the outcome (diabetes) and covariates (age, BMI, sex) for the QC-passed
# participants and append them to pericardialArea as extra columns.
participantIds = pericardialArea.index

# Age: a single column of field 21003 ('f.21003.2.0') is used.
ageAtImaging = get_variable(21003)['f.21003.2.0'].loc[participantIds].rename('Age (years)')

# Field 2976: non-numeric entries are coerced to NaN, then the smallest value per row is kept.
diagnosisAges = get_variable(2976).apply(lambda column: pd.to_numeric(column, errors='coerce'))
ageDiabetesDiagnosed = diagnosisAges.min(axis=1).loc[participantIds]

# Field 30750: largest value across all of its columns.
hba1c = get_variable(30750).max(axis=1,skipna=True).loc[participantIds]

# Field 2443: True if any column holds the answer 'Yes'.
anyDiabetesDiagnosed = (get_variable(2443) == 'Yes').max(axis = 1).loc[participantIds].rename('diabetes')

#all criteria for ACTUAL DIABETES
# A participant counts as diabetic when ANY criterion holds. Comparisons against
# NaN evaluate to False, so missing values never trigger a criterion.
# NOTE(review): the 48 cut-off is presumably HbA1c in mmol/mol - confirm units.
diabetesCriteria = (
    hba1c > 48,
    anyDiabetesDiagnosed,
    ageDiabetesDiagnosed <= ageAtImaging,
)
diabetes = pd.Series(np.logical_or.reduce(diabetesCriteria), index=participantIds, name='diabetes')

# Field 21001: largest value across columns. Field 31: first column compared with 'Male'.
bmi = get_variable(21001).max(axis=1).loc[participantIds].rename('BMI')
sex = (get_variable(31).loc[participantIds].iloc[:,0] == 'Male').rename('Male')

# Participants with any missing value in the combined table are dropped.
pericardialArea = pd.concat((pericardialArea, ageAtImaging, bmi, sex, diabetes), axis=1).dropna()

In [None]:
# Histogram of the 'meanArea (cm2)' column for the participants remaining after
# QC filtering and the covariate merge (rows with missing values already dropped).
plt.hist(pericardialArea['meanArea (cm2)'])

In [None]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression


def _coefficient_legend(extraColumns):
    '''Map the summary labels x2, x3, ... onto covariate names (x1 is always the PAT area).

    The models are fitted on plain numpy arrays, so statsmodels labels the
    coefficients const, x1, x2, ... in column order.
    '''
    return ', '.join(f'x{n} = {name}' for n, name in enumerate(extraColumns, start=2))


def _odds_ratio_table(fitResult, rowNames, formatP=False):
    '''Tabulate odds ratios, their 95% CI and p-values for a fitted Logit model.

    fitResult: result of sm.Logit(...).fit().
    rowNames:  one label per fitted coefficient, in coefficient order (constant first).
    formatP:   if True the p-value is stored as a 4-significant-figure string,
               otherwise as the raw float.
    Returns a DataFrame indexed by rowNames with columns
    'Odds ratio', 'Odds ratio 95% CI' and 'p'.
    Raises ValueError if the number of labels does not match the number of coefficients.
    '''
    params = np.asarray(fitResult.params)
    confInt = np.asarray(fitResult.conf_int())
    pvalues = np.asarray(fitResult.pvalues)

    if len(rowNames) != len(params):
        raise ValueError(f'{len(rowNames)} row names given for {len(params)} coefficients')

    records = []
    for ind in range(len(rowNames)):
        records.append({
            # coefficients are log-odds, so exponentiate to get odds ratios
            'Odds ratio': f'{np.exp(params[ind]):.04g}',
            'Odds ratio 95% CI': ', '.join([f'{x:.04g}' for x in np.exp(confInt[ind,:])]),
            'p': f'{pvalues[ind]:.04g}' if formatP else pvalues[ind],
        })

    return pd.DataFrame(records, index=list(rowNames), columns=['Odds ratio', 'Odds ratio 95% CI', 'p'])


yVar = 'diabetes'

#divide the area by 10 so the odds ratios look bigger
# The column is rescaled IN PLACE (later cells read the scaled values), so every
# odds ratio reported for PAT area is per 10 cm2 even though the column/row labels
# still say cm2. The flag stored on the frame makes the cell safe to re-run:
# without it each re-execution would divide by 10 again and silently change the results.
if not pericardialArea.attrs.get('meanAreaRescaled', False):
    pericardialArea.loc[:,'meanArea (cm2)'] = pericardialArea['meanArea (cm2)']/10
    pericardialArea.attrs['meanAreaRescaled'] = True


y = pericardialArea[yVar].astype(bool)

# Three nested predictor sets: PAT area alone, + sex and age, + BMI.
x1 = pericardialArea['meanArea (cm2)']
x2 = pericardialArea.loc[:,['meanArea (cm2)','Male','Age (years)']]
x3 = pericardialArea.loc[:,['meanArea (cm2)','Male','Age (years)','BMI']]

# statsmodels does not add an intercept by itself; add_constant prepends it as column 0.
x_univariate = add_constant(x1.values.astype(float))
x_multivariate = add_constant(x2.values.astype(float))
x_multivariateB = add_constant(x3.values.astype(float))


mu = sm.Logit(y.values,x_univariate).fit()
mm = sm.Logit(y.values,x_multivariate).fit()
mmB = sm.Logit(y.values,x_multivariateB).fit()


# The first predictor column is the PAT area (x1); the remaining columns follow as x2, x3, ...
# (the legend previously started the full column list at x2, mislabelling every coefficient).
print('univariate model: x1 = PAT area')
print(mu.summary())
print('multivariate model: x1 = PAT area, ' + _coefficient_legend(x2.columns[1:]))
print(mm.summary())
print('multivariate model with BMI: x1 = PAT area, ' + _coefficient_legend(x3.columns[1:]))
print(mmB.summary())


# Odds-ratio tables. Only the univariate table stores p as a formatted string,
# matching the files this cell has always written.
multiResults = _odds_ratio_table(mm, ['offset','PAT Area (cm2)','Male?','Age (years)'])
multiBMIResults = _odds_ratio_table(mmB, ['offset','PAT Area (cm2)','Male?','Age (years)','BMI'])
uniResults = _odds_ratio_table(mu, ['offset','PAT Area (cm2)'], formatP=True)


# Make sure the output folder exists before writing.
os.makedirs('graphs', exist_ok=True)

uniResults.to_csv(os.path.join('graphs','UKB_diabetes_univariate_LR.csv'))
multiResults.to_csv(os.path.join('graphs','UKB_diabetes_multivariate_LR.csv'))
multiBMIResults.to_csv(os.path.join('graphs','UKB_diabetes_multivariate_withBMI_LR.csv'))
    

In [None]:
# Histogram of 'meanArea (cm2)' again. The regression cell above divides this
# column by 10 in place, so when run in order the x-axis here is in units of
# 10 cm2 despite the column name.
plt.hist(pericardialArea['meanArea (cm2)'])

In [None]:
# Display the odds-ratio table of the logistic model that includes BMI
# (PAT area, sex, age, BMI).
multiBMIResults

In [None]:
# Display the odds-ratio table of the logistic model without BMI
# (PAT area, sex, age).
multiResults

In [None]:
# Cross-validated ROC comparison for predicting diabetes with scikit-learn
# logistic regression:
#   model 1 (x1): PAT area only
#   model 2 (x2): PAT area + sex + age
# Uses x1, x2 and y from the regression cell above; roc_curve and auc come from
# sklearn.metrics (imported in the notebook's first cell).

def _plot_mean_roc(foldTprs, foldAucs, fprGrid, label, colour):
    '''Draw the fold-averaged ROC curve and a +/- 1 s.d. band on the current axes.

    foldTprs: per-fold TPR curves, all interpolated onto fprGrid.
    foldAucs: per-fold AUC values (only their standard deviation is reported).
    label:    model name used as the legend prefix.
    colour:   matplotlib colour for both the line and the band.
    '''
    meanTpr = np.mean(foldTprs, axis=0)
    meanTpr[-1] = 1.0  # force the averaged curve to end at (1, 1)
    # The reported AUC is the area under the averaged curve; the spread is across folds.
    meanAuc = auc(fprGrid, meanTpr)
    stdAuc = np.std(foldAucs)
    plt.plot(fprGrid, meanTpr,
            label=r'%s, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (label, meanAuc, stdAuc),
            lw=2, alpha=.8, color=colour)

    stdTpr = np.std(foldTprs, axis=0)
    # clip the band to the valid [0, 1] range
    plt.fill_between(fprGrid,
                     np.maximum(meanTpr - stdTpr, 0),
                     np.minimum(meanTpr + stdTpr, 1),
                     color=colour, alpha=.2)


plt.figure(figsize = (6,6))

nFolds = 5
randomSeed = 42
#instantiate k-folds classification
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

auc1 = []
auc2 = []
interp_tpr1s = []
interp_tpr2s = []

# Common FPR grid so the per-fold curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

for train,test in splitter.split(x1,y):

    #instantiate classifiers
    lr1 = LogisticRegression()
    lr2 = LogisticRegression()

    #fit classifiers with the two different datasets
    # x1 is a single column, so it is reshaped to the 2-D layout sklearn expects
    lr1.fit(X=x1.iloc[train].values.reshape(-1,1),y=y.iloc[train])
    lr2.fit(X=x2.iloc[train],y=y.iloc[train])

    #scores
    sc1 = lr1.decision_function(x1.iloc[test].values.reshape(-1,1))
    sc2 = lr2.decision_function(x2.iloc[test])

    #ROC values
    fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
    fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

    #ROC AUC values
    auc1.append(auc(fpr1,tpr1))
    auc2.append(auc(fpr2,tpr2))

    #interpolate so that we can show error bars..
    interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
    interp_tpr1[0] = 0.0
    interp_tpr1s.append(interp_tpr1)

    interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
    interp_tpr2[0] = 0.0
    interp_tpr2s.append(interp_tpr2)

#overlaid ROC mean and stdev

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
        label='Chance', alpha=.8)

_plot_mean_roc(interp_tpr1s, auc1, mean_fpr, 'Univariate model', 'C0')
_plot_mean_roc(interp_tpr2s, auc2, mean_fpr, 'Multivariate model', 'C1')

plt.legend(loc='lower right')
plt.xlim([0,1])
plt.ylim([0,1])

plt.xlabel('False positive rate (1 - specificity)')
plt.ylabel('True positive rate (sensitivity)')

# Make sure the output folder exists before saving.
os.makedirs('graphs', exist_ok=True)

plt.savefig(os.path.join('graphs','UKB_diabetes_LR_ROC_curves.png'))
plt.savefig(os.path.join('graphs','UKB_diabetes_LR_ROC_curves.svg'))

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# names = ["Logistic Regression",
#          "Nearest Neighbors", 
#          "Linear SVM", 
#          "RBF SVM", 
# #          "Gaussian Process",
#          "Decision Tree", 
#          "Random Forest", 
#          "Neural Net", 
#          "AdaBoost",
#          "Naive Bayes", 
#          "QDA"]

# nModels = len(names)

# x1 = pericardialArea.loc[:,('Male','Age (years)','BMI')]
# x2 = pericardialArea.loc[:,('Male','Age (years)','BMI','meanArea (cm2)')]

# randomSeed = 42
# nFolds = 5

In [None]:
# #blank classifiers..
# classifiers = [LogisticRegression(class_weight='balanced',n_jobs=4),
#                KNeighborsClassifier(3),
#                SVC(kernel="linear", C=0.025),
#                SVC(gamma=2, C=1),
# #                GaussianProcessClassifier(1.0 * RBF(1.0),n_jobs=4),
#                DecisionTreeClassifier(max_depth=5),
#                RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,n_jobs=4),
#                MLPClassifier(alpha=1, max_iter=1000),
#                AdaBoostClassifier(),
#                GaussianNB(),
#                QuadraticDiscriminantAnalysis()]

# import copy

In [None]:
# yVar = 'athHD'

# y = pericardialArea[yVar].astype(bool)

# plt.figure(figsize = (15,nModels*5))
# print(yVar)
# for ind,name in enumerate(names):
#     print(name)

#     #instantiate k-folds classification
#     splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

#     auc1 = []
#     auc2 = []
#     interp_tpr1s = []
#     interp_tpr2s = []

#     mean_fpr = np.linspace(0, 1, 100)

#     for i,(train,test) in enumerate(splitter.split(x1,y)):    

#         #instantiate classifiers
#         lr1 = copy.copy(classifiers[ind])
#         lr2 = copy.copy(classifiers[ind])

#         #fit classifiers with the two different datasets
#         lr1.fit(X=x1.iloc[train],y=y.iloc[train])
#         lr2.fit(X=x2.iloc[train],y=y.iloc[train])

#         #scores        
#         if hasattr(lr1, "decision_function"):
#             sc1 = lr1.decision_function(x1.iloc[test])
#             sc2 = lr2.decision_function(x2.iloc[test])
#         else:
#             sc1 = lr1.predict_proba(x1.iloc[test])[:,1]
#             sc2 = lr2.predict_proba(x2.iloc[test])[:,1]

#         #ROC values
#         fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
#         fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

#         #ROC AUC values
#         auc1.append(auc(fpr1,tpr1))
#         auc2.append(auc(fpr2,tpr2))
    
#         #interpolate so that we can show error bars..
#         interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
#         interp_tpr1[0] = 0.0
#         interp_tpr1s.append(interp_tpr1)

#         interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
#         interp_tpr2[0] = 0.0
#         interp_tpr2s.append(interp_tpr2)

#     # subplot for overlaid ROC mean and stdev
#     plt.subplot(nModels,3,ind*3 + 1)

#     plt.title(name + ' ' + yVar)
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
#             label='Chance', alpha=.8)

#     mean_tpr = np.mean(interp_tpr1s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc1)
#     plt.plot(mean_fpr, mean_tpr, #color='b',
#             label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C0')

#     std_tpr = np.std(interp_tpr1s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2,
#                     )
#     mean_tpr = np.mean(interp_tpr2s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc2)
#     plt.plot(mean_fpr, mean_tpr, #color='',
#             label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C1')

#     std_tpr = np.std(interp_tpr2s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2,
#                     )
#     plt.legend()
#     plt.xlim([0,1])
#     plt.ylim([0,1])
    
#     #histogram of mean AUCs
#     plt.subplot(nModels,3,ind*3 + 2)
#     plt.hist(auc1,alpha=0.5,label = 'no PAT')
#     plt.hist(auc2,alpha=0.5,label = 'with PAT feature')

    
#     #scatter plot for AUC with line of unity
#     plt.subplot(nModels,3,ind*3 + 3)
#     plt.scatter(auc1,auc2)
#     # plt.axis('equal')
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
#     minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
#     maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
#     plt.xlim([minAx,maxAx])
#     plt.ylim([minAx,maxAx])
    
# plt.savefig(os.path.join('.','graphs','cheat ' +yVar + '.png'))

In [None]:
# yVar = 'atherosclerosis'

# y = pericardialArea[yVar].astype(bool)

# plt.figure(figsize = (15,nModels*5))
# print(yVar)
# for ind,name in enumerate(names):
#     print(name)

#     #instantiate k-folds classification
#     splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

#     auc1 = []
#     auc2 = []
#     interp_tpr1s = []
#     interp_tpr2s = []

#     mean_fpr = np.linspace(0, 1, 100)

#     for i,(train,test) in enumerate(splitter.split(x1,y)):    

#         #instantiate classifiers
#         lr1 = copy.copy(classifiers[ind])
#         lr2 = copy.copy(classifiers[ind])

#         #fit classifiers with the two different datasets
#         lr1.fit(X=x1.iloc[train],y=y.iloc[train])
#         lr2.fit(X=x2.iloc[train],y=y.iloc[train])

#         #scores        
#         if hasattr(lr1, "decision_function"):
#             sc1 = lr1.decision_function(x1.iloc[test])
#             sc2 = lr2.decision_function(x2.iloc[test])
#         else:
#             sc1 = lr1.predict_proba(x1.iloc[test])[:,1]
#             sc2 = lr2.predict_proba(x2.iloc[test])[:,1]

#         #ROC values
#         fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
#         fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

#         #ROC AUC values
#         auc1.append(auc(fpr1,tpr1))
#         auc2.append(auc(fpr2,tpr2))
    
#         #interpolate so that we can show error bars..
#         interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
#         interp_tpr1[0] = 0.0
#         interp_tpr1s.append(interp_tpr1)

#         interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
#         interp_tpr2[0] = 0.0
#         interp_tpr2s.append(interp_tpr2)

#     # subplot for overlaid ROC mean and stdev
#     plt.subplot(nModels,3,ind*3 + 1)

#     plt.title(name + ' ' + yVar)
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
#             label='Chance', alpha=.8)

#     mean_tpr = np.mean(interp_tpr1s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc1)
#     plt.plot(mean_fpr, mean_tpr, #color='b',
#             label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C0')

#     std_tpr = np.std(interp_tpr1s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2,
#                     )
#     mean_tpr = np.mean(interp_tpr2s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc2)
#     plt.plot(mean_fpr, mean_tpr, #color='',
#             label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C1')

#     std_tpr = np.std(interp_tpr2s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2,
#                     )
#     plt.legend()
#     plt.xlim([0,1])
#     plt.ylim([0,1])
    
#     #histogram of mean AUCs
#     plt.subplot(nModels,3,ind*3 + 2)
#     plt.hist(auc1,alpha=0.5,label = 'no PAT')
#     plt.hist(auc2,alpha=0.5,label = 'with PAT feature')

    
#     #scatter plot for AUC with line of unity
#     plt.subplot(nModels,3,ind*3 + 3)
#     plt.scatter(auc1,auc2)
#     # plt.axis('equal')
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
#     minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
#     maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
#     plt.xlim([minAx,maxAx])
#     plt.ylim([minAx,maxAx])
    
# plt.savefig(os.path.join('.','graphs','cheat ' +yVar + '.png'))

In [None]:
# yVar = 'diabetes'

# y = pericardialArea[yVar].astype(bool)

# plt.figure(figsize = (15,nModels*5))
# print(yVar)
# for ind,name in enumerate(names):
#     print(name)

#     #instantiate k-folds classification
#     splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

#     auc1 = []
#     auc2 = []
#     interp_tpr1s = []
#     interp_tpr2s = []

#     mean_fpr = np.linspace(0, 1, 100)

#     for i,(train,test) in enumerate(splitter.split(x1,y)):    

#         #instantiate classifiers
#         lr1 = copy.copy(classifiers[ind])
#         lr2 = copy.copy(classifiers[ind])

#         #fit classifiers with the two different datasets
#         lr1.fit(X=x1.iloc[train],y=y.iloc[train])
#         lr2.fit(X=x2.iloc[train],y=y.iloc[train])

#         #scores        
#         if hasattr(lr1, "decision_function"):
#             sc1 = lr1.decision_function(x1.iloc[test])
#             sc2 = lr2.decision_function(x2.iloc[test])
#         else:
#             sc1 = lr1.predict_proba(x1.iloc[test])[:,1]
#             sc2 = lr2.predict_proba(x2.iloc[test])[:,1]

#         #ROC values
#         fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
#         fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

#         #ROC AUC values
#         auc1.append(auc(fpr1,tpr1))
#         auc2.append(auc(fpr2,tpr2))
    
#         #interpolate so that we can show error bars..
#         interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
#         interp_tpr1[0] = 0.0
#         interp_tpr1s.append(interp_tpr1)

#         interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
#         interp_tpr2[0] = 0.0
#         interp_tpr2s.append(interp_tpr2)

#     # subplot for overlaid ROC mean and stdev
#     plt.subplot(nModels,3,ind*3 + 1)

#     plt.title(name + ' ' + yVar)
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
#             label='Chance', alpha=.8)

#     mean_tpr = np.mean(interp_tpr1s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc1)
#     plt.plot(mean_fpr, mean_tpr, #color='b',
#             label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C0')

#     std_tpr = np.std(interp_tpr1s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2,
#                     )
#     mean_tpr = np.mean(interp_tpr2s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc2)
#     plt.plot(mean_fpr, mean_tpr, #color='',
#             label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C1')

#     std_tpr = np.std(interp_tpr2s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2,
#                     )
#     plt.legend()
#     plt.xlim([0,1])
#     plt.ylim([0,1])
    
#     #histogram of mean AUCs
#     plt.subplot(nModels,3,ind*3 + 2)
#     plt.hist(auc1,alpha=0.5,label = 'no PAT')
#     plt.hist(auc2,alpha=0.5,label = 'with PAT feature')

    
#     #scatter plot for AUC with line of unity
#     plt.subplot(nModels,3,ind*3 + 3)
#     plt.scatter(auc1,auc2)
#     # plt.axis('equal')
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
#     minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
#     maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
#     plt.xlim([minAx,maxAx])
#     plt.ylim([minAx,maxAx])
    
# plt.savefig(os.path.join('.','graphs','cheat ' +yVar + '.png'))

In [None]:
# yVar = 'myocardial infarction'


# y = pericardialArea[yVar].astype(bool)

# plt.figure(figsize = (15,nModels*5))
# print(yVar)
# for ind,name in enumerate(names):
#     print(name)

#     #instantiate k-folds classification
#     splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

#     auc1 = []
#     auc2 = []
#     interp_tpr1s = []
#     interp_tpr2s = []

#     mean_fpr = np.linspace(0, 1, 100)

#     for i,(train,test) in enumerate(splitter.split(x1,y)):    

#         #instantiate classifiers
#         lr1 = copy.copy(classifiers[ind])
#         lr2 = copy.copy(classifiers[ind])

#         #fit classifiers with the two different datasets
#         lr1.fit(X=x1.iloc[train],y=y.iloc[train])
#         lr2.fit(X=x2.iloc[train],y=y.iloc[train])

#         #scores        
#         if hasattr(lr1, "decision_function"):
#             sc1 = lr1.decision_function(x1.iloc[test])
#             sc2 = lr2.decision_function(x2.iloc[test])
#         else:
#             sc1 = lr1.predict_proba(x1.iloc[test])[:,1]
#             sc2 = lr2.predict_proba(x2.iloc[test])[:,1]

#         #ROC values
#         fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
#         fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

#         #ROC AUC values
#         auc1.append(auc(fpr1,tpr1))
#         auc2.append(auc(fpr2,tpr2))
    
#         #interpolate so that we can show error bars..
#         interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
#         interp_tpr1[0] = 0.0
#         interp_tpr1s.append(interp_tpr1)

#         interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
#         interp_tpr2[0] = 0.0
#         interp_tpr2s.append(interp_tpr2)

#     # subplot for overlaid ROC mean and stdev
#     plt.subplot(nModels,3,ind*3 + 1)

#     plt.title(name + ' ' + yVar)
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
#             label='Chance', alpha=.8)

#     mean_tpr = np.mean(interp_tpr1s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc1)
#     plt.plot(mean_fpr, mean_tpr, #color='b',
#             label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C0')

#     std_tpr = np.std(interp_tpr1s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2,
#                     )
#     mean_tpr = np.mean(interp_tpr2s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc2)
#     plt.plot(mean_fpr, mean_tpr, #color='',
#             label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C1')

#     std_tpr = np.std(interp_tpr2s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2,
#                     )
#     plt.legend()
#     plt.xlim([0,1])
#     plt.ylim([0,1])
    
#     #histogram of mean AUCs
#     plt.subplot(nModels,3,ind*3 + 2)
#     plt.hist(auc1,alpha=0.5,label = 'no PAT')
#     plt.hist(auc2,alpha=0.5,label = 'with PAT feature')

    
#     #scatter plot for AUC with line of unity
#     plt.subplot(nModels,3,ind*3 + 3)
#     plt.scatter(auc1,auc2)
#     # plt.axis('equal')
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
#     minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
#     maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
#     plt.xlim([minAx,maxAx])
#     plt.ylim([minAx,maxAx])
    
# plt.savefig(os.path.join('.','graphs','cheat ' +yVar + '.png'))

In [None]:
# yVar = 'atrial fibrillation'


# y = pericardialArea[yVar].astype(bool)

# plt.figure(figsize = (15,nModels*5))
# print(yVar)
# for ind,name in enumerate(names):
#     print(name)

#     #instantiate k-folds classification
#     splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

#     auc1 = []
#     auc2 = []
#     interp_tpr1s = []
#     interp_tpr2s = []

#     mean_fpr = np.linspace(0, 1, 100)

#     for i,(train,test) in enumerate(splitter.split(x1,y)):    

#         #instantiate classifiers
#         lr1 = copy.copy(classifiers[ind])
#         lr2 = copy.copy(classifiers[ind])

#         #fit classifiers with the two different datasets
#         lr1.fit(X=x1.iloc[train],y=y.iloc[train])
#         lr2.fit(X=x2.iloc[train],y=y.iloc[train])

#         #scores        
#         if hasattr(lr1, "decision_function"):
#             sc1 = lr1.decision_function(x1.iloc[test])
#             sc2 = lr2.decision_function(x2.iloc[test])
#         else:
#             sc1 = lr1.predict_proba(x1.iloc[test])[:,1]
#             sc2 = lr2.predict_proba(x2.iloc[test])[:,1]

#         #ROC values
#         fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
#         fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

#         #ROC AUC values
#         auc1.append(auc(fpr1,tpr1))
#         auc2.append(auc(fpr2,tpr2))
    
#         #interpolate so that we can show error bars..
#         interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
#         interp_tpr1[0] = 0.0
#         interp_tpr1s.append(interp_tpr1)

#         interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
#         interp_tpr2[0] = 0.0
#         interp_tpr2s.append(interp_tpr2)

#     # subplot for overlaid ROC mean and stdev
#     plt.subplot(nModels,3,ind*3 + 1)

#     plt.title(name + ' ' + yVar)
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
#             label='Chance', alpha=.8)

#     mean_tpr = np.mean(interp_tpr1s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc1)
#     plt.plot(mean_fpr, mean_tpr, #color='b',
#             label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C0')

#     std_tpr = np.std(interp_tpr1s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2,
#                     )
#     mean_tpr = np.mean(interp_tpr2s, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(auc2)
#     plt.plot(mean_fpr, mean_tpr, #color='',
#             label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#             lw=2, alpha=.8,color='C1')

#     std_tpr = np.std(interp_tpr2s, axis=0)
#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2,
#                     )
#     plt.legend()
#     plt.xlim([0,1])
#     plt.ylim([0,1])
    
#     #histogram of mean AUCs
#     plt.subplot(nModels,3,ind*3 + 2)
#     plt.hist(auc1,alpha=0.5,label = 'no PAT')
#     plt.hist(auc2,alpha=0.5,label = 'with PAT feature')

    
#     #scatter plot for AUC with line of unity
#     plt.subplot(nModels,3,ind*3 + 3)
#     plt.scatter(auc1,auc2)
#     # plt.axis('equal')
#     plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
#     minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
#     maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
#     plt.xlim([minAx,maxAx])
#     plt.ylim([minAx,maxAx])
    
# plt.savefig(os.path.join('.','graphs','cheat ' +yVar + '.png'))

In [None]:
# Linear-SVM comparison, at a single regularisation strength C, of covariates
# without vs. with PAT area for predicting the outcome named in yVar, using
# stratified k-fold cross-validated ROC curves.
# SVC, roc_curve and auc are imported in the notebook's first cell.
# NOTE(review): no cell visible in this notebook adds an 'atherosclerosis' column to
# pericardialArea - confirm it is present in the loaded CSV or created elsewhere.
yVar = 'atherosclerosis'

y = pericardialArea[yVar].astype(bool)
x1 = pericardialArea.loc[:,['Male','Age (years)','BMI']]
x2 = pericardialArea.loc[:,['Male','Age (years)','BMI','meanArea (cm2)']]

randomSeed = 42
nFolds = 10

# Common FPR grid so the per-fold curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

c = 0.025
#instantiate k-folds classification
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

auc1 = []
auc2 = []
interp_tpr1s = []
interp_tpr2s = []

for train,test in splitter.split(x1,y):

    #instantiate classifiers
    lr1 = SVC(kernel="linear", C=c)
    lr2 = SVC(kernel="linear", C=c)

    #fit classifiers with the two different datasets
    lr1.fit(X=x1.iloc[train],y=y.iloc[train])
    lr2.fit(X=x2.iloc[train],y=y.iloc[train])

    #scores
    sc1 = lr1.decision_function(x1.iloc[test])
    sc2 = lr2.decision_function(x2.iloc[test])

    #ROC values
    fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
    fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

    #ROC AUC values
    auc1.append(auc(fpr1,tpr1))
    auc2.append(auc(fpr2,tpr2))

    #interpolate so that we can show error bars..
    interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
    interp_tpr1[0] = 0.0
    interp_tpr1s.append(interp_tpr1)

    interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
    interp_tpr2[0] = 0.0
    interp_tpr2s.append(interp_tpr2)

#SHOW THE PRETTY GRAPHS
# One row of three panels. The panels were previously addressed through a loop
# index (`ind`) whose loop had been commented out, so the subplot calls either
# raised NameError or picked up a stale value from an earlier cell; the figure
# size was likewise left over from the many-row layout.
fig, (axRoc, axHist, axScatter) = plt.subplots(1, 3, figsize=(15,5))

# Panel 1: fold-averaged ROC curves with +/- 1 s.d. bands
axRoc.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
        label='Chance', alpha=.8)
axRoc.set_ylabel('C = ' + str(c))

for foldTprs, foldAucs, modelLabel, colour in (
        (interp_tpr1s, auc1, 'No PAT area', 'C0'),
        (interp_tpr2s, auc2, 'With PAT area', 'C1')):
    mean_tpr = np.mean(foldTprs, axis=0)
    mean_tpr[-1] = 1.0  # force the averaged curve to end at (1, 1)
    # reported AUC is the area under the averaged curve; the spread is across folds
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(foldAucs)
    axRoc.plot(mean_fpr, mean_tpr,
            label=r'%s, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (modelLabel, mean_auc, std_auc),
            lw=2, alpha=.8, color=colour)

    std_tpr = np.std(foldTprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    axRoc.fill_between(mean_fpr, tprs_lower, tprs_upper, color=colour, alpha=.2)

axRoc.legend()
axRoc.set_xlim([0,1])
axRoc.set_ylim([0,1])

# Panel 2: distribution of per-fold AUCs for each feature set
axHist.hist(auc1,alpha=0.5,label = 'no PAT')
axHist.hist(auc2,alpha=0.5,label = 'with PAT feature')
axHist.set_xlabel('AUC')
axHist.legend()  # the labels above were previously never shown

# Panel 3: per-fold AUC without vs. with PAT area, with the line of unity
axScatter.scatter(auc1,auc2)
axScatter.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
axScatter.set_xlim([minAx,maxAx])
axScatter.set_ylim([minAx,maxAx])
axScatter.set_xlabel('AUC, no PAT')
axScatter.set_ylabel('AUC, with PAT feature')

# Make sure the output folder exists before saving.
os.makedirs('graphs', exist_ok=True)

fig.savefig('./graphs/cheat_SVM_c025_' + yVar + '.png')

In [None]:
yVar = 'athHD'

# Outcome labels plus the two feature sets being compared: baseline covariates
# only (x1) versus the same covariates with the PAT area column added (x2).
y = pericardialArea[yVar].astype(bool)
x1 = pericardialArea.loc[:,('Male','Age (years)','BMI')]
x2 = pericardialArea.loc[:,('Male','Age (years)','BMI','meanArea (cm2)')]

randomSeed = 42
nFolds = 5

# Common false-positive-rate grid; each fold's ROC curve is interpolated onto it
# so that the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

# Values of the SVM regularisation parameter C to sweep (log-spaced, 1e-3 to 1).
cRange=np.logspace(-3,0,10)

#SHOW THE PRETTY GRAPHS
# One row of three panels per C value. figsize is (width, height) in inches, so
# the height has to grow with the number of rows; the grid is sized from cRange
# instead of a hard-coded row count so that no rows are left empty.
nRows = len(cRange)
nCols = 3
plt.figure(figsize=(nCols*5, nRows*5))

# A fixed integer seed makes split() return the same folds on every call, so a
# single splitter is shared by all C values.
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

for ind,c in enumerate(cRange):

    # per-fold results for this value of C
    auc1 = []
    auc2 = []
    interp_tpr1s = []
    interp_tpr2s = []

    for i,(train,test) in enumerate(splitter.split(x1,y)):

        #instantiate classifiers (linear SVMs, despite the lr* names)
        lr1 = SVC(kernel="linear", C=c)
        lr2 = SVC(kernel="linear", C=c)

        #fit classifiers with the two different datasets
        lr1.fit(X=x1.iloc[train],y=y.iloc[train])
        lr2.fit(X=x2.iloc[train],y=y.iloc[train])

        #scores (signed distance to the separating hyperplane)
        sc1 = lr1.decision_function(x1.iloc[test])
        sc2 = lr2.decision_function(x2.iloc[test])

        #ROC values
        fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
        fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

        #ROC AUC values
        auc1.append(auc(fpr1,tpr1))
        auc2.append(auc(fpr2,tpr2))

        #interpolate so that we can show error bars..
        interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
        interp_tpr1[0] = 0.0
        interp_tpr1s.append(interp_tpr1)

        interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
        interp_tpr2[0] = 0.0
        interp_tpr2s.append(interp_tpr2)

    # --- panel 1: mean ROC curves with a +/- 1 std band across folds ---
    plt.subplot(nRows,nCols,ind*nCols + 1)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
            label='Chance', alpha=.8)
    plt.ylabel('C = ' + str(c))

    mean_tpr = np.mean(interp_tpr1s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc1)
    plt.plot(mean_fpr, mean_tpr,
            label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C0')

    std_tpr = np.std(interp_tpr1s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2)

    mean_tpr = np.mean(interp_tpr2s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc2)
    plt.plot(mean_fpr, mean_tpr,
            label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C1')

    std_tpr = np.std(interp_tpr2s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2)
    plt.legend()
    plt.xlim([0,1])
    plt.ylim([0,1])

    # --- panel 2: distribution of per-fold AUCs for each feature set ---
    plt.subplot(nRows,nCols,ind*nCols + 2)
    plt.hist(auc1,alpha=0.5,label = 'no PAT')
    plt.hist(auc2,alpha=0.5,label = 'with PAT feature')
    plt.xlabel('AUC')
    plt.legend()  # without this the labels passed to hist() are never shown

    # --- panel 3: per-fold AUC, without PAT (x) against with PAT (y) ---
    plt.subplot(nRows,nCols,ind*nCols + 3)
    plt.scatter(auc1,auc2)
    # dashed identity line: points above it are folds where adding PAT raised the AUC
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
    maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
    plt.xlim([minAx,maxAx])
    plt.ylim([minAx,maxAx])
    plt.xlabel('AUC, no PAT')
    plt.ylabel('AUC, with PAT feature')

plt.savefig('./graphs/cheat_SVM_c_' + yVar + '.png')

In [None]:
yVar = 'atherosclerosis'

# Outcome labels plus the two feature sets being compared: baseline covariates
# only (x1) versus the same covariates with the PAT area column added (x2).
y = pericardialArea[yVar].astype(bool)
x1 = pericardialArea.loc[:,('Male','Age (years)','BMI')]
x2 = pericardialArea.loc[:,('Male','Age (years)','BMI','meanArea (cm2)')]

randomSeed = 42
nFolds = 5

# Common false-positive-rate grid; each fold's ROC curve is interpolated onto it
# so that the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

# Values of the SVM regularisation parameter C to sweep (log-spaced, 1e-3 to 1).
cRange=np.logspace(-3,0,10)

#SHOW THE PRETTY GRAPHS
# One row of three panels per C value. figsize is (width, height) in inches, so
# the height has to grow with the number of rows; the grid is sized from cRange
# instead of a hard-coded row count so that no rows are left empty.
nRows = len(cRange)
nCols = 3
plt.figure(figsize=(nCols*5, nRows*5))

# A fixed integer seed makes split() return the same folds on every call, so a
# single splitter is shared by all C values.
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

for ind,c in enumerate(cRange):

    # per-fold results for this value of C
    auc1 = []
    auc2 = []
    interp_tpr1s = []
    interp_tpr2s = []

    for i,(train,test) in enumerate(splitter.split(x1,y)):

        #instantiate classifiers (linear SVMs, despite the lr* names)
        lr1 = SVC(kernel="linear", C=c)
        lr2 = SVC(kernel="linear", C=c)

        #fit classifiers with the two different datasets
        lr1.fit(X=x1.iloc[train],y=y.iloc[train])
        lr2.fit(X=x2.iloc[train],y=y.iloc[train])

        #scores (signed distance to the separating hyperplane)
        sc1 = lr1.decision_function(x1.iloc[test])
        sc2 = lr2.decision_function(x2.iloc[test])

        #ROC values
        fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
        fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

        #ROC AUC values
        auc1.append(auc(fpr1,tpr1))
        auc2.append(auc(fpr2,tpr2))

        #interpolate so that we can show error bars..
        interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
        interp_tpr1[0] = 0.0
        interp_tpr1s.append(interp_tpr1)

        interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
        interp_tpr2[0] = 0.0
        interp_tpr2s.append(interp_tpr2)

    # --- panel 1: mean ROC curves with a +/- 1 std band across folds ---
    plt.subplot(nRows,nCols,ind*nCols + 1)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
            label='Chance', alpha=.8)
    plt.ylabel('C = ' + str(c))

    mean_tpr = np.mean(interp_tpr1s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc1)
    plt.plot(mean_fpr, mean_tpr,
            label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C0')

    std_tpr = np.std(interp_tpr1s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2)

    mean_tpr = np.mean(interp_tpr2s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc2)
    plt.plot(mean_fpr, mean_tpr,
            label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C1')

    std_tpr = np.std(interp_tpr2s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2)
    plt.legend()
    plt.xlim([0,1])
    plt.ylim([0,1])

    # --- panel 2: distribution of per-fold AUCs for each feature set ---
    plt.subplot(nRows,nCols,ind*nCols + 2)
    plt.hist(auc1,alpha=0.5,label = 'no PAT')
    plt.hist(auc2,alpha=0.5,label = 'with PAT feature')
    plt.xlabel('AUC')
    plt.legend()  # without this the labels passed to hist() are never shown

    # --- panel 3: per-fold AUC, without PAT (x) against with PAT (y) ---
    plt.subplot(nRows,nCols,ind*nCols + 3)
    plt.scatter(auc1,auc2)
    # dashed identity line: points above it are folds where adding PAT raised the AUC
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
    maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
    plt.xlim([minAx,maxAx])
    plt.ylim([minAx,maxAx])
    plt.xlabel('AUC, no PAT')
    plt.ylabel('AUC, with PAT feature')

plt.savefig('./graphs/cheat_SVM_c_' + yVar + '.png')

In [None]:
yVar = 'diabetes'

# Outcome labels plus the two feature sets being compared: baseline covariates
# only (x1) versus the same covariates with the PAT area column added (x2).
y = pericardialArea[yVar].astype(bool)
x1 = pericardialArea.loc[:,('Male','Age (years)','BMI')]
x2 = pericardialArea.loc[:,('Male','Age (years)','BMI','meanArea (cm2)')]

randomSeed = 42
nFolds = 5

# Common false-positive-rate grid; each fold's ROC curve is interpolated onto it
# so that the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

# Values of the SVM regularisation parameter C to sweep (log-spaced, 1e-3 to 1).
cRange=np.logspace(-3,0,10)

#SHOW THE PRETTY GRAPHS
# One row of three panels per C value. figsize is (width, height) in inches, so
# the height has to grow with the number of rows; the grid is sized from cRange
# instead of a hard-coded row count so that no rows are left empty.
nRows = len(cRange)
nCols = 3
plt.figure(figsize=(nCols*5, nRows*5))

# A fixed integer seed makes split() return the same folds on every call, so a
# single splitter is shared by all C values.
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

for ind,c in enumerate(cRange):

    # per-fold results for this value of C
    auc1 = []
    auc2 = []
    interp_tpr1s = []
    interp_tpr2s = []

    for i,(train,test) in enumerate(splitter.split(x1,y)):

        #instantiate classifiers (linear SVMs, despite the lr* names)
        lr1 = SVC(kernel="linear", C=c)
        lr2 = SVC(kernel="linear", C=c)

        #fit classifiers with the two different datasets
        lr1.fit(X=x1.iloc[train],y=y.iloc[train])
        lr2.fit(X=x2.iloc[train],y=y.iloc[train])

        #scores (signed distance to the separating hyperplane)
        sc1 = lr1.decision_function(x1.iloc[test])
        sc2 = lr2.decision_function(x2.iloc[test])

        #ROC values
        fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
        fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

        #ROC AUC values
        auc1.append(auc(fpr1,tpr1))
        auc2.append(auc(fpr2,tpr2))

        #interpolate so that we can show error bars..
        interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
        interp_tpr1[0] = 0.0
        interp_tpr1s.append(interp_tpr1)

        interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
        interp_tpr2[0] = 0.0
        interp_tpr2s.append(interp_tpr2)

    # --- panel 1: mean ROC curves with a +/- 1 std band across folds ---
    plt.subplot(nRows,nCols,ind*nCols + 1)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
            label='Chance', alpha=.8)
    plt.ylabel('C = ' + str(c))

    mean_tpr = np.mean(interp_tpr1s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc1)
    plt.plot(mean_fpr, mean_tpr,
            label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C0')

    std_tpr = np.std(interp_tpr1s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2)

    mean_tpr = np.mean(interp_tpr2s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc2)
    plt.plot(mean_fpr, mean_tpr,
            label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C1')

    std_tpr = np.std(interp_tpr2s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2)
    plt.legend()
    plt.xlim([0,1])
    plt.ylim([0,1])

    # --- panel 2: distribution of per-fold AUCs for each feature set ---
    plt.subplot(nRows,nCols,ind*nCols + 2)
    plt.hist(auc1,alpha=0.5,label = 'no PAT')
    plt.hist(auc2,alpha=0.5,label = 'with PAT feature')
    plt.xlabel('AUC')
    plt.legend()  # without this the labels passed to hist() are never shown

    # --- panel 3: per-fold AUC, without PAT (x) against with PAT (y) ---
    plt.subplot(nRows,nCols,ind*nCols + 3)
    plt.scatter(auc1,auc2)
    # dashed identity line: points above it are folds where adding PAT raised the AUC
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
    maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
    plt.xlim([minAx,maxAx])
    plt.ylim([minAx,maxAx])
    plt.xlabel('AUC, no PAT')
    plt.ylabel('AUC, with PAT feature')

plt.savefig('./graphs/cheat_SVM_c_' + yVar + '.png')

In [None]:
yVar = 'myocardial infarction'

# Outcome labels plus the two feature sets being compared: baseline covariates
# only (x1) versus the same covariates with the PAT area column added (x2).
y = pericardialArea[yVar].astype(bool)
x1 = pericardialArea.loc[:,('Male','Age (years)','BMI')]
x2 = pericardialArea.loc[:,('Male','Age (years)','BMI','meanArea (cm2)')]

randomSeed = 42
nFolds = 5

# Common false-positive-rate grid; each fold's ROC curve is interpolated onto it
# so that the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

# Values of the SVM regularisation parameter C to sweep (log-spaced, 1e-3 to 1).
cRange=np.logspace(-3,0,10)

#SHOW THE PRETTY GRAPHS
# One row of three panels per C value. figsize is (width, height) in inches, so
# the height has to grow with the number of rows; the grid is sized from cRange
# instead of a hard-coded row count so that no rows are left empty.
nRows = len(cRange)
nCols = 3
plt.figure(figsize=(nCols*5, nRows*5))

# A fixed integer seed makes split() return the same folds on every call, so a
# single splitter is shared by all C values.
splitter = StratifiedKFold(n_splits=nFolds,random_state=randomSeed,shuffle=True)

for ind,c in enumerate(cRange):

    # per-fold results for this value of C
    auc1 = []
    auc2 = []
    interp_tpr1s = []
    interp_tpr2s = []

    for i,(train,test) in enumerate(splitter.split(x1,y)):

        #instantiate classifiers (linear SVMs, despite the lr* names)
        lr1 = SVC(kernel="linear", C=c)
        lr2 = SVC(kernel="linear", C=c)

        #fit classifiers with the two different datasets
        lr1.fit(X=x1.iloc[train],y=y.iloc[train])
        lr2.fit(X=x2.iloc[train],y=y.iloc[train])

        #scores (signed distance to the separating hyperplane)
        sc1 = lr1.decision_function(x1.iloc[test])
        sc2 = lr2.decision_function(x2.iloc[test])

        #ROC values
        fpr1,tpr1,thresh = roc_curve(y_true=y.iloc[test], y_score=sc1)
        fpr2,tpr2,thresh = roc_curve(y_true=y.iloc[test], y_score=sc2)

        #ROC AUC values
        auc1.append(auc(fpr1,tpr1))
        auc2.append(auc(fpr2,tpr2))

        #interpolate so that we can show error bars..
        interp_tpr1 = np.interp(mean_fpr, fpr1, tpr1)
        interp_tpr1[0] = 0.0
        interp_tpr1s.append(interp_tpr1)

        interp_tpr2 = np.interp(mean_fpr, fpr2, tpr2)
        interp_tpr2[0] = 0.0
        interp_tpr2s.append(interp_tpr2)

    # --- panel 1: mean ROC curves with a +/- 1 std band across folds ---
    plt.subplot(nRows,nCols,ind*nCols + 1)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
            label='Chance', alpha=.8)
    plt.ylabel('C = ' + str(c))

    mean_tpr = np.mean(interp_tpr1s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc1)
    plt.plot(mean_fpr, mean_tpr,
            label=r'No PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C0')

    std_tpr = np.std(interp_tpr1s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=.2)

    mean_tpr = np.mean(interp_tpr2s, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc2)
    plt.plot(mean_fpr, mean_tpr,
            label=r'With PAT area, mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8,color='C1')

    std_tpr = np.std(interp_tpr2s, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C1', alpha=.2)
    plt.legend()
    plt.xlim([0,1])
    plt.ylim([0,1])

    # --- panel 2: distribution of per-fold AUCs for each feature set ---
    plt.subplot(nRows,nCols,ind*nCols + 2)
    plt.hist(auc1,alpha=0.5,label = 'no PAT')
    plt.hist(auc2,alpha=0.5,label = 'with PAT feature')
    plt.xlabel('AUC')
    plt.legend()  # without this the labels passed to hist() are never shown

    # --- panel 3: per-fold AUC, without PAT (x) against with PAT (y) ---
    plt.subplot(nRows,nCols,ind*nCols + 3)
    plt.scatter(auc1,auc2)
    # dashed identity line: points above it are folds where adding PAT raised the AUC
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    minAx = np.min(np.concatenate((auc1,auc2))) - 0.05
    maxAx = np.max(np.concatenate((auc1,auc2))) + 0.05
    plt.xlim([minAx,maxAx])
    plt.ylim([minAx,maxAx])
    plt.xlabel('AUC, no PAT')
    plt.ylabel('AUC, with PAT feature')

plt.savefig('./graphs/cheat_SVM_c_' + yVar + '.png')