In [None]:
import os
import sys

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
from scipy import stats
from sklearn import metrics

sys.path.insert(1,'../scripts')
import plots

# Input and output roots. The defaults are the original cluster locations; set the
# PPMI_CLASSIFY_DIR / PPMI_IMG_DIR environment variables to run the notebook on
# another machine without editing it.
path = os.environ.get('PPMI_CLASSIFY_DIR', '/rds/general/user/aschalka/home/data/ppmi/analyses/classifyPDHC')
img_path = os.environ.get('PPMI_IMG_DIR', '/rds/general/user/aschalka/home/images/paper/prodromalPPMI')

In [None]:
# performance scores
# Collect the per-fold test AUPRC of every classifier variant (and of the baseline)
# into frames indexed by (classifier, cv).
classifiers = ['digital_tsfresh',
               'rbf_svm','poly_svm','rf',
               'digital_tsfresh_physical activity','digital_tsfresh_vital signs','digital_tsfresh_sleep',
               'digital_tsfresh_lastweek']
# Number of CV folds; each test_scores.csv is assumed to hold one score per fold.
n_folds = 5

# Declared as float up front so later aggregations work on a numeric column
# instead of an object column.
scores = pd.DataFrame(columns=['AUPRC'], dtype=float,
                      index=pd.MultiIndex.from_product([classifiers, np.arange(n_folds)],
                                                       names=['classifier','cv']))
for cl in classifiers:
    score = pd.read_csv(f'{path}/{cl}/test_scores.csv',index_col=0)
    scores.loc[(cl,slice(None)),'AUPRC'] = score.values

bl_classifiers = ['digital_tsfresh']
baselines = pd.DataFrame(columns=['AUPRC'], dtype=float,
                         index=pd.MultiIndex.from_product([['baseline'], np.arange(n_folds)],
                                                          names=['classifier','cv']))
for cl in bl_classifiers:
    # Every entry writes to the same ('baseline', fold) rows, so if more than one
    # classifier is listed here the last one read wins.
    baseline = pd.read_csv(f'{path}/{cl}/baseline/test_scores.csv',index_col=0)
    baselines.loc[('baseline',slice(None)),'AUPRC'] = baseline.values

In [None]:
# Append the baseline rows to the classifier scores. Any 'baseline' rows already
# present are dropped first, so re-running this cell does not duplicate them.
scores = pd.concat([scores[scores.index.get_level_values('classifier') != 'baseline'], baselines])

# AUPRC

In [None]:
# get tpr,fpr
# For every model (plus the baseline) and every CV fold, read the saved predictions
# and store the precision-recall points, the ROC points and the AUROC.
models = np.hstack([classifiers,'baseline'])
fold_index = pd.MultiIndex.from_product([models,np.arange(5)],names=['classifier','cv'])
# `curves` stays object-typed: each cell receives a whole array.
curves = pd.DataFrame(columns=['precision','recall','tpr','fpr'],index=fold_index)
auroc = pd.DataFrame(columns=['AUROC'],index=fold_index,dtype=float)
for cl in models:
    # NOTE(review): `preds` is reset for every classifier, so after the loop it only
    # holds the predictions of the last entry ('baseline'); nothing in the visible
    # notebook reads it afterwards.
    preds = []
    for cv in range(5):
        if cl == 'baseline':
            # NOTE(review): the baseline scores are read from '<classifier>/baseline/test_scores.csv',
            # whereas this file name has no '/' after 'baseline' - confirm the name is right.
            predictions = pd.read_csv(f'{path}/digital_tsfresh/baselinepredictions{cv}.csv')
        else:
            predictions = pd.read_csv(f'{path}/{cl}/predictions{cv}.csv')
        predictions['cv'] = cv
        preds.append(predictions[['participant','cv','pd','pred','pred_proba']])
        # 'pd' is the true label column, 'pred_proba' the score used for both curves.
        precision, recall, _ = metrics.precision_recall_curve(predictions['pd'],predictions['pred_proba'])
        fpr,tpr,_ = metrics.roc_curve(predictions['pd'],predictions['pred_proba'])
        curves.loc[(cl,cv),'precision'] = precision
        curves.loc[(cl,cv),'recall'] = recall
        curves.loc[(cl,cv),'tpr'] = tpr
        curves.loc[(cl,cv),'fpr'] = fpr
        auroc.loc[(cl,cv),'AUROC'] = metrics.roc_auc_score(predictions['pd'],predictions['pred_proba'])
    preds = pd.concat(preds)

In [None]:
def string_to_array(st):
    """Parse the printed form of a 1-D numeric array back into a float array.

    The first and last characters of ``st`` (the surrounding brackets) are
    dropped and the remainder is split on whitespace. If a token cannot be
    converted to float, every '...' in the text (the marker numpy prints when it
    truncates long arrays) is removed and the parse is retried once.

    Parameters
    ----------
    st : str
        Text such as ``'[0.1 0.2 0.3]'`` or ``'[0.1 0.2 ... 0.9]'``.

    Returns
    -------
    numpy.ndarray
        The parsed values as floats; empty for ``'[]'``.

    Raises
    ------
    ValueError
        If a token is still not a number after the '...' markers were removed.
    """
    def _parse(text):
        return np.array([float(token) for token in text[1:-1].split()])

    try:
        return _parse(st)
    except ValueError:
        # Only a failed float conversion triggers the retry; other errors (wrong
        # input type, KeyboardInterrupt, ...) are no longer swallowed.
        return _parse(st.replace('...',''))

def plot_AUROC_CV(curves,pf,ax,features=None,fnames=None,title=''):
    """Draw the fold-averaged ROC curve of each requested classifier on ``ax``.

    Each fold's ROC curve is interpolated onto a common grid of 100 false
    positive rates in [0, 1]. The mean over folds is drawn as a line and a band
    of +/- 1.96 standard deviations across folds (clipped to [0, 1]) is shaded
    around it. Only the band of the first curve gets the legend entry "95% CI".

    Parameters
    ----------
    curves : pandas.DataFrame
        One row per classifier; the columns are a ``(metric, cv)`` MultiIndex and
        the ``'fpr'`` / ``'tpr'`` cells hold the per-fold curve arrays.
    pf : pandas.DataFrame
        One row per classifier with columns ``('AUROC', 'mean')`` and
        ``('AUROC', 'std')``; used for the legend text.
    ax : matplotlib axes
        Axes to draw on.
    features : sequence, optional
        Row labels of the classifiers to plot. ``None`` or an empty sequence
        selects every row of ``pf``.
    fnames : sequence, optional
        Display names, paired with ``features`` in order. ``None`` or an empty
        sequence reuses ``features``. Pairing uses ``zip``, so surplus entries of
        the longer sequence are ignored.
    title : str, optional
        Legend title.
    """
    if features is None or len(features) == 0:
        features = pf.index
    if fnames is None or len(fnames) == 0:
        fnames = features

    mean_fpr = np.linspace(0, 1, 100)
    for i, (feature, fname) in enumerate(zip(features, fnames)):
        c = curves.loc[feature,:]
        p = pf.loc[feature,:]

        # Use whichever fold labels are present for this classifier instead of
        # assuming exactly five folds labelled 0-4.
        fold_fpr = c.loc['fpr']
        fold_tpr = c.loc['tpr']
        i_tprs = []
        for cv in fold_fpr.index:
            interp_tpr = np.interp(mean_fpr, fold_fpr.loc[cv], fold_tpr.loc[cv])
            interp_tpr[0] = 0.0  # anchor every fold at the origin
            i_tprs.append(interp_tpr)

        mean_tpr = np.mean(i_tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the mean curve at (1, 1)

        # Raw f-string: '\p' is not a valid Python escape sequence, the backslash
        # has to reach matplotlib's mathtext unchanged.
        (line,) = ax.plot(
            mean_fpr,
            mean_tpr,
            label=rf"{fname} (AUROC = {p.loc['AUROC','mean']:.2f} $\pm$  {p.loc['AUROC','std']:.2f})",
            lw=2,
            alpha=0.8)

        std_tpr = np.std(i_tprs, axis=0) * 1.96
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        # One legend entry for the shaded bands is enough: label the first only.
        band_label = {'label': r"95% CI"} if i == 0 else {}
        ax.fill_between(
            mean_fpr,
            tprs_lower,
            tprs_upper,
            color=line.get_color(),
            alpha=0.2,
            **band_label,
        )

        # Kept inside the loop, as before, so labels and the square aspect are
        # applied at the same point of the drawing sequence.
        ax.set(
            xlabel="False Positive Rate",
            ylabel="True Positive Rate",
        )
        ax.axis("square")
    ax.legend(title=f'{title}')
    
def plot_AUPRC_CV(curves,pf,ax,features=None,fnames=None,title=''):
    """Draw the fold-averaged precision-recall curve of each requested classifier.

    Each fold's curve is interpolated onto a common grid of 100 recall values in
    [0, 1]. The stored ``recall`` / ``precision`` arrays are reversed before
    interpolation so that recall is passed to ``np.interp`` in the reverse of its
    stored order. The mean over folds is drawn as a line and a band of +/- 1.96
    standard deviations across folds (clipped to [0, 1]) is shaded around it.
    Only the band of the first curve gets the legend entry "95% CI".

    Parameters
    ----------
    curves : pandas.DataFrame
        One row per classifier; the columns are a ``(metric, cv)`` MultiIndex and
        the ``'recall'`` / ``'precision'`` cells hold the per-fold curve arrays.
    pf : pandas.DataFrame
        One row per classifier with columns ``('AUPRC', 'mean')`` and
        ``('AUPRC', 'std')``; used for the legend text.
    ax : matplotlib axes
        Axes to draw on.
    features : sequence, optional
        Row labels of the classifiers to plot. ``None`` or an empty sequence
        selects every row of ``pf``.
    fnames : sequence, optional
        Display names, paired with ``features`` in order. ``None`` or an empty
        sequence reuses ``features``. Pairing uses ``zip``, so surplus entries of
        the longer sequence are ignored.
    title : str, optional
        Legend title.
    """
    if features is None or len(features) == 0:
        features = pf.index
    if fnames is None or len(fnames) == 0:
        fnames = features

    recall_grid = np.linspace(0, 1, 100)
    for i, (feature, fname) in enumerate(zip(features, fnames)):
        c = curves.loc[feature,:]
        p = pf.loc[feature,:]

        # Use whichever fold labels are present for this classifier instead of
        # assuming exactly five folds labelled 0-4.
        fold_recall = c.loc['recall']
        fold_precision = c.loc['precision']
        fold_curves = []
        for cv in fold_recall.index:
            interp_precision = np.interp(recall_grid, fold_recall.loc[cv][::-1], fold_precision.loc[cv][::-1])
            # NOTE(review): precision at recall 0 is forced to 0.0 for every fold
            # (so the mean also starts at 0). This looks carried over from the ROC
            # variant - confirm it is intended for a precision-recall curve.
            interp_precision[0] = 0.0
            fold_curves.append(interp_precision)

        mean_precision = np.mean(fold_curves, axis=0)

        # Raw f-string: '\p' is not a valid Python escape sequence, the backslash
        # has to reach matplotlib's mathtext unchanged.
        (line,) = ax.plot(
            recall_grid,
            mean_precision,
            label=rf"{fname} (AUPRC = {p.loc['AUPRC','mean']:.2f} $\pm$  {p.loc['AUPRC','std']:.2f})",
            lw=2,
            alpha=0.8)

        spread = np.std(fold_curves, axis=0) * 1.96
        upper = np.minimum(mean_precision + spread, 1)
        lower = np.maximum(mean_precision - spread, 0)
        # One legend entry for the shaded bands is enough: label the first only.
        band_label = {'label': r"95% CI"} if i == 0 else {}
        ax.fill_between(
            recall_grid,
            lower,
            upper,
            color=line.get_color(),
            alpha=0.2,
            **band_label,
        )

        # Kept inside the loop, as before, so labels and the square aspect are
        # applied at the same point of the drawing sequence.
        ax.set(
            xlabel="Recall",
            ylabel="Precision",
        )
        ax.axis("square")
    ax.legend(title=title)

In [None]:
# Move the CV fold level from the row index into the columns, giving one row per
# classifier and (metric, cv) columns as expected by the plotting functions.
# Guarded so that re-running the cell does not unstack a second time.
if isinstance(curves.index, pd.MultiIndex):
    curves = curves.unstack()

In [None]:
# Per-classifier mean and standard deviation over the CV folds:
# `pf` summarises AUROC, `pf_` summarises AUPRC (baseline rows included).
fold_stats = ['mean','std']
pf = auroc.groupby('classifier').agg(fold_stats)
pf_ = scores.groupby('classifier').agg(fold_stats)

In [None]:
# Standard error of the mean AUPRC per classifier: fold standard deviation divided
# by sqrt(5), the number of CV folds. Shown as the cell output.
pf_.loc[:, ('AUPRC','std')] / np.sqrt(5)

In [None]:
# 2x2 figure of fold-averaged precision-recall curves, one panel per comparison
# (baseline, model type, feature set, time frame).
save = 'AUPRC'
subfigs = ['a','b','c','d']
kinds = ['baseline','model','feature set','timeframe']
# Classifier keys per panel ...
features = [['digital_tsfresh','baseline'],
              ['digital_tsfresh', 'rbf_svm','poly_svm','rf'],
              ['digital_tsfresh', 'digital_tsfresh_physical activity','digital_tsfresh_vital signs','digital_tsfresh_sleep'],
              ['digital_tsfresh','digital_tsfresh_lastweek']]
# ... and the legend names shown for them, in the same order.
fnames = [['digital risk score','baseline'],
              ['logistic regression', 'SVM (rbf)','SVM (polynomial)','RF'],
              ['union','physical activity','vital signs','sleep'],
              ['whole','week']]

fig, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,15))
plots.plot_context()  # project helper from ../scripts; presumably sets the plot style - confirm
for panel, ax in enumerate(axes.flatten()):
    plot_AUPRC_CV(curves,pf_,ax=ax,
                  features=features[panel],
                  fnames=fnames[panel],title=kinds[panel])
    # Panel letter just left of the top-left corner, in axes coordinates.
    ax.text(-0.1, 1, subfigs[panel], transform=ax.transAxes,
            size=20, weight='bold')

plt.savefig(f'{img_path}/{save}_95CI.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{img_path}/{save}_95CI.pdf',dpi=300,bbox_inches='tight')

In [None]:
# 2x2 figure of fold-averaged ROC curves, one panel per comparison
# (baseline, model type, feature set, time frame).
save = 'AUROC'
subfigs = ['a','b','c','d']
kinds = ['baseline','model','feature set','timeframe']
# Classifier keys per panel ...
features = [['digital_tsfresh','baseline'],
              ['digital_tsfresh', 'rbf_svm','poly_svm','rf'],
              ['digital_tsfresh', 'digital_tsfresh_physical activity','digital_tsfresh_vital signs','digital_tsfresh_sleep'],
              ['digital_tsfresh','digital_tsfresh_lastweek']]
# ... and the legend names shown for them, in the same order.
fnames = [['digital risk score','baseline'],
              ['logistic regression', 'SVM (rbf)','SVM (polynomial)','RF'],
              ['union','physical activity','vital signs','sleep'],
              ['whole','week']]

fig, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,15))
plots.plot_context()  # project helper from ../scripts; presumably sets the plot style - confirm
for panel, ax in enumerate(axes.flatten()):
    plot_AUROC_CV(curves,pf,ax=ax,
                  features=features[panel],
                  fnames=fnames[panel],title=kinds[panel])
    # Panel letter just left of the top-left corner, in axes coordinates.
    ax.text(-0.1, 1, subfigs[panel], transform=ax.transAxes,
            size=20, weight='bold')

plt.savefig(f'{img_path}/{save}_95CI.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{img_path}/{save}_95CI.pdf',dpi=300,bbox_inches='tight')

In [None]:
# Two-panel figure (A: ROC, B: precision-recall) comparing the model trained on the
# whole time frame with the last-week variant.
save = 'timeframe'
subfigs = ['A','B']
fig, axes = plt.subplots(ncols=2,figsize=(15,5))
plots.plot_context()
# Each panel is described by its drawing function and the matching summary table.
panels = [(plot_AUROC_CV, pf), (plot_AUPRC_CV, pf_)]
for ax, (draw, table), subfig in zip(axes.flatten(), panels, subfigs):
    draw(curves,table,ax=ax,
         features=['digital_tsfresh','digital_tsfresh_lastweek'],
         fnames=['whole','week'],title='Time Frame')
    # Panel letter just left of the top-left corner, in axes coordinates.
    ax.text(-0.1, 1, subfig, transform=ax.transAxes,
            size=20, weight='bold')

plt.savefig(f'{img_path}/{save}.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{img_path}/{save}.pdf',dpi=300,bbox_inches='tight')

In [None]:
# Two-panel figure (A: ROC, B: precision-recall) comparing the full feature union
# with the physical-activity, vital-signs and sleep feature subsets.
save = 'featureset'
subfigs = ['A','B']
fig, axes = plt.subplots(ncols=2,figsize=(15,5))
plots.plot_context()
# Each panel is described by its drawing function and the matching summary table.
panels = [(plot_AUROC_CV, pf), (plot_AUPRC_CV, pf_)]
for ax, (draw, table), subfig in zip(axes.flatten(), panels, subfigs):
    draw(curves,table,ax=ax,
         features=['digital_tsfresh','digital_tsfresh_physical activity','digital_tsfresh_vital signs','digital_tsfresh_sleep'],
         fnames=['union','physical activity','vital signs','sleep'],title='Feature Set')
    # Panel letter just left of the top-left corner, in axes coordinates.
    ax.text(-0.1, 1, subfig, transform=ax.transAxes,
            size=20, weight='bold')

plt.savefig(f'{img_path}/{save}.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{img_path}/{save}.pdf',dpi=300,bbox_inches='tight')

In [None]:
# Two-panel figure (A: ROC, B: precision-recall) comparing logistic regression with
# the RBF SVM, polynomial SVM and random forest models.
save = 'MLmodel'
subfigs = ['A','B']
fig, axes = plt.subplots(ncols=2,figsize=(15,5))
plots.plot_context()
# Each panel is described by its drawing function and the matching summary table.
panels = [(plot_AUROC_CV, pf), (plot_AUPRC_CV, pf_)]
for ax, (draw, table), subfig in zip(axes.flatten(), panels, subfigs):
    draw(curves,table,ax=ax,
         features=['digital_tsfresh','rbf_svm','poly_svm','rf'],
         fnames=['logistic regression','SVM (rbf)','SVM (polynomial)','RF'],title='ML model')
    # Panel letter just left of the top-left corner, in axes coordinates.
    ax.text(-0.1, 1, subfig, transform=ax.transAxes,
            size=20, weight='bold')

plt.savefig(f'{img_path}/{save}.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{img_path}/{save}.pdf',dpi=300,bbox_inches='tight')

In [None]:
# Flat float copies for the t-tests below: the (classifier, cv) index levels become
# ordinary columns.
clean = scores.astype(float).reset_index()
# `baselines` is overwritten in place, so only convert it while it still carries
# the MultiIndex; re-running the cell is then a no-op instead of an error.
if isinstance(baselines.index, pd.MultiIndex):
    baselines = baselines.astype(float).reset_index()

In [None]:
# t-test of the per-fold AUPRC of the main model against the baseline.
# The baseline rows are labelled 'baseline' in the classifier column; filtering
# them on 'digital_tsfresh' selected no rows, leaving the second sample empty.
# NOTE(review): `pg` is not imported in any visible cell - presumably
# `import pingouin as pg`; confirm and add it to the import cell.
pg.ttest(clean.loc[clean['classifier']=='digital_tsfresh','AUPRC'],
         baselines.loc[baselines['classifier']=='baseline','AUPRC'])

In [None]:
# t-tests of the per-fold AUPRC of the main model ('digital_tsfresh') against every
# other classifier. The reference row itself (classifiers[0]) is left unfilled.
# NOTE(review): `pg` is not imported in any visible cell - presumably
# `import pingouin as pg`; confirm and add it to the import cell.
# The column names must line up, in order, with the values that pg.ttest returns.
ttest_columns = ['T',  'dof', 'alternative', 'p-val', 'CI95%' ,  'cohen-d','BF10','power']
ttests = pd.DataFrame(columns=ttest_columns,index=classifiers)
reference_auprc = clean.loc[clean['classifier']=='digital_tsfresh','AUPRC']
for cl in classifiers[1:]:
    other_auprc = clean.loc[clean['classifier']==cl,'AUPRC']
    ttests.loc[cl] = pg.ttest(reference_auprc,other_auprc).values

# Coefficients

In [None]:
def plot_coefs(coefs,save=None,n_folds=5,pvals_path=None):
    """Bar-plot the across-fold mean of the coefficients that differ from zero.

    For every predictor a one-sample t-test of its ``'coef'`` values against 0 is
    run. Predictors with a p-value below the Bonferroni threshold (0.05 divided
    by the number of predictors) are drawn as horizontal bars of their mean
    coefficient, sorted by that mean, with error bars of
    ``t(1 - alpha/2, n_folds - 1) * std / sqrt(n_folds)``.

    Parameters
    ----------
    coefs : pandas.DataFrame
        Coefficients with a two-level row index whose second level is named
        'predictor', and a 'coef' column.
    save : str, optional
        File prefix. When truthy the figure is written to
        ``<save>sign_coefs_acrossfolds.png`` / ``.pdf`` and the p-values to CSV.
    n_folds : int, optional
        Number of observations per predictor used for the standard error and the
        t critical value (default 5).
    pvals_path : str, optional
        Destination of the p-value CSV. When omitted, the previous behaviour is
        kept and the path is built from the notebook globals ``path`` and ``cl``.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with a single column 'p' (uncorrected p-values).
    """
    fig = plt.figure(figsize=(10,10))
    plots.plot_context()

    by_predictor = coefs.groupby('predictor')
    mean = by_predictor.mean()
    sem = by_predictor.std()/np.sqrt(n_folds)

    # Bonferroni correction over all predictors.
    alpha = 0.05/mean.shape[0]
    print(alpha,mean.shape[0])
    critical_value =  stats.t.ppf(1 - alpha/2, df=n_folds - 1)  # Two-tailed test
    error = critical_value * sem

    p_values = pd.DataFrame([stats.ttest_1samp(coefs.loc[(slice(None),c), 'coef'].astype(float),0)[1] for c in mean.index],
                            index=mean.index,
                            columns=['p'])
    significant_coefs = mean[p_values['p'] < alpha]
    # Disabled fallback kept from the original for reference:
    #if significant_coefs.shape[0]==0:
    #    print('none reached corrected p-thresh, use 0.05 instead')
    #    significant_coefs = mean[p_values['p'] < 0.05]
    ax = sns.barplot(y='predictor',x='coef',data=significant_coefs.reset_index().sort_values('coef'),color='gray')
    # Bars are drawn in sorted order, so position i belongs to the i-th sorted row.
    # `.items()` replaces `.iteritems()`, which no longer exists in pandas >= 2.0.
    for i, (feature, mean_coef) in enumerate(significant_coefs.sort_values('coef').T.items()):
        ax.errorbar(x=mean_coef, y=i, xerr=error.loc[feature], fmt='none', ecolor='black')

    if save:
        plt.savefig(f'{save}sign_coefs_acrossfolds.png',dpi=300,bbox_inches='tight')
        plt.savefig(f'{save}sign_coefs_acrossfolds.pdf',dpi=300,bbox_inches='tight')
        if pvals_path is None:
            # Legacy default: depends on the notebook-level variables `path` and
            # `cl` being set by the calling cell.
            pvals_path = f'{path}/{cl}/coefs_pvals.csv'
        p_values.to_csv(pvals_path)
    plt.show()
    return p_values

In [None]:
# Coefficient summary, run for the last entry of `classifiers` only.
# The loop variable has to stay named `cl` at notebook scope: plot_coefs reads the
# globals `cl` and `path` to decide where coefs_pvals.csv is written.
for cl in [classifiers[-1]]:
    # The first two CSV columns form the row MultiIndex of `coefs`.
    coefs = pd.read_csv(f'{path}/{cl}/coefs.csv',index_col=[0,1])
    # NOTE(review): the figures go to a hard-coded /scratch directory rather than
    # `img_path` - confirm this is intended.
    pvals = plot_coefs(coefs,save=f'/scratch/c.c21013066/images/paper/digitalPPMI/{cl}_')
    # 20 predictors with the smallest p-values.
    print(pvals.sort_values('p').head(20))
    # `top` is only referenced by the commented-out line at the end of this cell.
    top = pvals.sort_values('p').index
    # Per-predictor mean and std of the coefficients, with the p-value attached.
    sta = coefs.groupby('predictor').agg(['mean','std'])
    sta['p'] = pvals.loc[sta.index,:]
    print(sta.sort_values('p').head(20))
    print(sta.shape)
    #print(coefs.groupby('predictor').agg(['mean','std'])[top])
    #print(coefs.groupby('predictor').agg(['mean','std']).dropna().sort_values(by=('coef', 'mean')))