### Import

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

### Read all cutoffs from each method

In [2]:
def load_cutoffs(directory):
    """Read every ``*.csv`` file found directly inside ``directory``.

    The dataframes and their names are derived from the same sorted list of
    paths, so the two returned lists always line up element by element, even
    when the directory also contains non-CSV entries (hidden files, notebook
    checkpoint folders, ...).

    Parameters
    ----------
    directory : str
        Folder that holds the cutoff files of one method.

    Returns
    -------
    (list of pandas.DataFrame, list of str)
        The dataframes (first CSV column used as index) and, in the same
        order, each file name truncated at its first dot.
    """
    paths = sorted(glob.glob(os.path.join(directory, "*.csv")))
    frames = [pd.read_csv(path, index_col=0, low_memory=False) for path in paths]
    # Same naming rule as before: everything before the first "." of the file name.
    names = [os.path.basename(path).split(".")[0] for path in paths]
    return frames, names


datasets_1, files_1 = load_cutoffs('../results/bootstrap/gmm/cutoffs/')

datasets_2, files_2 = load_cutoffs('../results/bootstrap/kmeans/cutoffs/')

datasets_3, files_3 = load_cutoffs('../results/bootstrap/roc/cutoffs/')

datasets_4, files_4 = load_cutoffs('../results/bootstrap/tertile/cutoffs/')

datasets_5, files_5 = load_cutoffs('../results/bootstrap/mean_sd/cutoffs/')

In [3]:
# One dictionary per method, mapping each file name to the dataframe read from that file.
gmm = dict(zip(files_1, datasets_1))
kmeans = dict(zip(files_2, datasets_2))
roc = dict(zip(files_3, datasets_3))
tertile = dict(zip(files_4, datasets_4))
mean_sd = dict(zip(files_5, datasets_5))

In [4]:
# Before the dataframes are combined, shorten every K-Means column name to the
# part in front of its first underscore. The frames are modified in place.
for kmeans_frame in kmeans.values():
    kmeans_frame.rename(columns=lambda label: label.split('_')[0], inplace=True)

### Preprocess the cutoffs
* Build one dictionary per method that holds all of its cutoffs
    * keys are the cohort names
    * each value is a dataframe with the cutoffs of every CSF biomarker, one row per bootstrap iteration

In [5]:
def cohort_based_cutoffs(method_dfs, n_bootstraps=1000):
    """Regroup the cutoffs of one method from "one table per file" to "one table per cohort".

    Parameters
    ----------
    method_dfs : dict
        Maps a name ending in ``_<integer>`` to a dataframe whose index holds
        the cohorts and whose columns hold the biomarkers. The first
        dataframe defines which cohorts and biomarkers are collected.
    n_bootstraps : int, optional
        Number of rows (labelled ``0 .. n_bootstraps - 1``) pre-allocated in
        every returned dataframe. Defaults to 1000.

    Returns
    -------
    dict
        Maps each cohort to a dataframe with one column per biomarker; the
        row labelled ``k`` holds the values taken from the entry of
        ``method_dfs`` whose name ends in ``_k``. Rows without a matching
        entry stay NaN. An empty ``method_dfs`` yields an empty dict.
    """
    if not method_dfs:
        return {}

    template = next(iter(method_dfs.values()))
    cohorts = template.index
    # Read the biomarkers from the template rather than from a hard-coded cohort
    # so the function also works for data that has no 'ADNI' row.
    biomarkers = template.columns

    cutpoints = {cohort: pd.DataFrame(index=list(range(n_bootstraps)), columns=biomarkers)
                 for cohort in cohorts}

    for name, frame in method_dfs.items():
        # The trailing integer of the name is used as the row label.
        iteration = int(name.split('_')[-1])

        for cohort in cohorts:

            for bioma in biomarkers:
                cutpoints[cohort].loc[iteration, bioma] = frame.loc[cohort, bioma]

    return cutpoints

In [6]:
gmm_new = cohort_based_cutoffs(gmm)
kmeans_new = cohort_based_cutoffs(kmeans)
roc_new = cohort_based_cutoffs(roc)
tertile_new = cohort_based_cutoffs(tertile)
mean_sd_new = cohort_based_cutoffs(mean_sd)

### Calculate the confidence intervals of CSF cutoffs for each method 

In [66]:
# One empty table per method. Rows and columns are copied from the index and
# columns of the first GMM dataframe.
first_gmm_frame = gmm[list(gmm.keys())[0]]
method_labels = ['GMM', 'K-Means', 'Tertile', 'ROC', 'Mean ±2 SD']

results = {
    label: pd.DataFrame(index=first_gmm_frame.index, columns=first_gmm_frame.columns)
    for label in method_labels
}

results_percentage = {
    label: pd.DataFrame(index=first_gmm_frame.index, columns=first_gmm_frame.columns)
    for label in method_labels
}

In [67]:
def mean_confidence_interval(methodologies, results, confidence=0.95, percentages=None):
    """Write 'mean [lower, upper]' strings for every method, cohort and column.

    For each column the Student-t confidence interval of the mean is
    ``mean ± sem * t.ppf((1 + confidence) / 2, n - 1)``, where ``n`` counts
    only the non-NaN values, so the degrees of freedom match the sample the
    standard error is computed from.

    Parameters
    ----------
    methodologies : dict
        ``{method: {cohort: DataFrame}}``; every column of a dataframe is
        treated as one sample of values.
    results : dict
        ``{method: DataFrame}``; the cell ``[cohort, column]`` receives
        ``'<mean> [<lower>, <upper>]'`` (rounded to 2 decimals). Modified in
        place.
    confidence : float, optional
        Confidence level of the interval. Defaults to 0.95.
    percentages : dict, optional
        Same layout as ``results``; receives the interval width relative to
        the mean as ``'<value> %'``. When omitted, the module-level
        ``results_percentage`` is filled instead.

    Returns
    -------
    None
        Columns with fewer than two non-NaN values have no defined interval;
        their cells are left untouched.
    """
    if percentages is None:
        # Backward-compatible fallback to the table defined at module level.
        percentages = results_percentage

    for method, cohort_frames in methodologies.items():

        for cohort, frame in cohort_frames.items():

            for col in frame.columns:
                values = np.asarray(frame[col], dtype=float)
                valid = values[~np.isnan(values)]
                n = valid.size
                if n < 2:
                    # The standard error (ddof=1) is undefined for 0 or 1 observation.
                    continue
                m, se = valid.mean(), stats.sem(valid)
                h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
                results[method].loc[cohort, col] = f'{round(m, 2)} [{round(m-h, 2)}, {round(m+h, 2)}]'
                percentages[method].loc[cohort, col] = f'{round((((m+h) - (m-h))/m) * 100, 2)} %'

In [68]:
# Per-cohort cutoff tables of every method, keyed by the label used in the result tables.
methodologies = {
    'GMM': gmm_new,
    'K-Means': kmeans_new,
    'ROC': roc_new,
    'Tertile': tertile_new,
    'Mean ±2 SD': mean_sd_new,
}

In [69]:
# Fills the `results` tables in place (the function returns nothing).
mean_confidence_interval(methodologies=methodologies, results=results)

In [70]:
# Save the methods side by side. dropna() removes every cohort row that has at
# least one empty cell, so only cohorts with a value everywhere are written.
(
    pd.concat(results, axis=1)
    .dropna()
    .to_csv("../results/bootstrap/confidence_intervals.csv")
)

In [71]:
# `results` starts as a {method: DataFrame} dict and is replaced here by one
# wide table. The guard keeps the cell safe to re-run: passing the combined
# DataFrame to pd.concat a second time would raise a TypeError.
if isinstance(results, dict):
    results = pd.concat(results, axis=1)

In [73]:
# Show the combined table: methods on the outer column level, biomarkers on the
# inner level, one row per cohort.
results

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,"986.93 [986.01, 987.85]","34.68 [34.57, 34.79]","356.89 [356.05, 357.73]","972.58 [971.58, 973.58]","28.08 [28.04, 28.11]","288.2 [287.82, 288.58]","833.25 [831.24, 835.27]","21.84 [21.79, 21.9]","240.07 [239.49, 240.64]","819.44 [815.79, 823.09]","23.58 [23.48, 23.68]","259.0 [258.0, 260.0]","283.83 [281.84, 285.82]","40.74 [40.63, 40.85]","408.5 [407.59, 409.41]"
EPAD,"1035.65 [1034.95, 1036.35]","27.78 [27.73, 27.83]","307.01 [306.62, 307.39]","1033.32 [1032.64, 1034.0]","19.85 [19.82, 19.87]","223.71 [223.48, 223.94]","884.48 [883.54, 885.43]","19.86 [19.83, 19.89]","228.16 [227.85, 228.47]","684.74 [680.52, 688.95]","23.93 [23.57, 24.29]","267.86 [263.32, 272.4]","339.02 [338.1, 339.94]","43.27 [43.18, 43.37]","435.94 [435.2, 436.67]"
AIBL,"749.01 [743.23, 754.79]","87.82 [86.77, 88.87]","604.49 [596.99, 611.99]","683.97 [681.22, 686.72]","77.67 [76.96, 78.37]","530.63 [523.33, 537.94]",,,,,,,,,
ARWIBO,"589.23 [584.97, 593.48]","155.03 [148.17, 161.89]","531.55 [526.67, 536.44]","523.83 [521.71, 525.95]","69.14 [68.89, 69.39]","432.09 [428.8, 435.38]",,,,,,,,,
EDSD,"782.2 [776.11, 788.3]","119.22 [117.68, 120.77]","584.84 [576.22, 593.47]","741.8 [736.1, 747.49]","86.71 [86.01, 87.41]","547.57 [542.96, 552.18]",,,,,,,,,
PREVENT-AD,"1187.4 [1158.64, 1216.16]","71.13 [70.26, 72.01]","466.54 [457.22, 475.85]","1146.42 [1144.08, 1148.75]","50.78 [50.55, 51.01]","308.78 [306.64, 310.93]","1098.71 [1096.81, 1100.61]","53.0 [52.86, 53.14]","303.28 [302.51, 304.05]",,,,"593.62 [589.69, 597.56]","89.24 [88.85, 89.63]","593.32 [589.49, 597.15]"
PharmaCog,"792.94 [791.15, 794.74]","92.99 [92.71, 93.27]","773.42 [750.81, 796.03]","762.16 [759.54, 764.78]","68.62 [68.03, 69.2]","466.9 [464.62, 469.18]",,,,,,,,,
NACC_ELISA,"594.13 [592.61, 595.66]","81.67 [80.86, 82.48]","564.65 [552.43, 576.87]","586.06 [584.92, 587.2]","61.63 [61.42, 61.84]","492.07 [490.2, 493.94]","625.54 [623.58, 627.49]","48.25 [48.12, 48.39]","371.3 [369.63, 372.98]","451.45 [448.13, 454.78]","61.21 [60.86, 61.57]","500.58 [496.03, 505.13]","294.38 [292.59, 296.17]","87.31 [86.82, 87.79]","725.33 [721.96, 728.7]"
EMIF_ELISA,"742.14 [741.23, 743.04]","82.08 [81.9, 82.27]","526.56 [524.2, 528.92]","697.67 [695.23, 700.1]","64.87 [64.78, 64.95]","432.54 [431.71, 433.37]","540.96 [539.73, 542.18]","52.28 [52.18, 52.39]","268.8 [268.03, 269.56]","580.15 [575.77, 584.54]","62.59 [62.11, 63.06]","335.71 [333.65, 337.76]","175.24 [173.68, 176.81]","94.82 [94.3, 95.33]","653.76 [645.39, 662.14]"
NACC_XMAP,"298.68 [296.97, 300.39]","60.84 [59.93, 61.75]","94.54 [93.82, 95.27]","290.71 [288.43, 292.98]","40.15 [40.02, 40.28]","70.22 [70.0, 70.44]","248.47 [247.68, 249.25]","36.49 [36.42, 36.56]","54.0 [53.83, 54.16]","225.31 [224.77, 225.85]","40.27 [40.09, 40.46]","62.7 [62.13, 63.26]","23.56 [22.54, 24.58]","68.87 [68.46, 69.28]","99.27 [98.79, 99.75]"


In [72]:
# Interval width relative to the mean, (upper - lower) / mean * 100, for all
# methods side by side.
pd.concat(results_percentage, axis=1)

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.19 %,0.64 %,0.47 %,0.21 %,0.25 %,0.26 %,0.48 %,0.5 %,0.48 %,0.89 %,0.85 %,0.77 %,1.4 %,0.53 %,0.45 %
EPAD,0.14 %,0.33 %,0.25 %,0.13 %,0.22 %,0.2 %,0.21 %,0.28 %,0.27 %,1.23 %,3.02 %,3.39 %,0.54 %,0.43 %,0.34 %
AIBL,1.54 %,2.39 %,2.48 %,0.8 %,1.81 %,2.75 %,,,,,,,,,
ARWIBO,1.44 %,8.85 %,1.84 %,0.81 %,0.72 %,1.52 %,,,,,,,,,
EDSD,1.56 %,2.59 %,2.95 %,1.54 %,1.61 %,1.68 %,,,,,,,,,
PREVENT-AD,4.84 %,2.46 %,3.99 %,0.41 %,0.9 %,1.39 %,0.35 %,0.52 %,0.51 %,,,,1.33 %,0.87 %,1.29 %
PharmaCog,0.45 %,0.61 %,5.85 %,0.69 %,1.71 %,0.98 %,,,,,,,,,
NACC_ELISA,0.51 %,1.99 %,4.33 %,0.39 %,0.69 %,0.76 %,0.62 %,0.55 %,0.9 %,1.47 %,1.16 %,1.82 %,1.22 %,1.11 %,0.93 %
EMIF_ELISA,0.24 %,0.46 %,0.9 %,0.7 %,0.27 %,0.38 %,0.45 %,0.39 %,0.57 %,1.51 %,1.51 %,1.22 %,1.78 %,1.08 %,2.56 %
NACC_XMAP,1.15 %,2.99 %,1.54 %,1.56 %,0.64 %,0.64 %,0.63 %,0.38 %,0.61 %,0.48 %,0.92 %,1.8 %,8.67 %,1.19 %,0.97 %


In [53]:
# NOTE(review): the saved output of this cell has no ' %' suffix although
# mean_confidence_interval writes one, and its execution count (53) is lower
# than that of the cells above. The output looks stale; re-run the notebook
# from a fresh kernel to confirm.
results_percentage['GMM']

Unnamed: 0,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.19,0.64,0.47
EPAD,0.14,0.33,0.25
AIBL,1.54,2.39,2.48
ARWIBO,1.44,8.85,1.84
EDSD,1.56,2.59,2.95
PREVENT-AD,4.84,2.46,3.99
PharmaCog,0.45,0.61,5.85
NACC_ELISA,0.51,1.99,4.33
EMIF_ELISA,0.24,0.46,0.9
NACC_XMAP,1.15,2.99,1.54
