### Import

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

### Read all cutoffs from each method

In [2]:
def load_cutoff_tables(directory, pattern="*.csv"):
    """Read every cutoff table in `directory` and return them with their names.

    Tables and names are both derived from the same sorted list of matching
    paths, so they always line up one-to-one (a stray non-CSV file in the
    directory can no longer shift the pairing).

    Parameters
    ----------
    directory : str
        Folder that holds one CSV file per bootstrap iteration.
    pattern : str, default "*.csv"
        Glob pattern selecting the files to read.

    Returns
    -------
    tables : list of pandas.DataFrame
        One frame per file, read with the first column as index.
    names : list of str
        File base names truncated at the first dot, in the same order.
    """
    import warnings  # local import: only needed for the empty-directory notice

    paths = sorted(glob.glob(os.path.join(directory, pattern)))
    if not paths:
        # glob returns an empty list for a missing or empty folder; say so
        # explicitly instead of failing later with an opaque IndexError.
        warnings.warn(f"No files matching {pattern!r} found in {directory!r}")

    tables = [pd.read_csv(path, index_col=0, low_memory=False) for path in paths]
    names = [os.path.basename(path).split(".")[0] for path in paths]
    return tables, names


datasets_1, files_1 = load_cutoff_tables('../results/bootstrap/gmm/cutoffs/')

datasets_2, files_2 = load_cutoff_tables('../results/bootstrap/kmeans/cutoffs/')

datasets_3, files_3 = load_cutoff_tables('../results/bootstrap/roc/cutoffs/')

datasets_4, files_4 = load_cutoff_tables('../results/bootstrap/tertile/cutoffs/')

datasets_5, files_5 = load_cutoff_tables('../results/bootstrap/mean_sd/cutoffs/')

In [3]:
# One dictionary per method: bootstrap file name -> cutoff table (DataFrame).
gmm = dict(zip(files_1, datasets_1))
kmeans = dict(zip(files_2, datasets_2))
roc = dict(zip(files_3, datasets_3))
tertile = dict(zip(files_4, datasets_4))
mean_sd = dict(zip(files_5, datasets_5))

In [4]:
# Shorten every K-Means column label to the text before its first underscore,
# so the labels can be combined with the other methods' tables.
for table in kmeans.values():
    table.rename(columns=lambda label: label.split('_')[0], inplace=True)

### Preprocess the cutoffs
* make a dictionary with all the cutoffs
    * keys are the cohort names
    * values are dataframes holding all the cutoffs of the CSF biomarkers for that cohort

In [5]:
def cohort_based_cutoffs(method_dfs, n_bootstraps=1000):
    """Regroup per-bootstrap cutoff tables into one table per cohort.

    Parameters
    ----------
    method_dfs : dict
        Maps a name ending in ``_<integer>`` (the bootstrap iteration) to a
        DataFrame whose index holds the cohorts and whose columns hold the
        biomarkers. The first entry defines the cohorts and biomarkers used
        for every table.
    n_bootstraps : int, default 1000
        Number of rows pre-allocated in each output table (iterations
        ``0 .. n_bootstraps - 1``). Iterations without a file stay NaN.

    Returns
    -------
    dict
        ``{cohort: DataFrame}`` with one row per bootstrap iteration and one
        column per biomarker.

    Raises
    ------
    ValueError
        If `method_dfs` is empty.
    """
    if not method_dfs:
        raise ValueError("method_dfs is empty: no bootstrap cutoff tables to regroup")

    # The first table acts as the template for cohorts and biomarkers.
    template = method_dfs[list(method_dfs)[0]]
    cohorts = template.index
    biomarkers = template.columns

    cutpoints = {
        cohort: pd.DataFrame(index=list(range(n_bootstraps)), columns=biomarkers)
        for cohort in cohorts
    }

    for name, table in method_dfs.items():
        # The trailing integer of the name is the bootstrap iteration number.
        iteration = int(name.split('_')[-1])
        for cohort in cohorts:
            for bioma in biomarkers:
                cutpoints[cohort].loc[iteration, bioma] = table.loc[cohort, bioma]

    return cutpoints

In [16]:
# Regroup each method's bootstrap tables by cohort (same order as the names on the left).
gmm_new, kmeans_new, roc_new, tertile_new, mean_sd_new = (
    cohort_based_cutoffs(method_tables)
    for method_tables in (gmm, kmeans, roc, tertile, mean_sd)
)

### Calculate the confidence intervals of CSF cutoffs for each method 

In [17]:
# Empty output tables, one per method. Rows and columns are copied from the
# first GMM bootstrap table; cells are filled in by mean_confidence_interval.
first_gmm_table = gmm[list(gmm)[0]]
method_labels = ['GMM', 'K-Means', 'Tertile', 'ROC', 'Mean ±2 SD']

results = {
    label: pd.DataFrame(index=first_gmm_table.index, columns=first_gmm_table.columns)
    for label in method_labels
}

results_percentage = {
    label: pd.DataFrame(index=first_gmm_table.index, columns=first_gmm_table.columns)
    for label in method_labels
}

In [18]:
def mean_confidence_interval(methodologies, results, confidence=0.95, results_percentage=None):
    """Write the mean and its confidence interval for every cutoff column.

    For each method, cohort and column, the mean and a Student-t confidence
    interval of the mean are computed over the non-missing values and stored
    as the string ``"mean [lower, upper]"`` (rounded to one decimal).

    Parameters
    ----------
    methodologies : dict
        ``{method: {cohort: DataFrame}}``; each DataFrame column holds the
        values to summarise.
    results : dict
        ``{method: DataFrame}``; updated in place at ``.loc[cohort, column]``.
    confidence : float, default 0.95
        Two-sided confidence level of the interval.
    results_percentage : dict, optional
        ``{method: DataFrame}`` that receives the interval width as a
        percentage of the mean (``"x.xx %"``). When omitted, the module-level
        ``results_percentage`` is used if it exists; otherwise the percentage
        output is skipped.

    Returns
    -------
    None
        Both output dictionaries are modified in place.

    Notes
    -----
    Columns with fewer than two non-missing values are left untouched because
    neither a standard error nor an interval is defined for them.
    """
    if results_percentage is None:
        # Backward compatibility: earlier versions always wrote to the global table.
        results_percentage = globals().get("results_percentage")

    for method, cohort_tables in methodologies.items():
        for cohort, table in cohort_tables.items():
            for col in table.columns:
                values = np.asarray(table[col], dtype=float)
                values = values[~np.isnan(values)]

                # Degrees of freedom must be based on the values actually used,
                # not on the column length including NaN placeholders.
                n = values.size
                if n < 2:
                    continue

                m = values.mean()
                se = stats.sem(values)
                h = se * stats.t.ppf((1 + confidence) / 2., n - 1)

                results[method].loc[cohort, col] = f'{round(m, 1)} [{round(m-h, 1)}, {round(m+h, 1)}]'
                if results_percentage is not None:
                    results_percentage[method].loc[cohort, col] = f'{round((((m+h) - (m-h))/m) * 100, 2)} %'

In [19]:
# Cohort-wise bootstrap cutoffs of every method, keyed by the method's label
# in the result tables.
methodologies = dict([
    ('GMM', gmm_new),
    ('K-Means', kmeans_new),
    ('ROC', roc_new),
    ('Tertile', tertile_new),
    ('Mean ±2 SD', mean_sd_new),
])

In [20]:
# Fill the result tables in place (the function returns nothing).
mean_confidence_interval(methodologies=methodologies, results=results)

In [22]:
# Put the per-method tables side by side; the method label becomes the outer
# column level. The guard keeps this cell re-runnable: once `results` is
# already the combined DataFrame, concatenating it again would raise.
if isinstance(results, dict):
    results = pd.concat(results, axis=1)

In [24]:
# Persist the combined confidence-interval table.
results.to_csv(path_or_buf="../results/bootstrap/confidence_intervals.csv")

In [25]:
# Show the combined table: one row per cohort, columns grouped by method and biomarker.
results

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,"986.9 [986.0, 987.9]","34.7 [34.6, 34.8]","356.9 [356.1, 357.7]","972.6 [971.6, 973.6]","28.1 [28.0, 28.1]","288.2 [287.8, 288.6]","833.3 [831.2, 835.3]","21.8 [21.8, 21.9]","240.1 [239.5, 240.6]","819.4 [815.8, 823.1]","23.6 [23.5, 23.7]","259.0 [258.0, 260.0]","283.8 [281.8, 285.8]","40.7 [40.6, 40.8]","408.5 [407.6, 409.4]"
EPAD,"1035.7 [1035.0, 1036.4]","27.8 [27.7, 27.8]","307.0 [306.6, 307.4]","1033.3 [1032.6, 1034.0]","19.8 [19.8, 19.9]","223.7 [223.5, 223.9]","884.5 [883.5, 885.4]","19.9 [19.8, 19.9]","228.2 [227.9, 228.5]","684.7 [680.5, 688.9]","23.9 [23.6, 24.3]","267.9 [263.3, 272.4]","339.0 [338.1, 339.9]","43.3 [43.2, 43.4]","435.9 [435.2, 436.7]"
AIBL,"749.0 [743.2, 754.8]","87.8 [86.8, 88.9]","604.5 [597.0, 612.0]","684.0 [681.2, 686.7]","77.7 [77.0, 78.4]","530.6 [523.3, 537.9]",,,,,,,,,
ARWIBO,"589.2 [585.0, 593.5]","155.0 [148.2, 161.9]","531.6 [526.7, 536.4]","523.8 [521.7, 526.0]","69.1 [68.9, 69.4]","432.1 [428.8, 435.4]",,,,,,,,,
EDSD,"782.2 [776.1, 788.3]","119.2 [117.7, 120.8]","584.8 [576.2, 593.5]","741.8 [736.1, 747.5]","86.7 [86.0, 87.4]","547.6 [543.0, 552.2]",,,,,,,,,
PREVENT-AD,"1187.4 [1158.6, 1216.2]","71.1 [70.3, 72.0]","466.5 [457.2, 475.8]","1146.4 [1144.1, 1148.8]","50.8 [50.6, 51.0]","308.8 [306.6, 310.9]","1098.7 [1096.8, 1100.6]","53.0 [52.9, 53.1]","303.3 [302.5, 304.1]",,,,"593.6 [589.7, 597.6]","89.2 [88.9, 89.6]","593.3 [589.5, 597.2]"
PharmaCog,"792.9 [791.1, 794.7]","93.0 [92.7, 93.3]","773.4 [750.8, 796.0]","762.2 [759.5, 764.8]","68.6 [68.0, 69.2]","466.9 [464.6, 469.2]",,,,,,,,,
NACC_ELISA,"594.1 [592.6, 595.7]","81.7 [80.9, 82.5]","564.6 [552.4, 576.9]","586.1 [584.9, 587.2]","61.6 [61.4, 61.8]","492.1 [490.2, 493.9]","625.5 [623.6, 627.5]","48.3 [48.1, 48.4]","371.3 [369.6, 373.0]","451.5 [448.1, 454.8]","61.2 [60.9, 61.6]","500.6 [496.0, 505.1]","294.4 [292.6, 296.2]","87.3 [86.8, 87.8]","725.3 [722.0, 728.7]"
EMIF_ELISA,"742.1 [741.2, 743.0]","82.1 [81.9, 82.3]","526.6 [524.2, 528.9]","697.7 [695.2, 700.1]","64.9 [64.8, 65.0]","432.5 [431.7, 433.4]","541.0 [539.7, 542.2]","52.3 [52.2, 52.4]","268.8 [268.0, 269.6]","580.2 [575.8, 584.5]","62.6 [62.1, 63.1]","335.7 [333.7, 337.8]","175.2 [173.7, 176.8]","94.8 [94.3, 95.3]","653.8 [645.4, 662.1]"
NACC_XMAP,"298.7 [297.0, 300.4]","60.8 [59.9, 61.7]","94.5 [93.8, 95.3]","290.7 [288.4, 293.0]","40.1 [40.0, 40.3]","70.2 [70.0, 70.4]","248.5 [247.7, 249.3]","36.5 [36.4, 36.6]","54.0 [53.8, 54.2]","225.3 [224.8, 225.9]","40.3 [40.1, 40.5]","62.7 [62.1, 63.3]","23.6 [22.5, 24.6]","68.9 [68.5, 69.3]","99.3 [98.8, 99.7]"


In [26]:
# Interval width as a percentage of the mean, all methods side by side.
pd.concat(results_percentage, axis="columns")

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.19 %,0.64 %,0.47 %,0.21 %,0.25 %,0.26 %,0.48 %,0.5 %,0.48 %,0.89 %,0.85 %,0.77 %,1.4 %,0.53 %,0.45 %
EPAD,0.14 %,0.33 %,0.25 %,0.13 %,0.22 %,0.2 %,0.21 %,0.28 %,0.27 %,1.23 %,3.02 %,3.39 %,0.54 %,0.43 %,0.34 %
AIBL,1.54 %,2.39 %,2.48 %,0.8 %,1.81 %,2.75 %,,,,,,,,,
ARWIBO,1.44 %,8.85 %,1.84 %,0.81 %,0.72 %,1.52 %,,,,,,,,,
EDSD,1.56 %,2.59 %,2.95 %,1.54 %,1.61 %,1.68 %,,,,,,,,,
PREVENT-AD,4.84 %,2.46 %,3.99 %,0.41 %,0.9 %,1.39 %,0.35 %,0.52 %,0.51 %,,,,1.33 %,0.87 %,1.29 %
PharmaCog,0.45 %,0.61 %,5.85 %,0.69 %,1.71 %,0.98 %,,,,,,,,,
NACC_ELISA,0.51 %,1.99 %,4.33 %,0.39 %,0.69 %,0.76 %,0.62 %,0.55 %,0.9 %,1.47 %,1.16 %,1.82 %,1.22 %,1.11 %,0.93 %
EMIF_ELISA,0.24 %,0.46 %,0.9 %,0.7 %,0.27 %,0.38 %,0.45 %,0.39 %,0.57 %,1.51 %,1.51 %,1.22 %,1.78 %,1.08 %,2.56 %
NACC_XMAP,1.15 %,2.99 %,1.54 %,1.56 %,0.64 %,0.64 %,0.63 %,0.38 %,0.61 %,0.48 %,0.92 %,1.8 %,8.67 %,1.19 %,0.97 %


In [53]:
# Interval widths (percent of the mean) for the GMM method only.
# NOTE(review): the saved output of this cell shows bare numbers without the
# " %" suffix that mean_confidence_interval writes, and its execution count
# is out of sequence -- it appears to come from a different kernel state.
# Re-run the notebook top to bottom before relying on this output.
results_percentage['GMM']

Unnamed: 0,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.19,0.64,0.47
EPAD,0.14,0.33,0.25
AIBL,1.54,2.39,2.48
ARWIBO,1.44,8.85,1.84
EDSD,1.56,2.59,2.95
PREVENT-AD,4.84,2.46,3.99
PharmaCog,0.45,0.61,5.85
NACC_ELISA,0.51,1.99,4.33
EMIF_ELISA,0.24,0.46,0.9
NACC_XMAP,1.15,2.99,1.54
