### Import

In [47]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

### Read all cutoffs from each method

In [48]:
def load_cutoff_tables(directory):
    """Read every ``*.csv`` file in ``directory`` and return the tables with their names.

    The table list and the name list are both derived from the same sorted
    glob result, so ``names[k]`` always belongs to ``tables[k]``. Taking the
    names from ``os.listdir`` instead would also pick up non-CSV entries
    (hidden files, checkpoint folders, ...) and silently shift the pairing.

    Parameters
    ----------
    directory : str
        Folder that holds the cutoff CSV files.

    Returns
    -------
    tuple of (list of pandas.DataFrame, list of str)
        The tables (first CSV column used as index) and, for each table, the
        file name up to its first ``"."``.

    Raises
    ------
    FileNotFoundError
        If ``directory`` does not exist.
    """
    if not os.path.isdir(directory):
        # Fail loudly on a wrong path; glob alone would just return nothing.
        raise FileNotFoundError(f"Cutoff directory not found: {directory}")

    paths = sorted(glob.glob(os.path.join(directory, "*.csv")))
    tables = [pd.read_csv(path, index_col=0, low_memory=False) for path in paths]
    names = [os.path.basename(path).split(".")[0] for path in paths]
    return tables, names


datasets_1, files_1 = load_cutoff_tables('../results/bootstrap/gmm/cutoffs/')
datasets_2, files_2 = load_cutoff_tables('../results/bootstrap/kmeans/cutoffs/')
datasets_3, files_3 = load_cutoff_tables('../results/bootstrap/roc/cutoffs/')
datasets_4, files_4 = load_cutoff_tables('../results/bootstrap/tertile/cutoffs/')
datasets_5, files_5 = load_cutoff_tables('../results/bootstrap/mean_sd/cutoffs/')

In [49]:
# One dictionary per method: file name (without extension) -> cutoff DataFrame.
# zip() pairs each name with the table read from the same position in the list.
gmm = dict(zip(files_1, datasets_1))
kmeans = dict(zip(files_2, datasets_2))
roc = dict(zip(files_3, datasets_3))
tertile = dict(zip(files_4, datasets_4))
mean_sd = dict(zip(files_5, datasets_5))

In [50]:
# Strip everything after the first underscore from the K-Means column names
# so they line up with the other methods before the tables are combined.
for kmeans_table in kmeans.values():
    kmeans_table.rename(columns=lambda label: label.split('_')[0], inplace=True)

### Preprocess the cutoffs
* Build one dictionary per method that holds all of its cutoffs
    * keys are the cohort names
    * values are DataFrames with all the bootstrap cutoffs of the CSF biomarkers for that cohort

In [51]:
def cohort_based_cutoffs(method_dfs, n_bootstraps=1000):
    """Regroup per-bootstrap cutoff tables into one table per cohort.

    Parameters
    ----------
    method_dfs : dict
        Maps a name whose last ``_``-separated token is an integer (used as
        the row label of the output) to a DataFrame indexed by cohort with
        one column per biomarker.
    n_bootstraps : int, default 1000
        Number of rows (labels ``0 .. n_bootstraps - 1``) pre-allocated in
        every per-cohort table.

    Returns
    -------
    dict
        cohort -> DataFrame whose rows are the bootstrap numbers and whose
        columns are those of the first table in ``method_dfs``. Cells that
        are never filled stay NaN. An empty ``method_dfs`` yields ``{}``.
    """
    if not method_dfs:
        return {}

    # The first table defines which cohorts and biomarkers are collected.
    # (Previously the columns were read from a hard-coded 'ADNI' entry, which
    # raised KeyError whenever that cohort was absent.)
    reference = method_dfs[list(method_dfs.keys())[0]]
    cohorts = reference.index
    biomarkers = reference.columns

    cutpoints = {cohort: pd.DataFrame(index=list(range(n_bootstraps)), columns=biomarkers)
                 for cohort in cohorts}

    for name, table in method_dfs.items():
        # The trailing integer of the table name is the bootstrap number.
        row = int(name.split('_')[-1])
        for cohort in cohorts:
            for bioma in biomarkers:
                cutpoints[cohort].loc[row, bioma] = table.loc[cohort, bioma]

    return cutpoints

In [52]:
# Regroup every method's bootstrap tables into cohort -> DataFrame dictionaries.
gmm_new, kmeans_new, roc_new, tertile_new, mean_sd_new = (
    cohort_based_cutoffs(method_tables)
    for method_tables in (gmm, kmeans, roc, tertile, mean_sd)
)

### Calculate the confidence intervals of CSF cutoffs for each method 

In [53]:
# Empty result tables, one per method: rows are the cohorts and columns are
# the biomarkers, both taken from the first GMM cutoff table.
method_names = ['GMM', 'K-Means', 'Tertile', 'ROC', 'Mean ±2 SD']
reference_table = gmm[list(gmm.keys())[0]]

# Filled later with "mean [lower, upper]" strings.
results = {
    name: pd.DataFrame(index=reference_table.index, columns=reference_table.columns)
    for name in method_names
}

# Filled later with the interval width relative to the mean, as "x %" strings.
results_percentage = {
    name: pd.DataFrame(index=reference_table.index, columns=reference_table.columns)
    for name in method_names
}

In [54]:
def _mean_and_half_width(values, confidence):
    """Return (mean, t-interval half-width) of the non-NaN entries of ``values``."""
    arr = np.asarray(values, dtype=float)
    valid = arr[~np.isnan(arr)]
    n = valid.size
    if n == 0:
        return np.float64(np.nan), np.float64(np.nan)
    m = valid.mean()
    if n < 2:
        # A single observation has no standard error.
        return m, np.float64(np.nan)
    se = stats.sem(valid)
    # Degrees of freedom must match the observations that entered the SEM;
    # counting NaN rows as well would make the interval too narrow.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h


def mean_confidence_interval(methodologies, results, confidence=0.8, percentages=None):
    """Fill ``results`` with "mean [lower, upper]" strings for every cutoff column.

    For each method, cohort and column, the mean of the non-NaN values and a
    two-sided Student-t interval (mean +/- t * SEM) are computed and written
    as formatted strings. Both output dictionaries are modified in place.

    NOTE(review): the interval is built from the standard error of the
    replicates' mean, so it narrows as more replicates are added. If a
    bootstrap percentile interval of the cutoff itself is intended, this is
    not it -- confirm with the analysis plan.

    Parameters
    ----------
    methodologies : dict
        method name -> {cohort -> DataFrame of cutoffs (one column per biomarker)}.
    results : dict
        method name -> DataFrame (cohort rows, biomarker columns) receiving
        ``"m [m-h, m+h]"`` rounded to one decimal.
    confidence : float, default 0.8
        Two-sided confidence level.
    percentages : dict, optional
        method name -> DataFrame receiving the interval width relative to the
        mean as ``"x %"``. When omitted, the notebook-level
        ``results_percentage`` dictionary is used, as before.

    Returns
    -------
    None
    """
    if percentages is None:
        # Backward compatible default: the table defined at notebook level.
        percentages = results_percentage

    for method, cohort_tables in methodologies.items():
        for cohort, table in cohort_tables.items():
            for col in table.columns:
                m, h = _mean_and_half_width(table[col], confidence)
                results[method].loc[cohort, col] = f'{round(m, 1)} [{round(m-h, 1)}, {round(m+h, 1)}]'
                percentages[method].loc[cohort, col] = f'{round((((m+h) - (m-h))/m) * 100, 2)} %'

In [55]:
# Method label -> cohort-wise cutoff tables; the labels match the keys of `results`.
methodologies = {
    'GMM': gmm_new,
    'K-Means': kmeans_new,
    'ROC': roc_new,
    'Tertile': tertile_new,
    'Mean ±2 SD': mean_sd_new,
}

In [56]:
# Fills `results` (and `results_percentage`) in place using the default confidence level.
mean_confidence_interval(methodologies=methodologies, results=results)

In [57]:
# Write the combined table (method / biomarker column levels) to disk.
(
    pd.concat(results, axis="columns")
    .to_csv(path_or_buf="../results/bootstrap/confidence_intervals_80.csv")
)

In [58]:
# Collapse the per-method dictionary into one wide table whose column levels
# are (method, biomarker). The guard makes the cell idempotent: once `results`
# is already a DataFrame, re-running would otherwise raise a TypeError in
# pd.concat.
if isinstance(results, dict):
    results = pd.concat(results, axis=1)

In [76]:
# for i,j in enumerate(results.columns): 
#     print(j[1].split(" in CSF")[0] + '_' + j[0])

In [59]:
# Show the table with a dash wherever no cutoff is available.
results.replace(to_replace={np.nan: "-"})

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,"986.9 [986.3, 987.5]","34.7 [34.6, 34.7]","356.9 [356.3, 357.4]","972.6 [971.9, 973.2]","28.1 [28.1, 28.1]","288.2 [288.0, 288.4]","833.3 [831.9, 834.6]","21.8 [21.8, 21.9]","240.1 [239.7, 240.4]","819.4 [817.1, 821.8]","23.6 [23.5, 23.6]","259.0 [258.3, 259.7]","283.8 [282.5, 285.1]","40.7 [40.7, 40.8]","408.5 [407.9, 409.1]"
EPAD,"1035.7 [1035.2, 1036.1]","27.8 [27.8, 27.8]","307.0 [306.8, 307.3]","1033.3 [1032.9, 1033.8]","19.8 [19.8, 19.9]","223.7 [223.6, 223.9]","884.5 [883.9, 885.1]","19.9 [19.8, 19.9]","228.2 [228.0, 228.4]","684.7 [682.0, 687.5]","23.9 [23.7, 24.2]","267.9 [264.9, 270.8]","339.0 [338.4, 339.6]","43.3 [43.2, 43.3]","435.9 [435.5, 436.4]"
AIBL,"749.0 [745.2, 752.8]","87.8 [87.1, 88.5]","604.5 [599.6, 609.4]","684.0 [682.2, 685.8]","77.7 [77.2, 78.1]","530.6 [525.9, 535.4]",-,-,-,-,-,-,-,-,-
ARWIBO,"589.2 [586.4, 592.0]","155.0 [150.6, 159.5]","531.6 [528.4, 534.7]","523.8 [522.4, 525.2]","69.1 [69.0, 69.3]","432.1 [429.9, 434.2]",-,-,-,-,-,-,-,-,-
EDSD,"782.2 [778.2, 786.2]","119.2 [118.2, 120.2]","584.8 [579.2, 590.5]","741.8 [738.1, 745.5]","86.7 [86.3, 87.2]","547.6 [544.6, 550.6]",-,-,-,-,-,-,-,-,-
PREVENT-AD,"1187.4 [1168.6, 1206.2]","71.1 [70.6, 71.7]","466.5 [460.4, 472.6]","1146.4 [1144.9, 1147.9]","50.8 [50.6, 50.9]","308.8 [307.4, 310.2]","1098.7 [1097.5, 1100.0]","53.0 [52.9, 53.1]","303.3 [302.8, 303.8]",-,-,-,"593.6 [591.1, 596.2]","89.2 [89.0, 89.5]","593.3 [590.8, 595.8]"
PharmaCog,"792.9 [791.8, 794.1]","93.0 [92.8, 93.2]","773.4 [758.6, 788.2]","762.2 [760.4, 763.9]","68.6 [68.2, 69.0]","466.9 [465.4, 468.4]",-,-,-,-,-,-,-,-,-
NACC_ELISA,"594.1 [593.1, 595.1]","81.7 [81.1, 82.2]","564.6 [556.7, 572.6]","586.1 [585.3, 586.8]","61.6 [61.5, 61.8]","492.1 [490.8, 493.3]","625.5 [624.3, 626.8]","48.3 [48.2, 48.3]","371.3 [370.2, 372.4]","451.5 [449.3, 453.6]","61.2 [61.0, 61.4]","500.6 [497.6, 503.6]","294.4 [293.2, 295.6]","87.3 [87.0, 87.6]","725.3 [723.1, 727.5]"
EMIF_ELISA,"742.1 [741.5, 742.7]","82.1 [82.0, 82.2]","526.6 [525.0, 528.1]","697.7 [696.1, 699.3]","64.9 [64.8, 64.9]","432.5 [432.0, 433.1]","541.0 [540.2, 541.8]","52.3 [52.2, 52.4]","268.8 [268.3, 269.3]","580.2 [577.3, 583.0]","62.6 [62.3, 62.9]","335.7 [334.4, 337.0]","175.2 [174.2, 176.3]","94.8 [94.5, 95.1]","653.8 [648.3, 659.2]"
NACC_XMAP,"298.7 [297.6, 299.8]","60.8 [60.2, 61.4]","94.5 [94.1, 95.0]","290.7 [289.2, 292.2]","40.1 [40.1, 40.2]","70.2 [70.1, 70.4]","248.5 [248.0, 249.0]","36.5 [36.4, 36.5]","54.0 [53.9, 54.1]","225.3 [225.0, 225.7]","40.3 [40.1, 40.4]","62.7 [62.3, 63.1]","23.6 [22.9, 24.2]","68.9 [68.6, 69.1]","99.3 [99.0, 99.6]"


In [60]:
# Relative interval widths for all methods side by side.
pd.concat(results_percentage, axis="columns")

Unnamed: 0_level_0,GMM,GMM,GMM,K-Means,K-Means,K-Means,Tertile,Tertile,Tertile,ROC,ROC,ROC,Mean ±2 SD,Mean ±2 SD,Mean ±2 SD
Unnamed: 0_level_1,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.12 %,0.42 %,0.31 %,0.13 %,0.17 %,0.17 %,0.32 %,0.33 %,0.31 %,0.58 %,0.56 %,0.5 %,0.92 %,0.35 %,0.29 %
EPAD,0.09 %,0.22 %,0.17 %,0.09 %,0.14 %,0.13 %,0.14 %,0.18 %,0.18 %,0.8 %,1.97 %,2.22 %,0.35 %,0.28 %,0.22 %
AIBL,1.01 %,1.56 %,1.62 %,0.53 %,1.18 %,1.8 %,,,,,,,,,
ARWIBO,0.94 %,5.78 %,1.2 %,0.53 %,0.47 %,1.0 %,,,,,,,,,
EDSD,1.02 %,1.7 %,1.93 %,1.0 %,1.05 %,1.1 %,,,,,,,,,
PREVENT-AD,3.17 %,1.61 %,2.61 %,0.27 %,0.59 %,0.91 %,0.23 %,0.34 %,0.33 %,,,,0.87 %,0.57 %,0.84 %
PharmaCog,0.3 %,0.4 %,3.82 %,0.45 %,1.12 %,0.64 %,,,,,,,,,
NACC_ELISA,0.34 %,1.3 %,2.83 %,0.26 %,0.45 %,0.5 %,0.41 %,0.36 %,0.59 %,0.96 %,0.76 %,1.19 %,0.8 %,0.73 %,0.61 %
EMIF_ELISA,0.16 %,0.3 %,0.59 %,0.46 %,0.18 %,0.25 %,0.3 %,0.26 %,0.37 %,0.99 %,0.98 %,0.8 %,1.17 %,0.7 %,1.67 %
NACC_XMAP,0.75 %,1.96 %,1.01 %,1.02 %,0.42 %,0.42 %,0.41 %,0.25 %,0.4 %,0.31 %,0.6 %,1.18 %,5.67 %,0.78 %,0.63 %


In [61]:
# Relative interval widths for the GMM method only (cohort rows, biomarker columns).
results_percentage['GMM']

Unnamed: 0,A-beta 1-42 in CSF,pTau in CSF,tTau in CSF
ADNI,0.12 %,0.42 %,0.31 %
EPAD,0.09 %,0.22 %,0.17 %
AIBL,1.01 %,1.56 %,1.62 %
ARWIBO,0.94 %,5.78 %,1.2 %
EDSD,1.02 %,1.7 %,1.93 %
PREVENT-AD,3.17 %,1.61 %,2.61 %
PharmaCog,0.3 %,0.4 %,3.82 %
NACC_ELISA,0.34 %,1.3 %,2.83 %
EMIF_ELISA,0.16 %,0.3 %,0.59 %
NACC_XMAP,0.75 %,1.96 %,1.01 %


In [13]:
# Rounding check for the tables above: 596.25 is exactly representable in
# binary floating point, and the built-in round() resolves exact ties towards
# the even digit, so this gives 596.2 rather than 596.3.
round(596.25, 1)

596.2

In [14]:
# NOTE(review): numpy is already imported in the first cell; this repeat is
# presumably left over from running the scratch cell in a fresh kernel
# (execution count 14) -- TODO remove together with the scratch cells.
import numpy as np
# np.round also rounds exact ties to the nearest even value, hence 596.2 here too.
np.round(596.25, 1)

596.2