### Imports

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
import seaborn as sns
import matplotlib.pyplot as plt

### Read every cohort study file

In [2]:
# Build the file list once so that every dataset is paired with the name of the very
# file it was read from. Listing the directory separately with os.listdir would also
# return non-CSV entries (e.g. hidden files), shifting the names relative to the data.
cohort_files = sorted(glob.glob('../cohort_studies_full_data/' + "/*."+'csv'))
datasets = [pd.read_csv(file, index_col=0, low_memory=False) for file in cohort_files]
# cohort name = file name up to the first "."
cohorts = [os.path.basename(file).split(".")[0] for file in cohort_files]

In [3]:
# Map every cohort name (file name without its "_MERGE" part) to a copy of the
# rows of its table where 'Months' equals 0, i.e. the baseline (BL) visit.
cohort_studies = {
    cohort.split("_MERGE")[0]: dataset.loc[dataset['Months'] == 0].copy()
    for cohort, dataset in zip(cohorts, datasets)
}

In [4]:
# Derive the names from the same file list the tables are read from; a separate
# os.listdir call would include non-CSV entries and misalign names and tables.
preprocessed_files = sorted(glob.glob('../preprocessed_datasets/' + "/*."+'csv'))
datasets_sub = [pd.read_csv(file, index_col=0, low_memory=False) for file in preprocessed_files]
# cohort name = file name up to the first "."
cohorts_sub = [os.path.basename(file).split(".")[0] for file in preprocessed_files]

In [5]:
# Map every preprocessed cohort name to a copy of its baseline rows ('Months' == 0).
cohort_studies_sub = {
    cohort: dataset.loc[dataset['Months'] == 0].copy()
    for cohort, dataset in zip(cohorts_sub, datasets_sub)
}

In [6]:
# Take the 'Age' column of each preprocessed table and store it on the full table
# of the same cohort (the assigned Series is aligned on the row index by pandas).
for cohort_name, cohort_frame in cohort_studies.items():
    cohort_frame['Age'] = cohort_studies_sub[cohort_name]['Age']

### Read harmonized mapping tables

In [7]:
# Read the tables and take their names from one shared file list, so a stray
# non-CSV entry in the folder cannot pair a table with the wrong name.
feature_table_files = sorted(glob.glob('../feature_tables' + "/*."+'csv'))
modality = [pd.read_csv(file, sep=',') for file in feature_table_files]
# table name = file name up to the first "."
name = [os.path.basename(file).split(".")[0] for file in feature_table_files]

In [8]:
# Key every mapping table by the part of its file name that follows ' - '.
mappings = {na.split(' - ')[1]: moda for moda, na in zip(modality, name)}

In [9]:
# Stack the mapping tables of all modalities into one frame; ignore_index=True
# discards the dictionary keys and renumbers the rows from 0.
harmonized_features = pd.concat(mappings, ignore_index=True) # combine all tables

In [10]:
# Drop the categorical and taboo features: keep only rows whose 'Rank' is neither 1 nor 2.
rank = harmonized_features['Rank']
harmonized_features = harmonized_features.loc[(rank != 1) & (rank != 2)]

### Read the feature availability files for all cohorts

In [11]:
# Use a single file list for both the tables and their names; os.listdir would also
# return non-TSV entries and shift the names relative to the tables.
availability_files = sorted(glob.glob('../feature_availability_in_cohorts' + "/*."+'tsv'))
ava_mapp = [pd.read_csv(file, sep='\t') for file in availability_files]
# table name = file name up to the first "."
tablesss = [os.path.basename(file).split(".")[0] for file in availability_files]

In [12]:
# Map every availability table name to its dataframe.
available_features = dict(zip(tablesss, ava_mapp))

In [13]:
# Stack the availability tables of all modalities into one frame; ignore_index=True
# discards the dictionary keys and renumbers the rows from 0.
existing_features = pd.concat(available_features, ignore_index=True) # combine all tables

In [14]:
# A 0 marks a feature that was not measured in a cohort; turn it into a missing value.
existing_features = existing_features.replace({0: np.nan})

### Selection of cohort studies for A/T/N assignment

### Select the participants that have CSF biomarkers, regardless of their diagnostic status

In [15]:
# Rows: the cohort columns that have no 0 / missing entry in the first three rows of the
# CSF availability table (the first remaining column is skipped — presumably the feature
# name column; verify against the table).
# Columns: the 'Feature' names at index labels 0..2 of the CSF mapping plus "Total".
csf_availability = available_features['csf'].iloc[:3].replace({0: np.nan})
csf_cohorts = csf_availability.dropna(axis=1).columns[1:].to_list()
csf_features = mappings['csf'].Feature.loc[0:2].to_list()
atn = pd.DataFrame(index=csf_cohorts, columns=csf_features + ["Total"])

In [16]:
# For every selected cohort, count the participants with a non-missing value for each
# mapped CSF biomarker and, in "Total", the participants without any missing value
# across all mapped CSF columns.
for cohort in atn.index:
    for feat in mappings['csf'][cohort].loc[0:2].dropna().to_list():
        if feat in cohort_studies[cohort].columns:
            # the target column is the harmonized 'Feature' name this cohort-specific column maps to
            atn.loc[cohort, mappings['csf'].loc[mappings['csf'][cohort]==feat, 'Feature']] = len(cohort_studies[cohort][feat].dropna())
            # recomputed for every feature found; the value does not depend on `feat`
            atn.loc[cohort, 'Total'] = len(cohort_studies[cohort][mappings['csf'][cohort].loc[0:2].dropna().to_list()].dropna())

In [17]:
# atn

In [18]:
# One row per cohort column without a 0 / missing entry in the first three rows of the CSF
# availability table, one column per diagnosis label observed in ADNI.
cohorts_with_csf = (
    available_features['csf']
    .iloc[:3]
    .replace({0: np.nan})
    .dropna(axis=1)
    .columns[1:]
    .to_list()
)
diagnosis_labels = cohort_studies['ADNI']['Diagnosis'].dropna().unique()
diag = pd.DataFrame(index=cohorts_with_csf, columns=diagnosis_labels)

In [19]:
# Per cohort and diagnosis: number of participants with that diagnosis and no missing
# value in any of the cohort's mapped CSF columns.
for cohort in diag.index:
    frame = cohort_studies[cohort]
    csf_columns = mappings['csf'][cohort].loc[0:2].dropna().to_list()
    for dia in diag.columns:
        with_diagnosis = frame.loc[frame['Diagnosis'] == dia]
        diag.loc[cohort, dia] = len(with_diagnosis[csf_columns].dropna())

In [20]:
# diag

### Remove the empty columns from all cohorts that we are interested in
### Remove the participants without all 3 CSF biomarkers

In [21]:
# For every cohort listed in `diag`, keep only the columns that hold at least one value.
selected_cohorts = {
    coh: cohort_studies[coh].dropna(axis=1, how='all')
    for coh in diag.index
}

In [22]:
# For every feature, collect the selected cohorts in which it is available
# (i.e. whose availability entry is not missing).
total_feats = dict()

# pandas >= 2.0 raises a TypeError when a dict is used as an indexer, so the
# cohort names are passed as an explicit list of column labels.
cohort_columns = list(selected_cohorts)

for feat in existing_features.Feature:
    feature_rows = existing_features.loc[existing_features.Feature == feat, cohort_columns]
    total_feats[feat] = feature_rows.dropna(axis=1).columns

In [23]:
# Keep only the participants that have a value for every mapped CSF biomarker.
for cohort in atn.index:
    required_csf = mappings['csf'][cohort].loc[0:2].dropna().to_list()
    cohort_studies[cohort] = cohort_studies[cohort].dropna(subset=required_csf)

Some features carry a suffix because tables were merged for certain cohorts. First check whether all harmonized features are present in the cohorts, then rename the suffixed ones so that they are compatible with our harmonized names.

In [24]:
# Strip the merge suffixes from the ADNI columns so they match the harmonized names.
adni_renames = {'PTEDUCAT_x': 'PTEDUCAT', 'TRABSCOR_bl': 'TRABSCOR'}
cohort_studies['ADNI'] = cohort_studies['ADNI'].rename(columns=adni_renames)

# Plot the CSF biomarkers in different cohorts

In [25]:
# fig, axes = plt.subplots(11, 3, sharex=False, figsize=(20, 35))
# fig.subplots_adjust(hspace=0.7)
# i=0

# for ind in atn.index:
#     csf = mappings['csf'].iloc[:3][ind].to_list()
#     colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
    
#     for bioma, colo in zip(csf, colors):
# #         print(bioma, csf.index(bioma))
#         sns.histplot(cohort_studies[ind][bioma].dropna(), ax=axes[i, csf.index(bioma)], color=colo)
# #         axes[i, csf.index(bioma)].set_title(bioma.upper())
#         axes[i, 1].set_title(ind, fontsize=16, pad=10)
    
#     i+=1
    
# # plt.savefig("csf_biomarkers.png", dpi=300)

In [26]:
# fig, axes = plt.subplots(11, 3, sharex=False, figsize=(20, 35))
# fig.subplots_adjust(hspace=0.7)
# i=0

# for ind in atn.index:
#     csf = mappings['csf'].iloc[:3][ind].to_list()
#     colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
    
#     for bioma, colo in zip(csf, colors):
# #         print(bioma, csf.index(bioma))
#         sns.scatterplot(x=cohort_studies[ind]['Age'], y=cohort_studies[ind][bioma].dropna(), ax=axes[i, csf.index(bioma)], color=colo)
# #         axes[i, csf.index(bioma)].set_title(bioma.upper())
#         axes[i, 1].set_title(ind, fontsize=16, pad=10)
    
#     i+=1
    
# # plt.savefig("csf_biomarkers.png", dpi=300)

### CSF biomarkers, two classes, normal vs abnormal

### subset each cohort dataset based on the columns of interest for clustering 

In [27]:
# Per cohort (or per assay subgroup for NACC and EMIF): the three CSF biomarker columns
# plus "Diagnosis" and "Age", restricted to rows with all three biomarkers and Age present.
# Cohorts with at most two 'CU' participants are left out.
cohorts_csf = dict()

for cohort in atn.index:
    frame = cohort_studies[cohort]
    n_cu = len(frame.loc[frame['Diagnosis'] == 'CU'].index)
    if n_cu <= 2:
        continue

    csf = mappings['csf'].iloc[:3][cohort].to_list()

    if cohort == 'NACC':
        # 'CSFTTMD' value 1 -> ELISA, value 2 -> xmap
        parts = [
            ('NACC_ELISA', frame.loc[frame['CSFTTMD'] == 1]),
            ('NACC_XMAP', frame.loc[frame['CSFTTMD'] == 2]),
        ]
    elif cohort == 'EMIF':
        # EDAR and Leuven: xmap and not collected; every other study: INNOTEST ELISA
        in_xmap_studies = frame['Studyname'].isin(['EDAR', 'Leuven'])
        parts = [
            ('EMIF_ELISA', frame.loc[~in_xmap_studies]),
            ('EMIF_XMAP', frame.loc[in_xmap_studies]),
        ]
    else:
        parts = [(cohort, frame)]

    for key, part in parts:
        selected = part[csf + ["Diagnosis", "Age"]]
        # drop rows lacking any of the three CSF biomarkers (first three columns) or Age
        cohorts_csf[key] = selected.dropna(subset=selected.columns[:3].to_list() + ['Age'])

In [28]:
# Restrict every CSF subset to the participants whose 'Diagnosis' is 'CU'.
cohorts_cu = {
    key: frame.loc[frame['Diagnosis'] == 'CU']
    for key, frame in cohorts_csf.items()
}

In [29]:
# Show which cohorts / assay subgroups ended up with a 'CU' subset.
cohorts_cu.keys()

dict_keys(['ADNI', 'DOD-ADNI', 'EMIF_ELISA', 'EMIF_XMAP', 'EPAD', 'JADNI', 'NACC_ELISA', 'NACC_XMAP', 'PREVENT-AD'])

# Bootstrap, scale features, train model, extract cutoffs and categorize the participants

In [30]:
def tertile_cutoffs(dfs, csfs, result):
    """Fill ``result`` in place with tertile-based cutoffs for the three CSF biomarkers.

       dfs: a dictionary mapping a cohort name to its dataframe. A name may carry a
            "_<suffix>" part; only the text before the first "_" is used to find the
            cohort's column in the mapping table.
       csfs: a dictionary whose "csf" entry is the mapping dataframe; its first three rows
             give the biomarker names in 'Feature' and, in one column per cohort, the name
             of the matching column in that cohort's dataframe.
       result: dataframe where cohort names are indices and csf biomarkers are columns.
       output: nothing is returned. Each value is split into three equal-frequency bins
               (pd.qcut); for 'A-beta 1-42 in CSF' the boundary between the first and second
               bin is stored, for the other two biomarkers the boundary between the second
               and third bin, each rounded to two decimals.
    """
    csf_table = csfs['csf']
    biomarker_names = csf_table.iloc[:3]['Feature'].to_list()

    for coh_name, frame in dfs.items():
        # "NACC_ELISA" -> "NACC"; a name without "_" is used unchanged
        mapping_column = coh_name.split('_')[0]

        for biomark in biomarker_names:
            columnn = csf_table.loc[csf_table['Feature'] == biomark][mapping_column].item()
            # the bin edges are [min, first boundary, second boundary, max]; keep the inner two
            inner_edges = pd.qcut(frame[columnn], 3, retbins=True)[1][1:3]
            position = 0 if biomark == 'A-beta 1-42 in CSF' else 1
            result.loc[coh_name, biomark] = round(inner_edges[position], 2)

In [31]:
def cutoffs_from_bootstrap_dfs(all_cohorts_cu, all_cohorts, iteration_, random_state=None):
    """Bootstrap the tertile cutoffs and count the resulting A/T/N profiles per cohort.

    Each of the ``iteration_`` rounds
      1. resamples every dataframe of ``all_cohorts_cu`` with replacement (same number of
         rows) and saves the sample,
      2. computes the cutoffs on the resampled data with ``tertile_cutoffs`` and saves them,
      3. applies the cutoffs to the dataframes of ``all_cohorts``: A is 1 when the A-beta
         value is below its cutoff, T / N are 1 when pTau / tTau is above its cutoff,
      4. counts the eight A/T/N profiles per cohort, sums the ELISA and XMAP subgroups of
         NACC and EMIF, and saves the table.

    all_cohorts_cu: dictionary of dataframes the cutoffs are estimated from.
    all_cohorts: dictionary with the same keys holding the dataframes to categorize.
    iteration_: number of bootstrap rounds.
    random_state: optional integer seed (or numpy RandomState) that makes the resampling
                  reproducible. With None (default) pandas falls back to numpy's global
                  random state, as before.
    output: nothing is returned; the CSV files are written to the datasets/, cutoffs/ and
            profiles/ folders under ../results/bootstrap/tertile/ (created if missing).

    The function reads the module-level ``mappings`` and expects exactly the cohorts
    'ADNI', 'EPAD', 'PREVENT-AD', 'NACC_ELISA', 'EMIF_ELISA', 'NACC_XMAP', 'EMIF_XMAP',
    'DOD-ADNI' and 'JADNI' as keys; a missing one raises a KeyError.
    """
    biomarker_order = ['A-beta 1-42 in CSF', 'pTau in CSF', 'tTau in CSF']  # A, T, N
    cohort_order = ['ADNI', 'EPAD', 'PREVENT-AD', 'NACC_ELISA', 'EMIF_ELISA',
                    'NACC_XMAP', 'EMIF_XMAP', 'DOD-ADNI', 'JADNI']
    profile_labels = {'000': "A-T-N-", '100': 'A+T-N-', '111': 'A+T+N+', '110': 'A+T+N-',
                      '011': "A-T+N+", '101': "A+T-N+", '001': 'A-T-N+', '010': 'A-T+N-'}
    profile_order = ['A-T-N-', 'A-T+N+', 'A-T-N+', 'A-T+N-', 'A+T+N-', 'A+T-N-', 'A+T-N+', 'A+T+N+']
    reported_cohorts = ['ADNI', 'EPAD', 'PREVENT-AD', 'NACC', 'EMIF', 'DOD-ADNI', 'JADNI']

    # to_csv does not create folders, so make sure the output locations exist
    for folder in ('datasets', 'cutoffs', 'profiles'):
        os.makedirs(f"../results/bootstrap/tertile/{folder}", exist_ok=True)

    # One generator shared by all draws: passing the same integer seed to every
    # sample() call would reproduce the identical sample in each round.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    csf_map = mappings['csf']

    for boot in range(iteration_):
        bootstraped_dfs = dict()
        cutpointssss = pd.DataFrame(index=list(all_cohorts_cu), columns=csf_map.iloc[:3]['Feature'].to_list())

        # Sample from the data with replacement
        for study in all_cohorts:
            bootstraped_dfs[study] = all_cohorts_cu[study].sample(n=len(all_cohorts_cu[study].index), replace=True, random_state=rng)
            bootstraped_dfs[study].to_csv(f"../results/bootstrap/tertile/datasets/{study}_{boot}.csv") #save sampled

        tertile_cutoffs(bootstraped_dfs, mappings, cutpointssss)

        # Rearrange the columns to A T N and the rows to the reporting order
        cutpointssss = cutpointssss[biomarker_order]
        cutpointssss = cutpointssss.loc[cohort_order]
        cutpointssss.to_csv(f"../results/bootstrap/tertile/cutoffs/tertile_cutoffs_{boot}.csv")

        # Categorize every participant of the full data with this round's cutoffs
        classes = dict()
        for ind in cutpointssss.index:
            frame = all_cohorts[ind]
            mapping_column = ind.split("_")[0]  # "NACC_ELISA" -> "NACC"; unchanged without "_"
            # integer 0/1 flags from the start, so the string profile is always like '010'
            calls = pd.DataFrame(0, index=frame.index, columns=['A', 'T', 'N'])

            for col, letter in zip(cutpointssss.columns, ['A', 'T', 'N']):
                threshold = cutpointssss.loc[ind, col]
                bio = csf_map.loc[csf_map['Feature'] == col, mapping_column].item()

                if letter == 'A':
                    abnormal = frame[bio] < threshold  # low A-beta is abnormal
                else:
                    abnormal = frame[bio] > threshold  # high pTau / tTau is abnormal

                # positional assignment: every row is judged by its own value,
                # also when index labels repeat
                calls[letter] = abnormal.to_numpy().astype(int)

            calls['ATN'] = calls['A'].astype(str) + calls['T'].astype(str) + calls['N'].astype(str)
            classes[ind] = calls

        # Start from all eight profiles so that one that never occurs is reported as 0
        # instead of raising a KeyError when the columns are reordered below.
        final_profiles = pd.DataFrame(0, index=list(classes), columns=list(profile_labels))
        for ind, calls in classes.items():
            for code, count in Counter(calls['ATN']).items():
                final_profiles.loc[ind, code] = count

        final_profiles = final_profiles.rename(columns=profile_labels)[profile_order]
        final_profiles.loc['NACC'] = final_profiles.loc['NACC_ELISA'] + final_profiles.loc['NACC_XMAP']
        final_profiles.loc['EMIF'] = final_profiles.loc['EMIF_ELISA'] + final_profiles.loc['EMIF_XMAP']
        final_profiles = final_profiles.loc[reported_cohorts]
        final_profiles.to_csv(f"../results/bootstrap/tertile/profiles/final_profiles_tertile_{boot}.csv")


In [32]:
# Run 1000 bootstrap rounds: cutoffs are estimated on the resampled 'CU' subsets and applied
# to the full CSF subsets. Every round writes its resampled datasets, cutoffs and profile
# counts as CSV files under ../results/bootstrap/tertile/.
cutoffs_from_bootstrap_dfs(cohorts_cu, cohorts_csf, iteration_=1000)