### Imports

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import math
from sklearn import metrics

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

### Read every cohort study file

In [2]:
datasets = [pd.read_csv(file, index_col=0, low_memory=False) for file in sorted(glob.glob('../cohort_studies_full_data/' + "/*."+'csv'))]
cohorts = [file.split(".")[0] for file in sorted(os.listdir('../cohort_studies_full_data/'))]

In [3]:
# make a dictionary that contains all cohorts as a dataframe
cohort_studies = dict()
# dfsss = dict()

for cohort, dataset in zip(cohorts, datasets):
    cohort_n = cohort.split("_MERGE")[0]
    cohort_studies[cohort_n] = dataset.loc[dataset['Months']==0].copy() # reduce to BL visit
#     dfsss[cohort_n] = dataset

In [4]:
datasets_sub = [pd.read_csv(file, index_col=0, low_memory=False) for file in sorted(glob.glob('../preprocessed_datasets/' + "/*."+'csv'))]
cohorts_sub = [file.split(".")[0] for file in sorted(os.listdir('../preprocessed_datasets/'))]

In [5]:
# make a dictionary that contains all cohorts as a dataframe
cohort_studies_sub = dict()

for cohort, dataset in zip(cohorts_sub, datasets_sub):
    cohort_studies_sub[cohort] = dataset.loc[dataset['Months']==0].copy() # reduce to BL visit

In [6]:
# add the preprocessed columns from sub table of each dataset to the main table of dataset
for i in cohort_studies:
#     cols = cohort_studies_sub[i].columns.difference(cohort_studies[i].columns)
    cols = ['Age', 'Sex', 'Education', 'APOE4', 'CDR', 'Race']
    
    for col in cols:
        
        if col in cohort_studies_sub[i].columns:
            cohort_studies[i][col] = cohort_studies_sub[i][col]

### Read harmonized mapping tables

In [7]:
modality = [pd.read_csv(file, sep=',') for file in sorted(glob.glob('../feature_tables' + "/*."+'csv'))]
name = [file.split(".")[0] for file in sorted(os.listdir('../feature_tables'))]

In [8]:
# make a dictionary that contains all modalities as a dataframe
mappings = dict()

for moda, na in zip(modality, name):
    mappings[na.split(' - ')[1]] = moda

In [9]:
harmonized_features = pd.concat(mappings, ignore_index=True) # combine all tables

In [10]:
# exclude categorical and taboo features
harmonized_features = harmonized_features.loc[(harmonized_features['Rank']!=1) & (harmonized_features['Rank']!=2)]

### Read the feature availability files for all cohorts

In [11]:
ava_mapp = [pd.read_csv(file, sep='\t') for file in sorted(glob.glob('../feature_availability_in_cohorts' + "/*."+'tsv'))]
tablesss = [file.split(".")[0] for file in sorted(os.listdir('../feature_availability_in_cohorts'))]

In [12]:
# make a dictionary that contains all modalities as a dataframe
available_features = dict()

for modal, df in zip(tablesss, ava_mapp):
    available_features[modal] = df

In [13]:
existing_features = pd.concat(available_features, ignore_index=True) # combine all tables

In [14]:
existing_features.replace({0: np.nan}, inplace=True) # 0 indicates that the feature was not measured 

### Read cutoffs obtained using all methods

In [15]:
table_method = [pd.read_csv(file, index_col=0, low_memory=False) for file in sorted(glob.glob('../results/cutoffs/' + "/*."+'csv'))]
method_name = [file.split(".")[0] for file in sorted(os.listdir('../results/cutoffs/'))]

In [16]:
# make a dictionary that contains each cutoff table as a dataframe
cutoffs_ = dict()

for tm, mn in zip(table_method, method_name):
    cutoffs_[mn] = tm

In [17]:
[cutoffs_['km_cutoffs'].rename(columns={col: col.split('_')[0]}, inplace=True) for col in cutoffs_['km_cutoffs'].columns]

[None, None, None]

### Selecetion of cohort studies for A/T/N assignment

### Select the patient that have CSF biomarker, disregard the diagnostic status

In [18]:
atn = pd.DataFrame(index=available_features['csf'].iloc[:3].replace({0: np.nan}).dropna(axis=1).columns[1:].to_list(), columns=mappings['csf'].Feature.loc[0:2].to_list()+(["Total"]))
# atn = pd.DataFrame(index=cohort_studies, columns=['A', 'T', 'N'])

In [19]:
for cohort in atn.index:
    for feat in mappings['csf'][cohort].loc[0:2].dropna().to_list():
        if feat in cohort_studies[cohort].columns:
            atn.loc[cohort, mappings['csf'].loc[mappings['csf'][cohort]==feat, 'Feature']] = len(cohort_studies[cohort][feat].dropna())
            atn.loc[cohort, 'Total'] = len(cohort_studies[cohort][mappings['csf'][cohort].loc[0:2].dropna().to_list()].dropna())

In [20]:
# atn

In [21]:
diag = pd.DataFrame(index=available_features['csf'].iloc[:3].replace({0: np.nan}).dropna(axis=1).columns[1:].to_list(), columns=cohort_studies['ADNI']['Diagnosis'].dropna().unique())

In [22]:
for cohort in diag.index:
    for dia in diag.columns:
        diag.loc[cohort, dia] = len(cohort_studies[cohort].loc[cohort_studies[cohort]['Diagnosis']==dia][mappings['csf'][cohort].loc[0:2].dropna().to_list()].dropna())

### Remove the empty columns from all cohorts that we are intrested in
### Remove the participant without all 3 CSF biomarkers

In [23]:
selected_cohorts = dict()

for coh in diag.index:
    selected_cohorts[coh] = cohort_studies[coh].dropna(axis=1, how='all')

In [24]:
total_feats = dict()

# existing_features.set_index('Feature', inplace=True)

for feat in existing_features.Feature:
    total_feats[feat] = existing_features.loc[existing_features.Feature==feat][selected_cohorts].dropna(axis=1).columns

In [25]:
for cohort in atn.index:
    feat = mappings['csf'][cohort].loc[0:2].dropna().to_list()
    cohort_studies[cohort] = cohort_studies[cohort].dropna(subset=feat)

As Some features have suffix due to merging tables for certain cohorts, first investigate if all the harmonized features are in cohorts. Rename the ones that have suffix so it can be compatible to work with our harmonized names.

In [26]:
cohort_studies['ADNI'] = cohort_studies['ADNI'].rename(columns={'PTEDUCAT_x': 'PTEDUCAT', 'TRABSCOR_bl': 'TRABSCOR', 'LDELTOTAL_BL': 'LDELTOTAL'})

### CSF biomarkers, two classes, normal vs abnormal

In [27]:
# modality_ = ['clinical_i', 'clinical_ii','hippocampus', 'csf']
# modality_ = ['basal_ganglia', 'brain_poles_volume', 'cerebellum', 'clinical_i', 
#             'clinical_ii', 'csf', 'diencephalus', 'general_brain', 
#             'mri_others', 'pet', 'plasma', 'ventricles', 'hippocampus']
modality_ = ['basal_ganglia', 'brain_poles_volume', 'cerebellum', 'csf', 'diencephalus', 'general_brain', 
            'mri_others', 'pet', 'plasma', 'ventricles', 'hippocampus']

selected_feat = dict()

for i in modality_:
    selected_feat[i] = mappings[i]

In [28]:
# make a dataframe containing all the mapped features
features_all = pd.concat(selected_feat, ignore_index=True)

In [29]:
features_all = features_all[atn.index.union(['Feature', 'Rank'])] # subset the cohorts of interest

In [30]:
# remove the features that are not available in all studies
features_all = features_all.loc[features_all['Feature'].isin(existing_features[atn.index.union(['Feature'])].dropna(how='all')['Feature'].to_list())]

### Convert MRI measurements to mm3, same as ADNI and other cohorts

In [31]:
mri_nacc = ['basal_ganglia', 'brain_poles_volume', 'cerebellum', 'diencephalus', 'general_brain', 
            'mri_others', 'pet', 'plasma', 'ventricles', 'hippocampus']

for i in mri_nacc:
    
    for variable in mappings[i][['Feature', 'NACC']].dropna()['Feature'].to_list():
        
        if "Volume" in variable:
            nacc_var = mappings[i].loc[mappings[i]['Feature']==variable, 'NACC']
            cohort_studies['NACC'][nacc_var] = cohort_studies['NACC'][nacc_var] * 1000

In [32]:
nacc_mri_to_convert = list()

for i in mri_nacc:
    
    for feat_ in mappings[i]['NACC'].dropna().to_list():
        nacc_mri_to_convert.append(feat_)

In [33]:
# cohort_studies['NACC'][nacc_mri_to_convert].dropna(how='all')

* Rank 1 --> Categorical features
* Rank 2 --> Taboo features: some categorical and some numerical
* Rank nan --> Numerical features 

replace nan with 0

In [34]:
features_all['Rank'].replace({np.nan: 0}, inplace=True)

In [35]:
features_all.replace({"No total score.": np.nan}, inplace=True)

In [36]:
# fix PharmaCog column names
for i in cohort_studies['PharmaCog'].columns:
    if "\xa0" in i:
        new = str(i).replace(u'\xa0', u'')
        cohort_studies['PharmaCog'].rename(columns={i: new}, inplace=True)

## Prepare datasets

In [37]:
def select_atn_participants(dfss, cohorts, thresholds, features):
    """cohorts: list of cohort names 
       dfss: dictionary of cohorts where each key is the name of a cohort
       thresholds: cutoff values obtained using a methodology
       
       Select the features and participants and categorize the participant into ATN profiles using thresholds
       obtained from each methodology.
       
       return a df which contain the combination of paticipant from the selected cohorts while harmonizing
       the features names.
    """
    
    # make a list of additional features to be investigated 
#     additional_feat = ['Age', 'Sex', 'Education', 'APOE4', 'CDR', 'Race']
    additional_feat = ['APOE4']

    # make an empty dictionary to add the datasets to
    df_= pd.DataFrame(columns=set(features.loc[features['Rank']==0]
                                  [cohorts + ['Feature']].dropna()['Feature']).difference(['Feature'] + list(thresholds.columns)).union(additional_feat))
   
    for i in cohorts:
#         print(i)

        dfs = dict()
        dfs[i] = dfss[i].copy() # make a copy of the dataset of interest
        # select the subset of datasets with features to be investigated
        dfs[i] = dfs[i][features.loc[features['Rank']==0][cohorts + ['Feature']].dropna()[i].to_list() + additional_feat].dropna(axis=1, how='all')
        # rename all the columns so we can concat the datasets later
        [dfs[i].rename(columns={col: coln}, inplace=True) for col, coln in 
         zip(features.loc[features['Rank']==0][cohorts + ['Feature']].dropna()[i].to_list(), 
             features.loc[features['Rank']==0][cohorts + ['Feature']].dropna()['Feature'].to_list())]
        dfs[i]['Cohort'] = i # add a cohort name column
        # change the datatype to str as these are categorical features. astype doesn't work as it will include nan values
        dfs[i] = dfs[i].replace({'APOE4': {0.0: '0', 2.0: '2', 1.0: '1'}}) 

        if i!='NACC':
            
            for biomarker in thresholds.columns:
                # select the cutoff value for each biomarker for each cohort
                threshold = thresholds.loc[i][biomarker]

                # dichotomize the participants
                if biomarker == 'pTau in CSF': 
                    dfs[i].loc[dfs[i][biomarker]>threshold, "T"] = 'T+'
                    dfs[i].loc[dfs[i][biomarker]<threshold, "T"] = 'T-'

                elif biomarker == 'tTau in CSF': 
                    dfs[i].loc[dfs[i][biomarker]>threshold, "N"] = 'N+'
                    dfs[i].loc[dfs[i][biomarker]<threshold, "N"] = 'N-'

                else: 
                    dfs[i].loc[dfs[i][biomarker]<threshold, "A"] = 'A+'
                    dfs[i].loc[dfs[i][biomarker]>threshold, "A"] = 'A-'

            # join the 3 columns to make the final ATN categorie                                 
            dfs[i]['ATN'] = dfs[i]['A'] + dfs[i]['T'] + dfs[i]['N']
            # remove the columns that we are not interested in
            dfs[i] = dfs[i][dfs[i].columns.difference(['A', 'T', 'N', 'Mini-Mental State Examination (MMSE)'] + list(thresholds.columns))]
            
        else:
            
            elisa_index = cohort_studies[i].loc[(cohort_studies[i]['CSFTTMD']==1)].index
            xmap_index = cohort_studies[i].loc[(cohort_studies[i]['CSFTTMD']==2)].index
            
            for biomarker in thresholds.columns:
                # select the cutoff value for each biomarker for each cohort
                elisa = thresholds.loc[i + "_ELISA"][biomarker] # ELISA
                xmap = thresholds.loc[i + "_XMAP"][biomarker] #XMAP

                # dichotomize the participants
                if biomarker == 'pTau in CSF': 
                    
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]>elisa), "T"] = 'T+'
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]<elisa), "T"] = 'T-'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]>xmap), "T"] = 'T+'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]<xmap), "T"] = 'T-'

                elif biomarker == 'tTau in CSF': 
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]>elisa), "N"] = 'N+'
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]<elisa), "N"] = 'N-'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]>xmap), "N"] = 'N+'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]<xmap), "N"] = 'N-'

                else: 
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]<elisa), "A"] = 'A+'
                    dfs[i].loc[(dfs[i].index.isin(elisa_index)) & (dfs[i][biomarker]>elisa), "A"] = 'A-'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]<xmap), "A"] = 'A+'
                    dfs[i].loc[(dfs[i].index.isin(xmap_index)) & (dfs[i][biomarker]>xmap), "A"] = 'A-'

            # join the 3 columns to make the final ATN categorie                                 
            dfs[i]['ATN'] = dfs[i]['A'] + dfs[i]['T'] + dfs[i]['N']
            # remove the columns that we are not interested in
            dfs[i] = dfs[i][dfs[i].columns.difference(['A', 'T', 'N', 'Mini-Mental State Examination (MMSE)', 'Trail Making Test (TMT) A', 'Verbal fluency tests (Semantic) Animal'] + list(thresholds.columns))]
            
#         print(dfs[i])

        
        df_ = pd.concat([df_, dfs[i]])
#         print(df_)
#         df_ = df_.dropna(axis=1, thresh=len(df_.index)/2)
#         print(df_)
    
    df_ = df_.dropna(axis=1, how='all')
    df_ = df_.dropna(axis=1, thresh=1200)

    df_ = df_.dropna()

    return df_

In [38]:
clustering_df_km = select_atn_participants(cohort_studies, ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog'], cutoffs_['km_cutoffs'], features_all)
clustering_df_gmm = select_atn_participants(cohort_studies, ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog'], cutoffs_['gmm_cutoffs'], features_all)

#### exclude CSF Volume as it seems wrong in the NACC dataset and that could potentially add bias to our analysis

In [39]:
clustering_df_km = clustering_df_km[clustering_df_km.columns.difference(['Cerebrospinal Fluid Volume'])]
clustering_df_gmm = clustering_df_gmm[clustering_df_gmm.columns.difference(['Cerebrospinal Fluid Volume'])]

In [42]:
def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

In [43]:
def atn_based_df(df):
    """"""
    
    labels_atn = dict()
    score_atn = dict()
    
    # change the cohort names to int and write in a column
    for i, j in zip(df.Cohort.unique(), range(len(df.Cohort.unique()))):
        df.loc[df['Cohort']==i, 'Cohort_number'] = j
    
    # select the profiles that have over 50 participants
    # make a dictionary where the selcted ATN profiles are key and 
    # the subset of dataframe categorized in that ATN profile is value
    dfs_sub = {atn: df.loc[df.ATN==atn].copy() for atn in list((a) for a,b in dict(Counter(df.ATN)).items() if b >30)}
    
    for i in dfs_sub:
#         print(len(dfs_sub[i].Cohort.unique()))
        hierarchical_cluster = AgglomerativeClustering(n_clusters=len(dfs_sub[i].Cohort.unique()), affinity='euclidean', linkage='ward')
        labels = hierarchical_cluster.fit_predict(dfs_sub[i][dfs_sub[i].columns.difference(['ATN', 'Cohort'])])
        labels_atn[i] = labels
        score_atn[i] = purity_score(dfs_sub[i]['Cohort_number'], labels)
     
    return labels_atn, score_atn, dfs_sub
    

In [44]:
labels_km, score_km, atn_km_df = atn_based_df(clustering_df_km)
labels_gmm, score_gmm, atn_gmm_df = atn_based_df(clustering_df_gmm)

In [47]:
print("Cluster purity (K-means):")
score_km

Cluster purity (K-means):


{'A-T-N-': 0.43386243386243384,
 'A+T-N-': 0.46534653465346537,
 'A+T+N+': 0.5637393767705382,
 'A-T+N+': 0.4918032786885246,
 'A+T+N-': 0.5084745762711864,
 'A+T-N+': 0.5897435897435898}

In [48]:
print("Cluster purity (GMM):")
score_gmm

Cluster purity (GMM):


{'A-T-N-': 0.4213075060532688,
 'A+T-N-': 0.4943820224719101,
 'A+T+N+': 0.5387931034482759,
 'A+T+N-': 0.6060606060606061,
 'A+T-N+': 0.43283582089552236}

In [49]:
for i in labels_km:
    atn_km_df[i].loc[:, 'predicted_cohort'] = list(labels_km[i])

for i_ in labels_gmm:
    atn_gmm_df[i_].loc[:, 'predicted_cohort'] = list(labels_gmm[i_])

In [50]:
def calculate_cramer(labels_, dfs):
    """ """
    results = dict()
    
    for i in labels_:
        mat = pd.crosstab(dfs[i]['Cohort_number'], dfs[i]['predicted_cohort'])
        results[i] = round(stats.contingency.association(mat, method='cramer'), 2)
        
    return results

In [51]:
pd.DataFrame.from_dict(calculate_cramer(labels_km, atn_km_df), orient='index').transpose()

Unnamed: 0,A-T-N-,A+T-N-,A+T+N+,A-T+N+,A+T+N-,A+T-N+
0,0.16,0.2,0.26,0.3,0.35,0.4


In [52]:
pd.DataFrame.from_dict(calculate_cramer(labels_gmm, atn_gmm_df), orient='index').transpose()

Unnamed: 0,A-T-N-,A+T-N-,A+T+N+,A+T+N-,A+T-N+
0,0.19,0.18,0.26,0.66,0.3


In [53]:
def count_clusters(df):
    
    """df: dataframe with ATN categorization using certain method and containing cluster labels
       return: the number of participant within each cohort and within each ATN profile assigned to each labels"""
    
    # make an empty dictionary of dataframes to store the results
    clustering_result_ = {i: pd.DataFrame(index=df[i].Cohort.unique(), 
                                            columns=sorted(df[i].predicted_cohort.unique())) for i in df}
    
    # check the number of participants clustered to each coohort within each biomarker profile
    for profi in df:
    
        for name, numb in zip(df[profi]['Cohort'].unique(), df[profi]['Cohort_number'].unique()):
            clustering_result_[profi].loc[name] = Counter(df[profi].loc[df[profi]['Cohort']==name, 'predicted_cohort'])
            
            
    # replace all nan enteries with 0
    [clustering_result_[i].replace({np.nan: 0}, inplace=True) for i in clustering_result_]
    
    # change enteries to integer
    for i in clustering_result_:
        clustering_result_[i] = clustering_result_[i].astype(int)

    return clustering_result_

In [54]:
clustering_result_km = count_clusters(atn_km_df)
clustering_result_gmm = count_clusters(atn_gmm_df)

In [55]:
print("K-Means")
for i in clustering_result_km.keys():
    print(len(clustering_result_km[i].index), list(clustering_result_km[i].index))
    
print("GMM")
for i in clustering_result_gmm.keys():
    print(len(clustering_result_gmm[i].index), list(clustering_result_gmm[i].index))

K-Means
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
6 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'DOD-ADNI', 'PharmaCog']
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
6 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'PharmaCog']
GMM
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
7 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'DOD-ADNI', 'PharmaCog']
5 ['ADNI', 'EDSD', 'NACC', 'JADNI', 'PharmaCog']
6 ['ADNI', 'EDSD', 'ARWIBO', 'NACC', 'JADNI', 'PharmaCog']


In [58]:
clustering_result_km.keys()

dict_keys(['A-T-N-', 'A+T-N-', 'A+T+N+', 'A-T+N+', 'A+T+N-', 'A+T-N+'])

In [56]:
clustering_result_gmm.keys()

dict_keys(['A-T-N-', 'A+T-N-', 'A+T+N+', 'A+T+N-', 'A+T-N+'])

In [78]:
clustering_result_gmm['A+T-N+']

Unnamed: 0,0,1,2,3,4,5
ADNI,1,2,1,2,2,0
EDSD,0,2,0,3,2,1
ARWIBO,7,1,1,1,4,0
NACC,7,3,7,7,3,0
JADNI,1,3,0,0,1,0
PharmaCog,0,2,2,1,0,0
