## Imports 

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
from statsmodels.distributions.empirical_distribution import ECDF
import os
import glob
import copy

## Opening the CSV files 

In [2]:
# Collect the CSV paths once and derive both the frames and the cohort names
# from that single list, so the two sequences are always aligned.
# (Previously the names came from os.listdir(), which also lists non-CSV files,
# and the extension was removed with str.strip(".csv"), which strips the
# characters '.', 'c', 's', 'v' from BOTH ends instead of removing the suffix.)
csv_paths = sorted(glob.glob('../preprocessed_datasets' + "/*." + 'csv'))
dataframes = [pd.read_csv(path, sep=',', index_col=0) for path in csv_paths]
cohorts = [os.path.splitext(os.path.basename(path))[0] for path in csv_paths]

In [3]:
# Keep only the rows with Visit == 1 (the "BL" visit) whose diagnosis is 'CU'.
all_cohorts = {
    name: df.loc[(df["Visit"] == 1) & (df["Diagnosis"].astype(str) == 'CU')]
    for name, df in zip(cohorts, dataframes)
}

# The control set is built with exactly the same filter, as separate
# DataFrame objects (one fresh selection per cohort).
all_cohorts_ctl = {
    name: df.loc[(df["Visit"] == 1) & (df["Diagnosis"].astype(str) == 'CU')]
    for name, df in zip(cohorts, dataframes)
}

## Functions to perform essential calculations 

In [4]:
def cat_stat_df(dfs, result):
    """Fill ``result`` with per-cohort counts and percentages of the categorical features.

    Parameters
    ----------
    dfs : dict
        Maps a cohort name to a DataFrame that may contain the columns
        ``'Sex'``, ``'Diagnosis'`` and ``'APOE4'``.
    result : pandas.DataFrame
        Table with one row per cohort. It is modified in place (cells are set
        and the columns ``'Sex'`` / ``'APOE4'`` are renamed) and also returned.

    Values written for every non-empty cohort
    -----------------------------------------
    * ``'APOE4'``: percentage (1 decimal) of the non-missing ``APOE4`` values
      that equal ``1.0`` or ``2.0``; NaN when the cohort has no APOE4 data.
    * ``'Sex'``: percentage (1 decimal) of the non-missing ``Sex`` values that
      equal ``'Female'``; NaN when the cohort has no Sex data.
    * ``'CU'``, ``'MCI'``, ``'AD'``: number of rows with that diagnosis
      (0 when the diagnosis does not occur).
    * ``'n'``: number of rows with a non-missing diagnosis. If no diagnosis is
      recorded at all, ``'Diagnosis'`` is set to NaN and ``'n'`` falls back to
      the number of rows.

    ``'Total'`` (number of rows) is written for every cohort, including empty ones.

    Returns
    -------
    pandas.DataFrame
        ``result`` with ``'Sex'`` renamed to ``'Female %'`` and ``'APOE4'``
        renamed to ``'APOE4 %'``.
    """

    categorical = {'APOE4': [2.0, 1.0], 'Sex': ['Female'], 'Diagnosis': ['CU', 'MCI', 'AD']}

    for cohort, df in dfs.items():

        if not df.empty:
            # One Counter of the non-missing values per feature; a feature whose
            # column is absent is treated like a feature without any data.
            counts = {
                col: Counter(df[col].dropna()) if col in df.columns else Counter()
                for col in categorical
            }

            for feature, levels in categorical.items():
                feature_counts = counts[feature]

                if feature == 'Diagnosis':
                    if feature_counts:
                        for level in levels:
                            result.loc[cohort, level] = int(feature_counts.get(level, 0))
                        # n counts every non-missing diagnosis, not only CU/MCI/AD
                        result.loc[cohort, 'n'] = int(sum(feature_counts.values()))
                    else:
                        # kept from the original implementation: flag the missing
                        # diagnosis and fall back to the raw number of rows
                        result.loc[cohort, feature] = np.nan
                        result.loc[cohort, 'n'] = int(len(df.index))

                elif feature_counts:
                    # Counter.get(..., 0) keeps the percentage well defined when a
                    # level does not occur (e.g. nobody with APOE4 == 2.0); summing
                    # a NaN placeholder would turn the whole percentage into NaN.
                    hits = sum(feature_counts.get(level, 0) for level in levels)
                    result.loc[cohort, feature] = round(100 * hits / sum(feature_counts.values()), 1)

                else:
                    # no data for this feature: unknown, not 0 %
                    result.loc[cohort, feature] = np.nan

        result.loc[cohort, 'Total'] = int(len(df.index))

    result.rename(columns={"Sex": "Female %", "APOE4": "APOE4 %"}, inplace=True)

    return result

In [5]:
def num_quantiles(dfs, dfs_ctl, result):
    """Write quartile summaries of the numeric features of every cohort into ``result``.

    Parameters
    ----------
    dfs : dict
        Maps a cohort name to the DataFrame to summarise.
    dfs_ctl : dict
        Maps a name to the DataFrame providing the control distribution.
        ``dfs`` and ``dfs_ctl`` are paired by iteration order, not by key.
    result : pandas.DataFrame
        Table with one row per cohort. It is modified in place (cells are set
        and the columns ``'Ttau'`` / ``'Ptau'`` are renamed) and also returned.

    Summary written per cohort and column
    -------------------------------------
    * more than two unique index labels: the string ``"q25, q50, q75"``
      (1 decimal for CDR and CDRSB, integers otherwise), or NaN when the
      column holds no data;
    * exactly two unique index labels: the first two values as ``"a, b"``;
    * otherwise: the single first value.

    For the biomarker columns, when the control frame has more than two unique
    index labels and data for that column, the three quartiles are evaluated
    under the ECDF of the control values and appended as ``" (p25, p50, p75)"``
    (percentages). Nothing is appended when no control percentiles are available.

    Returns
    -------
    pandas.DataFrame
        ``result`` with ``'Ttau'`` renamed to ``'tTau'`` and ``'Ptau'`` to ``'pTau'``.
    """

    column_num = ['Age', 'CDR', 'Education', 'MMSE', 'CDRSB', 'Hippocampus', 'A-beta', 'Ttau', 'Ptau']
    non_int_cols = ["CDR", "CDRSB"]
    biomarker = ['Hippocampus', 'A-beta', 'Ttau', 'Ptau']
    quartiles = [.25, .5, .75]

    for cohort, ctl_cohort in zip(dfs, dfs_ctl):

        dfn = dfs[cohort]
        dfn_ctl = dfs_ctl[ctl_cohort]
        calc_dict = dict()
        calc_dict_ctl = dict()
        # presumably one index label per participant -- TODO confirm
        n_labels = len(dfn.index.unique())

        for col in column_num:

            if dfn.empty or col not in dfn.columns:
                continue

            if n_labels > 2:

                # return nan if no data
                if pd.isnull(dfn[col].quantile()):
                    calc_dict[col] = np.nan
                    continue

                if col in non_int_cols:
                    quants = [round(dfn[col].quantile(q), 1) for q in quartiles]
                else:
                    quants = [int(round(dfn[col].quantile(q), 0)) for q in quartiles]

                calc_dict[col] = ', '.join(str(q) for q in quants)

            elif n_labels == 2:
                first, second = list(dfn[col].values)[:2]

                if col == 'Age':
                    # int() cannot convert NaN, so a missing age is printed as 'nan'
                    calc_dict[col] = ', '.join(
                        str(int(v)) if pd.notnull(v) else 'nan' for v in (first, second)
                    )
                else:
                    calc_dict[col] = str(round(first, 1)) + ', ' + str(round(second, 1))

            else:
                value = dfn[col].iloc[0]

                if col == 'Age':
                    calc_dict[col] = int(value) if pd.notnull(value) else np.nan
                else:
                    calc_dict[col] = round(value, 1)

        if len(dfn_ctl.index.unique()) > 2:

            for col in biomarker:

                # the column must exist on both sides before it is indexed
                if dfn_ctl.empty or col not in dfn.columns or col not in dfn_ctl.columns:
                    calc_dict_ctl[col] = np.nan
                    continue

                ctl_dat = dfn_ctl[col].dropna()

                # return nan if no data or no control distribution to compare to
                # (.empty rather than .any(): an all-zero distribution is still data)
                if pd.isnull(dfn[col].quantile()) or ctl_dat.empty:
                    calc_dict_ctl[col] = np.nan
                    continue

                ecdf = ECDF(ctl_dat)
                percentiles = [int(round(ecdf(dfn[col].quantile(q)) * 100)) for q in quartiles]
                calc_dict_ctl[col] = ', '.join(str(p) for p in percentiles)

        # append the control percentiles to the biomarker summaries
        for marker in biomarker:

            if marker in calc_dict and marker in calc_dict_ctl:
                summary = calc_dict[marker]
                percentiles = calc_dict_ctl[marker]

                # str() because the summary of a single-row cohort is a number
                if pd.notnull(summary) and pd.notnull(percentiles):
                    calc_dict[marker] = str(summary) + " (" + str(percentiles) + ")"

        for col, value in calc_dict.items():
            result.loc[cohort, col] = value

    result.rename(columns={"Ttau": "tTau", "Ptau": "pTau"}, inplace=True)

    return result

## Make an empty dataframe to fill in with the results

In [6]:
# Start from an all-NaN table: one row per cohort, columns taken from the AIBL frame.
template_columns = list(all_cohorts['AIBL'].columns)
results = pd.DataFrame(index=all_cohorts.keys(), columns=template_columns)
results.index.name = 'Name of Dataset'

# Extra columns for the diagnosis counts and the cohort size.
for count_col in ['CU', 'MCI', 'AD', 'Total']:
    results[count_col] = np.nan

# These two columns are not reported as such in the summary.
results = results.drop(columns=['Diagnosis', 'Visit'])

# Fill in the categorical statistics first, then the numeric quantiles.
results = num_quantiles(all_cohorts, all_cohorts_ctl, cat_stat_df(all_cohorts, results))

## Final table 

In [7]:
# Put the summary columns into the order used for the final table.
column_order = [
    "n", "Total", "CU", "MCI", "AD", "Female %", "Age", "Education", "APOE4 %",
    "MMSE", "CDR", "CDRSB", "Hippocampus", "A-beta", "tTau", "pTau",
]
results = results[column_order]

In [8]:
# Display the assembled summary table (one row per cohort).
results

Unnamed: 0_level_0,n,Total,CU,MCI,AD,Female %,Age,Education,APOE4 %,MMSE,CDR,CDRSB,Hippocampus,A-beta,tTau,pTau
Name of Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A4,6945.0,6945.0,6945.0,0.0,0.0,57.7,"68, 71, 75","14, 16, 18",34.3,"28, 29, 30","0.0, 0.0, 0.0","0.0, 0.0, 0.0","6, 7, 7 (25, 50, 75)",,,
ABVIB,,0.0,,,,,,,,,,,,,,
ADNI,813.0,813.0,813.0,0.0,0.0,55.7,"68, 72, 77","15, 16, 18",30.3,"29, 29, 30","0.0, 0.0, 0.0","0.0, 0.0, 0.0","6838, 7409, 7909 (25, 50, 75)","821, 1274, 1700 (25, 50, 100)","176, 215, 289 (25, 50, 75)","15, 19, 26 (25, 50, 75)"
AIBL,803.0,803.0,803.0,0.0,0.0,57.4,"65, 70, 76","10, 12, 15",27.8,"28, 29, 30","0.0, 0.0, 0.0","0.0, 0.0, 0.0","3, 3, 3 (26, 50, 75)",,,
ANM,793.0,793.0,793.0,0.0,0.0,59.4,"71, 76, 78","10, 12, 16",25.3,"28, 29, 30","0.0, 0.0, 0.0","0.0, 0.0, 0.0","6481, 7076, 7671 (25, 50, 75)",,,
ARWIBO,1476.0,1476.0,1476.0,0.0,0.0,60.8,"40, 52, 64","8, 10, 13",19.3,"28, 29, 30","0.0, 0.0, 0.0",,"7278, 7925, 8592 (25, 50, 75)","631, 631, 631 (100, 100, 100)","556, 556, 556 (100, 100, 100)","95, 95, 95 (100, 100, 100)"
DOD-ADNI,181.0,181.0,181.0,0.0,0.0,1.1,"66, 68, 71","13, 15, 17",27.6,"28, 29, 30","0.0, 0.0, 0.0","0.0, 0.0, 0.5","7162, 7795, 8502 (25, 50, 75)","795, 1204, 1506 (25, 50, 75)","154, 196, 252 (25, 50, 75)","13, 17, 22 (25, 50, 75)"
EDSD,183.0,183.0,183.0,0.0,0.0,51.9,"66, 69, 72","11, 13, 16",32.4,"28, 29, 30",,,"7076, 7689, 8418 (25, 50, 75)",,,
EMIF,366.0,366.0,366.0,0.0,0.0,44.3,"60, 65, 70","11, 13, 16",,"29, 29, 30","0.0, 0.0, 0.0",,"7163, 7707, 8268 (25, 50, 75)","476, 595, 858 (25, 50, 75)","126, 195, 288 (25, 50, 75)","32, 41, 54 (26, 50, 75)"
EPAD,2071.0,2071.0,2071.0,0.0,0.0,56.2,"60, 66, 71","12, 15, 17",39.4,"28, 29, 30","0.0, 0.0, 0.5","0.0, 0.0, 0.5","4378, 4784, 5169 (25, 50, 75)","866, 1274, 1700 (25, 50, 100)","163, 202, 262 (25, 50, 75)","13, 17, 23 (25, 50, 75)"


### Outputs

In [9]:
# Export only the descriptive statistics; the count columns
# (n, Total, CU, MCI, AD) are left out of the CSV.
exported_columns = [
    "Female %", "Age", "Education", "APOE4 %", "MMSE", "CDR", "CDRSB",
    "Hippocampus", "A-beta", "tTau", "pTau",
]
results[exported_columns].to_csv("../adata_resources/CTL_summary_stats.csv")

In [10]:
# Sum of the per-cohort "n" values over all rows of the summary table.
print("N all cohorts: ", results["n"].sum())

N all cohorts:  33549.0
