## Imports 

In [1]:
from statistics import mean
import numpy as np
import pandas as pd
import math
import os
from collections import Counter
from functools import reduce
import glob
import copy

## Opening the CSV files 

In [2]:
dataframes = [pd.read_csv(file, sep=',', index_col=0) for file in sorted(glob.glob('../preprocessed_datasets' + "/*."+'csv'))]
cohorts = [file.strip(".csv") for file in sorted(os.listdir('../preprocessed_datasets'))]

In [3]:
# reduce to BL visit only
all_cohorts = dict()
for name, df in zip(cohorts, dataframes):
    all_cohorts[name] = df.loc[df["Visit"] == 1]

## Functions to perform essential calculations 

In [4]:
def cat_stat_df(dfs, result):
    """Counting different categories, calculate the % of categorical features, store results in a df"""
    
    categorical = {'APOE4': [2.0, 1.0], 'Sex': ['Female'], 'Diagnosis': ['CU', 'MCI', 'AD']}
    column_cat = ['Sex', 'Diagnosis', 'APOE4']

    for cohort in dfs:
        
        if dfs[cohort].empty==True:
            continue
        
        else:
            calc_dict = dict()
            df = dfs[cohort]

            for col in column_cat:
                ca = Counter(df[col].dropna())
                calc_dict[col] = ca

            cohort_df = pd.DataFrame(calc_dict).transpose()
            cohort_df = cohort_df.dropna(how='all')
            cohort_df.loc[cohort] = cohort_df.sum()
           
            for i in categorical:
                
                if i == 'Diagnosis':
                    
                    if i in cohort_df.index: 
                        result.loc[cohort, categorical[i]] = cohort_df.loc[cohort, cohort_df.loc[i].notna()].astype(int)
                        result.loc[cohort, categorical[i]] = result.loc[cohort, categorical[i]].replace({np.nan: 0})
                        result.loc[cohort, 'n'] = int(sum(cohort_df.loc[cohort, cohort_df.loc[i].notna()]))
                        result.loc[cohort, 'Total'] = int(len(dfs[cohort].index))
                    
                    else:
                        result.loc[cohort, i] = np.nan
                        result.loc[cohort, 'n'] = int(len(dfs[cohort].index))
                
                elif i == 'APOE4':
                    
                    if 'APOE4' in list(cohort_df.index.astype(str)):
                        
                        if '2.0' not in list(cohort_df.columns.astype(str)) and '2' not in list(cohort_df.columns.astype(str)):
                            cohort_df[2.0] = np.nan
                        
                        result.loc[cohort, i] = round(100 * sum([val for val in cohort_df.loc[i, categorical[i]]]) / 
                                                     sum([val for val in cohort_df.loc[i].dropna()]), 1)
                    
                    else:
                        result.loc[cohort, i] = np.nan
                
                elif i == 'Sex':
                    
                    if (i in cohort_df.index) & ("Female" in cohort_df.columns):
                        result.loc[cohort, i] = round(100 * sum([val for val in cohort_df.loc[i, categorical[i]]]) 
                                                      / sum([val for val in cohort_df.loc[i].dropna()]), 1)
                    else:
                        result.loc[cohort, i] = 0
    
                    
    result.rename(columns={"Sex": "Female %", "APOE4": "APOE4 %"}, inplace=True)
              
    return result

In [5]:
def num_stat_df(dfs, result_df):
    """Calculating std and mean and storing it in the result dataframe"""
    
    column_names = ['Age', 'CDR', 'Education', 'MMSE', 'CDRSB', 'Hippocampus', 'A-beta', 'Ttau', 'Ptau']
    
    for df in dfs:
        dataset = dfs[df]
        calc_dict = dict()
        
        for col in column_names:
            
            if (col in dataset.columns) and (dataset[col].notna().any()):
                df_std = round(np.nanstd(dataset[col]), 1)
                df_mean = round(np.nanmean(dataset[col]), 1)
                dict_value = str(df_mean) + ' (' + str(df_std) + ')'
                calc_dict[col] = dict_value
                
            else:
                calc_dict[col] = np.nan
   
        for key in calc_dict:
            result_df.loc[df, key] = calc_dict[key]
        
    return result_df

## Make an empty dataframe to fill in with the results

In [6]:
results = pd.DataFrame(index = all_cohorts.keys(), columns = [col for col in all_cohorts['AIBL'].columns])
results.index.name = 'Name of Dataset'

for i in ['CU', 'MCI', 'AD', 'Total']:
    results[i] = np.nan

cat_stat_df(all_cohorts, results)
num_stat_df(all_cohorts, results)

results.drop(columns=['Diagnosis', 'Visit', 'Race', 'Months'], inplace=True)
results

Unnamed: 0_level_0,Age,Female %,Education,APOE4 %,CDR,MMSE,CDRSB,Hippocampus,Ttau,Ptau,A-beta,CU,MCI,AD,Total,n
Name of Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A4,71.6 (4.9),57.7,16.4 (3.0),34.3,0.0 (0.1),28.6 (1.6),0.1 (0.4),6.7 (0.8),,,,6945.0,0.0,0.0,6945.0,6945.0
ABVIB,77.9 (6.4),42.9,17.1 (9.4),,,,,,,,,,,,,280.0
ADNI,73.2 (7.4),47.0,16.0 (2.8),45.6,0.4 (0.3),27.4 (2.7),1.5 (1.8),6790.1 (1184.9),287.0 (132.7),27.6 (14.6),979.9 (457.1),813.0,1016.0,389.0,2249.0,2218.0
AIBL,73.2 (7.8),57.9,12.3 (3.0),36.0,0.3 (0.5),26.6 (4.6),1.3 (2.6),2.8 (0.4),438.8 (276.1),68.5 (30.5),633.0 (241.8),803.0,134.0,181.0,1378.0,1118.0
ANM,76.4 (6.9),59.3,10.9 (4.5),38.8,0.5 (0.6),25.8 (5.0),2.5 (3.6),6233.6 (1237.1),,,,793.0,397.0,512.0,1703.0,1702.0
ARWIBO,60.4 (16.4),60.8,9.3 (4.7),29.3,0.3 (0.5),26.0 (4.5),,7227.0 (1442.1),459.1 (300.8),75.9 (49.9),504.0 (235.0),1476.0,208.0,281.0,2617.0,1965.0
DOD-ADNI,69.5 (4.6),0.9,15.2 (2.4),26.6,0.2 (0.2),28.4 (1.6),0.4 (0.7),7792.4 (935.3),216.3 (82.3),19.1 (8.3),1219.4 (498.8),181.0,27.0,0.0,458.0,208.0
EDSD,70.7 (7.2),52.1,11.9 (3.6),47.0,,25.5 (4.7),,6802.5 (1400.0),450.2 (275.6),80.8 (40.3),665.1 (345.9),183.0,140.0,151.0,474.0,474.0
EMIF,68.0 (8.3),45.9,11.7 (4.1),,0.4 (0.3),26.5 (3.7),,7094.2 (1169.6),377.5 (327.5),60.6 (34.7),586.4 (281.9),366.0,526.0,201.0,1199.0,1093.0
EPAD,65.9 (7.5),56.1,14.4 (3.7),39.4,0.1 (0.2),28.4 (1.9),0.4 (0.8),4729.8 (774.0),225.8 (99.5),19.8 (10.6),1216.2 (429.4),2071.0,0.0,14.0,2096.0,2085.0


## Final table 

In [7]:
results[['n', 'Total', 'CU', 'MCI', 'AD', 'Female %', 'Age', 'Education', 'MMSE', 'CDR', 'CDRSB', 'APOE4 %', 'Hippocampus']]

Unnamed: 0_level_0,n,Total,CU,MCI,AD,Female %,Age,Education,MMSE,CDR,CDRSB,APOE4 %,Hippocampus
Name of Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A4,6945.0,6945.0,6945.0,0.0,0.0,57.7,71.6 (4.9),16.4 (3.0),28.6 (1.6),0.0 (0.1),0.1 (0.4),34.3,6.7 (0.8)
ABVIB,280.0,,,,,42.9,77.9 (6.4),17.1 (9.4),,,,,
ADNI,2218.0,2249.0,813.0,1016.0,389.0,47.0,73.2 (7.4),16.0 (2.8),27.4 (2.7),0.4 (0.3),1.5 (1.8),45.6,6790.1 (1184.9)
AIBL,1118.0,1378.0,803.0,134.0,181.0,57.9,73.2 (7.8),12.3 (3.0),26.6 (4.6),0.3 (0.5),1.3 (2.6),36.0,2.8 (0.4)
ANM,1702.0,1703.0,793.0,397.0,512.0,59.3,76.4 (6.9),10.9 (4.5),25.8 (5.0),0.5 (0.6),2.5 (3.6),38.8,6233.6 (1237.1)
ARWIBO,1965.0,2617.0,1476.0,208.0,281.0,60.8,60.4 (16.4),9.3 (4.7),26.0 (4.5),0.3 (0.5),,29.3,7227.0 (1442.1)
DOD-ADNI,208.0,458.0,181.0,27.0,0.0,0.9,69.5 (4.6),15.2 (2.4),28.4 (1.6),0.2 (0.2),0.4 (0.7),26.6,7792.4 (935.3)
EDSD,474.0,474.0,183.0,140.0,151.0,52.1,70.7 (7.2),11.9 (3.6),25.5 (4.7),,,47.0,6802.5 (1400.0)
EMIF,1093.0,1199.0,366.0,526.0,201.0,45.9,68.0 (8.3),11.7 (4.1),26.5 (3.7),0.4 (0.3),,,7094.2 (1169.6)
EPAD,2085.0,2096.0,2071.0,0.0,14.0,56.1,65.9 (7.5),14.4 (3.7),28.4 (1.9),0.1 (0.2),0.4 (0.8),39.4,4729.8 (774.0)
