### Imports

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy
from random import sample

### Opening the CSV files 

In [2]:
# Resolve the CSV paths once and derive both lists from them, so every table stays
# aligned with its name even when the folder holds non-CSV entries (hidden files,
# notebook checkpoints, ...)
feature_table_paths = sorted(glob.glob('../../feature_tables' + "/*." + 'csv'))

# one dataframe per feature table
dataframes = [pd.read_csv(path, sep=',') for path in feature_table_paths]

# file name up to the first "." for each table, in the same order as `dataframes`
modalities = [os.path.basename(path).split(".")[0] for path in feature_table_paths]

In [3]:
# Key every feature table by the part of its file name that follows " - "
all_features = {
    file_stem.split(" - ")[1]: feature_table
    for file_stem, feature_table in zip(modalities, dataframes)
}

# Remove the descriptive columns that are not used afterwards
for feature_table in all_features.values():
    feature_table.drop(columns=['CURIE', 'Definition', 'Synonyms'], inplace=True)

In [4]:
# Stack all modality tables into a single dataframe.
# "No total score." marks a test that was performed without a reported total score,
# so it is turned into a missing value; every missing cell is then filled with 0.
merged = (
    pd.concat(all_features, ignore_index=True)
    .replace({"No total score.": np.nan})
    .fillna(0)
)

In [5]:
# Rows with Rank 2 are left out of both subsets
is_rank_1 = merged.Rank == 1
is_rank_2 = merged.Rank == 2

# features that are numerical measurements: neither Rank 1 nor Rank 2
numeric_df = merged.loc[~(is_rank_1 | is_rank_2)]
# features that are categorical measurements: Rank 1
categoric_df = merged.loc[is_rank_1 & ~is_rank_2]

### Read the merged file for every cohort

In [6]:
# Resolve the CSV paths once and derive both lists from them, so every dataset stays
# aligned with its cohort name even when the folder holds non-CSV entries
cohort_file_paths = sorted(glob.glob('../../cohort_studies_full_data/' + "/*." + 'csv'))

# one dataframe per cohort file, using the first column as index
datasets = [pd.read_csv(path, index_col=0, low_memory=False) for path in cohort_file_paths]

# file name up to the first "." for each cohort file, in the same order as `datasets`
cohorts = [os.path.basename(path).split(".")[0] for path in cohort_file_paths]

In [7]:
# Build a dictionary holding one dataframe per cohort study,
# keyed by the part of the file name that precedes "_MERGE"
cohort_studies = {}

for file_stem, cohort_data in zip(cohorts, datasets):
    # keep only the baseline visit (Months == 0) of participants diagnosed as MCI
    at_baseline = cohort_data['Months'] == 0
    has_mci = cohort_data['Diagnosis'].astype(str) == 'MCI'
    cohort_studies[file_stem.split("_MERGE")[0]] = cohort_data.loc[at_baseline & has_mci].copy()

In [8]:
# Give every cohort dataframe (except JADNI) an index called 'ID'
for cohort_name, cohort_frame in cohort_studies.items():
    # JADNI is skipped entirely, so its all-NaN columns are not dropped either
    # NOTE(review): confirm that skipping the column clean-up for JADNI is intended
    if cohort_name == 'JADNI':
        continue

    # copy the current index into an 'ID' column and promote that column to the index;
    # the previous index stays available as a regular column
    cohort_frame['ID'] = cohort_frame.index
    reindexed = cohort_frame.reset_index().set_index('ID')

    # drop the columns that contain only NaN entries
    reindexed.dropna(axis=1, how='all', inplace=True)
    cohort_studies[cohort_name] = reindexed

### Function: extracting the reported values for every available feature

In [9]:
def _resolve_target_column(mapping_cell):
    """Return the cohort column name referenced by a mapping cell, or None when absent.

    A cell containing ", " lists several mapped features; the second one is used.
    Any non-string cell (the 0 placeholder, NaN, ...) means the feature is not
    available in that cohort.
    """
    if not isinstance(mapping_cell, str):
        return None
    if ", " in mapping_cell:
        return mapping_cell.split(", ")[1]
    return mapping_cell


def _matching_values(cohort_df, target):
    """Return the non-missing values of the column of `cohort_df` that matches `target`.

    Columns are scanned in order. A column whose name merely contains `target` is
    accepted only until a column named exactly `target` has been seen, and a later
    non-empty match replaces an earlier one, so the exact match wins whenever it
    holds at least one value. An empty list is returned when nothing matched.
    """
    values = []
    exact_seen = False

    for col in cohort_df.columns:
        if col == target:
            exact_seen = True
        elif exact_seen or target not in col:
            # partial matches no longer count once the 100% match was found
            continue

        observed = list(cohort_df[col].dropna())
        if observed:
            values = observed

    return values


def extract_features(df_dict, feature_df, result_dict, rng=None):
    """Collect, per feature and cohort, the shuffled measurements reported by the cohort.

    Parameters
    ----------
    df_dict : dict
        Cohort name -> dataframe with the cohort's data.
    feature_df : pandas.DataFrame
        Mapping table with a 'Feature' column and one column per cohort holding the
        name of the matching cohort column (0 when the feature is absent).
    result_dict : dict
        Filled in place: result_dict[feature][feature + "." + cohort] receives the
        shuffled list of non-missing values. Inner dictionaries are created on
        demand; nothing is written when no value is available.
    rng : random.Random, optional
        Source of randomness for the shuffle, e.g. random.Random(42) for a
        reproducible order. The module-level `sample` is used when omitted.

    Returns
    -------
    None
    """
    shuffle = sample if rng is None else rng.sample

    # only the mapping columns named after a loaded cohort are relevant
    cohort_columns = feature_df.columns.intersection(list(df_dict.keys()))

    for feature in feature_df['Feature']:
        feature_rows = feature_df.loc[feature_df['Feature'] == feature]

        for cohort in cohort_columns:
            # .item() requires the feature to appear exactly once in the mapping table
            target = _resolve_target_column(feature_rows[cohort].item())
            if target is None:
                continue

            values = _matching_values(df_dict[cohort], target)

            # when there are measurements available for the feature, shuffle and store them
            if values:
                result_dict.setdefault(feature, {})[feature + "." + cohort] = shuffle(values, len(values))

### Results

In [10]:
# Prepare the nested result dictionary: target feature -> {"<feature>.<cohort>": []}
result = {}

# cohort columns of the mapping table for which cohort data was loaded
mapped_cohorts = numeric_df.columns.intersection(cohort_studies.keys())

for feat in numeric_df.Feature:
    feature_row = numeric_df.loc[numeric_df.Feature == feat]

    # an inner key is created only for the cohorts where the feature is mapped (cell differs from 0)
    result[feat] = {
        feat + "." + cohort: []
        for cohort in mapped_cohorts
        if feature_row[cohort].item() != 0
    }

# fill the prepared dictionary with the values used for the boxplots
extract_features(cohort_studies, numeric_df, result)

### Save the results into tsv files

In [11]:
# Feature/cohort series that cannot be plotted as numerical distributions
excluded_series = [
    ('Age', 'Age.AIBL'),            # AIBL did not report the age of the participants, only the date of birth
    ('Age', 'Age.ABVIB'),           # only month and year of birth were reported
    # Certain measurements were collected as values in some cohorts and as categories in others;
    # the categorical ones are removed as they cannot be plotted
    ('PiB PET', 'PiB PET.AIBL'),    # Positive, Negative
    ('AV45 PET', 'AV45 PET.AIBL'),  # Positive, Negative
    ('AV45 PET', 'AV45 PET.NACC'),  # Abnormally elevated amyloid on PET: 0=No, 1=Yes, 8=Unknown/not assessed
    ('AV45 PET', 'AV45 PET.EMIF'),  # 0.0 and 1.0
]

for feature_name, series_key in excluded_series:
    # pop with a default keeps the cell re-runnable: an entry that was already removed
    # (or a cohort that was never loaded) no longer raises KeyError
    result.get(feature_name, {}).pop(series_key, None)

# convert each feature dictionary into a dataframe and save it as a TSV file
for feature_name, series_by_cohort in result.items():

    # Age and Education are stored as integers
    if feature_name in ("Age", "Education"):
        for series_key, values in series_by_cohort.items():
            series_by_cohort[series_key] = list(map(int, values))

    # one column per "<feature>.<cohort>" series; shorter series are padded with missing values
    df = pd.DataFrame.from_dict(series_by_cohort, orient='index').transpose()
    df.index.name = 'Participant number'
    df.dropna(how='all', axis=1, inplace=True)

    # features without any reported value do not get a file
    if not df.empty:
        df.to_csv(f"{feature_name}.tsv", sep='\t', index_label='Participant number')