## Imports

In [1]:
from statistics import mean
import numpy as np
import pandas as pd 
import math
from collections import Counter
from random import sample
import os
import glob
import copy

## Read CSV Files

In [2]:
# Resolve the CSV paths once so that the data frames and the cohort names are
# derived from the same sorted list and cannot fall out of step (os.listdir
# also returns any non-CSV entries that live in the directory).
csv_paths = sorted(glob.glob('../../preprocessed_datasets' + "/*."+'csv'))
dataframes = [pd.read_csv(path, sep=',', index_col=0) for path in csv_paths]
# os.path.splitext removes only the file extension; str.strip(".csv") removes
# any leading/trailing '.', 'c', 's' or 'v' characters and can mangle names.
cohorts = [os.path.splitext(os.path.basename(path))[0] for path in csv_paths]

In [3]:
# Keep only the rows where Visit equals 1 (the BL visit) and the diagnosis is 'AD'
all_cohorts = {}
for name, df in zip(cohorts, dataframes):
    is_first_visit = df["Visit"] == 1
    is_ad = df["Diagnosis"].astype(str) == 'AD'
    # .copy() detaches the selection from the source frame before it is modified below
    all_cohorts[name] = df.loc[is_first_visit & is_ad].copy()

# Cast APOE4 to float in every cohort so the column has one dtype everywhere
for cohort_df in all_cohorts.values():
    cohort_df['APOE4'] = cohort_df['APOE4'].astype(float)

## Functions to Perform Essential Tasks

In [4]:
def extract_features(df_dict, result_dict):
    """Fill the per-feature count tables in ``result_dict`` in place.

    For every feature key in ``result_dict`` and every cohort in ``df_dict``
    whose data frame has a not-entirely-empty column for that feature, each
    column label of the feature's table receives the number of unique index
    values of the cohort rows whose feature value equals that label.

    Labels of the 'Sex' table are compared as they are; labels of every other
    table are converted with ``float()`` before the comparison.

    Rows (cohorts) that are still entirely NaN afterwards are dropped from the
    table. Nothing is returned; the tables in ``result_dict`` are modified.
    """
    for feature in result_dict:
        counts = result_dict[feature]
        compare_raw = feature == 'Sex'

        for cohort in df_dict:
            frame = df_dict[cohort]

            # skip cohorts where the feature column is absent or holds no values
            if feature not in frame.dropna(axis=1, how='all').columns:
                continue

            for label in counts.columns:
                wanted = label if compare_raw else float(label)
                matching = frame.loc[frame[feature] == wanted]
                # count participants via the distinct index values of the matches
                counts.loc[cohort, label] = len(matching.index.unique())

        # remove the cohorts for which nothing was recorded for this feature
        counts.dropna(axis=0, how='all', inplace=True)

### Results

In [5]:
# one empty count table (cohorts x measurement values) per feature, keyed by feature name
result = {
    'Sex': pd.DataFrame(index=all_cohorts.keys(), columns=['Female', 'Male']),
    'APOE4': pd.DataFrame(index=all_cohorts.keys(), columns=['0.0', '1.0', '2.0']),
    'CDR': pd.DataFrame(index=all_cohorts.keys(), columns=[0.0, 0.5, 1.0, 2.0, 3.0]),
}

# fill the tables in place with the participant counts per cohort and measurement value
extract_features(all_cohorts, result)

In [6]:
# move the 'Sex' table to the key 'Biological Sex' (it becomes the last entry of the dict)
sex = result.pop('Sex')
result['Biological Sex'] = sex

### Save the results into tsv files

In [7]:
# write each feature's count table to a tab-separated file named after the feature
for feature, table in result.items():
    table.index.name = 'Participant number'
    table.to_csv(f"{feature}.tsv", sep='\t', index_label='Participant number')