### Imports

In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
import os
import glob
import copy

In [2]:
# Discover the TSV tables once and derive both the frames and their names from the
# same sorted path list. Listing the directory separately with os.listdir would also
# return non-TSV entries, so the names could get out of step with the dataframes and
# be paired with the wrong table further down.
tsv_paths = sorted(glob.glob(os.path.join('feature_availability_in_cohorts', '*.tsv')))
if not tsv_paths:
    # Fail loudly instead of continuing with empty lists.
    raise FileNotFoundError("no .tsv files found in 'feature_availability_in_cohorts/'")

dataframes = [pd.read_csv(path, sep='\t') for path in tsv_paths]
# A modality name is the file name up to its first dot.
modalities = [os.path.basename(path).split(".")[0] for path in tsv_paths]

In [3]:
# Pair every modality name with its table so each dataframe can be looked up by name.
all_mappings = dict(zip(modalities, dataframes))

# The folder also holds one table that is irrelevant here; discard it.
del all_mappings['nonexistence_features']

### Count the number of cohort studies per mapped feature
#### Count the ones that exist in the dataframe. A score of 1 indicates that the measurement was reported in the dataset, whereas a score of 0 indicates that it was not collected.

In [4]:
# Index every table by its "Feature" column so rows are addressed by feature name.
# Tables already indexed by "Feature" are left untouched, which keeps this cell safe
# to re-run: calling set_index a second time would raise KeyError because the column
# has already been moved into the index.
for moda, table in all_mappings.items():
    if table.index.name != "Feature":
        all_mappings[moda] = table.set_index("Feature")

In [5]:
# Per-table summaries: maps each table name to a one-column ('total') dataframe.
result = {}

# Within each table, tally row by row (i.e. per feature) how many entries equal 1.
for table_name, table in all_mappings.items():
    per_feature_hits = table.eq(1).sum(axis=1)
    result[table_name] = pd.DataFrame.from_dict(
        dict(per_feature_hits), orient='index', columns=['total']
    )

In [6]:
# Preview the summary computed for the 'clinical_i' table.
result['clinical_i']

Unnamed: 0,total
Diagnosis,18
Clinical Dementia Rating (CDR),17
Clinical Dementia Rating Scale Sum of Boxes (CDRSB),8
Mini-Mental State Examination (MMSE),17
Montreal Cognitive Assessment (MoCA),5
Geriatric Depression Scale (GDS),10
Alzheimer's Disease Assessment Scale 11-item (ADAS11),5
Alzheimer’s Disease Assessment Scale 13-item (ADAS13),4
Functional Activities Questionnaire (FAQ),5
Neuropsychiatric Inventory (NPI),3


### Save to tsv

In [7]:
# Write each summary to "num_mapped_feat_total/<table name>.tsv" (tab-separated).
# Create the output folder first: to_csv does not create missing directories and
# would otherwise fail when the folder is absent.
os.makedirs("num_mapped_feat_total/", exist_ok=True)
for modal in result:
    result[modal].to_csv("num_mapped_feat_total/" + f"{modal}.tsv", sep='\t')