In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Root directory of the local ABCD download. Set the ABCD_DATA_ROOT environment
# variable to point somewhere else; the fallback is the location that used to be
# hard-coded here, so existing setups keep working unchanged.
ABCD_DATA_ROOT = os.environ.get("ABCD_DATA_ROOT", "/home/ebrahim/data/abcd").rstrip("/")

# Path templates: `.format(table_name)` fills in the `{}` placeholder.
table_path_fstring = ABCD_DATA_ROOT + "/Package_1200530/{}.txt"
dict_path_fstring = ABCD_DATA_ROOT + "/abcd-4.0-data-dictionaries/{}.csv"

def read_abcd_table(table_name):
    """Load one ABCD table together with its data dictionary.

    Both file locations are built by substituting ``table_name`` into the
    module-level templates ``table_path_fstring`` and ``dict_path_fstring``.

    Parameters
    ----------
    table_name : str
        Name substituted into both path templates (e.g. ``"pdem02"``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The tab-separated table, read with its first line as the header and
        its second line skipped, followed by the data dictionary indexed by
        its ``ElementName`` column.
    """
    dictionary = pd.read_csv(
        dict_path_fstring.format(table_name),
        index_col='ElementName',
    )
    table = pd.read_csv(
        table_path_fstring.format(table_name),
        sep='\t',
        header=0,
        # Skip the second line of the file (presumably a row of column
        # descriptions rather than data -- confirm against the raw files).
        skiprows=[1],
    )
    return table, dictionary

In [None]:
# Load each table used below together with its data dictionary:
#   abcd_ksad01 -> mental health
#   pdem02      -> demographics
#   abcd_mri01  -> scanner info
#   abcd_lt01   -> study site info
mh_tab, mh_dd = read_abcd_table(table_name="abcd_ksad01")
demo_tab, demo_dd = read_abcd_table(table_name="pdem02")
mri_tab, mri_dd = read_abcd_table(table_name="abcd_mri01")
lt_tab, lt_dd = read_abcd_table(table_name="abcd_lt01")

In [None]:
# Verify they have the exact same number of subjects
# (nunique(dropna=False) counts the same values as len(unique()), NaN included)
assert mh_tab['subjectkey'].nunique(dropna=False) == demo_tab['subjectkey'].nunique(dropna=False)
assert lt_tab['subjectkey'].nunique(dropna=False) == demo_tab['subjectkey'].nunique(dropna=False)

# Except MRI info is missing some subjects
print(
    lt_tab['subjectkey'].nunique(dropna=False) - mri_tab['subjectkey'].nunique(dropna=False),
    "subjects do not show up in scanner info table.",
)

In [None]:
# Sanity check (optional): verify that any pair of rows with the same subject key
# have all other data also matching (or nan). So there is just duplication for some reason.
# The built-in groupby `nunique` is used instead of a per-group Python lambda
# (the lambda version is what made this cell slow).
peup = demo_tab.groupby('subjectkey').nunique()
# A count of 0 means the column is all-NaN for that subject; 1 means a single shared value.
peup.isin([0, 1]).all().all()

In [None]:
# Sanity check (optional): verify that any pair of rows with the same
# subject key and interview age have all other data also matching.
# So there is duplication for some reason
# and we can safely select the first element at each (subjectkey, interview_age) pair.
# The built-in groupby `nunique` is used instead of a per-group Python lambda
# (the lambda version is what made this cell slow).
peup = mri_tab.groupby(['subjectkey', 'interview_age']).nunique()
# Strict check: every column must hold exactly one distinct non-NaN value per pair.
peup.eq(1).all().all()

In [None]:
# Sanity check (optional). Similar to the duplicate checks on the other tables, but for lt_tab.
# This time it's the eventname, and not the interview_age, that should give a unique row for a given subject
# (Weirdly, for the same subject and same interview age, it is possible for more than one event to occur;
# for example the 18 month follow-up and the 2 year follow-up showing up with the same interview age)
# The built-in groupby `nunique` is used instead of a per-group Python lambda
# (the lambda version is what made this cell slow).
peup = lt_tab.groupby(['subjectkey', 'eventname']).nunique()
# A count of 0 means the column is all-NaN for that pair; 1 means a single shared value.
peup.isin([0, 1]).all().all()

Note: the deduplication in the cells below is more idiomatically done with `drop_duplicates`; I didn't know about it when I first wrote them.

In [None]:
# Just take the first of each duplicated set of rows.
# `drop_duplicates` keeps the first row seen for each key and, unlike
# `groupby(...).apply(lambda s: s.iloc[0])`, does not depend on `apply` passing the
# grouping columns through to the result.
# `dropna` + `sort_values` + `reset_index` reproduce what the groupby version gave:
# rows with a missing key dropped, rows ordered by key, and a fresh 0..n-1 index.
demo_tab_fixed = (
    demo_tab.dropna(subset=['subjectkey'])
    .drop_duplicates(subset=['subjectkey'])
    .sort_values(['subjectkey'])
    .reset_index(drop=True)
)
mri_tab_fixed = (
    mri_tab.dropna(subset=['subjectkey', 'interview_age'])
    .drop_duplicates(subset=['subjectkey', 'interview_age'])
    .sort_values(['subjectkey', 'interview_age'])
    .reset_index(drop=True)
)
lt_tab_fixed = (
    lt_tab.dropna(subset=['subjectkey', 'eventname'])
    .drop_duplicates(subset=['subjectkey', 'eventname'])
    .sort_values(['subjectkey', 'eventname'])
    .reset_index(drop=True)
)

In [None]:
# I've already verified in the notebook "abcd_bipolar_richness" that there's duplication in mh_tab
# So also let's just take the first of each pair (where this time it's both subjectkey and interview age
# that index the rows with unique information).
# `drop_duplicates` keeps the first row seen for each key; `dropna` + `sort_values` +
# `reset_index` reproduce the groupby-based result (rows with a missing key dropped,
# rows ordered by key, fresh 0..n-1 index) without relying on `groupby.apply`
# passing the grouping columns through.
mh_tab_fixed = (
    mh_tab.dropna(subset=['subjectkey', 'interview_age'])
    .drop_duplicates(subset=['subjectkey', 'interview_age'])
    .sort_values(['subjectkey', 'interview_age'])
    .reset_index(drop=True)
)

In [None]:
# Function to help take stratified samples
def take_sample_stratified_by(df, by, frac, random_state=None):
    """Draw the same fraction of rows from every group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    by : label or list of labels
        Column(s) defining the strata; passed straight to ``DataFrame.groupby``.
    frac : float
        Fraction of each stratum's rows to draw.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None, optional
        Seed or generator forwarded to the sampler so that a draw can be
        reproduced. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns (including the ``by``
        columns) and their original index labels.

    Notes
    -----
    Rows whose ``by`` value is missing belong to no group under the default
    ``groupby`` settings and are therefore never sampled.
    """
    # GroupBy.sample draws within each group directly, so the grouping columns
    # stay in the result without going through groupby.apply.
    return df.groupby(by).sample(frac=frac, random_state=random_state)

In [None]:
# Example: draw a sample stratified by gender, then count the sampled rows per gender value
(
    take_sample_stratified_by(demo_tab_fixed, by=['demo_gender_id_v2'], frac=0.005)
    ['demo_gender_id_v2']
    .value_counts()
)

In [None]:
# Let's see how many people selected more than one race
race_binary_cols = [f'demo_race_a_p___{n}' for n in [*range(10, 26), 77, 99]]
# Vectorized row sum (number of race columns set per subject) instead of a
# Python-level lambda applied to every row; then tally how often each count occurs.
demo_tab_fixed[race_binary_cols].sum(axis=1).value_counts()

Okay, race looks a little more complicated, so let's ignore that and just hope random sampling takes care of it.

I'm also not going to factor the mental health info from `mh_tab` into the stratification for now. We don't have much of a justification at this point to stratify by mental health info.

In [None]:
# Let's see how many people ended up at multiple different sites:
# count distinct site ids per subject, then tally how often each count occurs
lt_tab_fixed.groupby('subjectkey')['site_id_l'].nunique().value_counts()

Hmm so there are 157 people who ended up at more than one study site.
So if we choose to stratify our sampling by study site then we'd have to do it at the "event" level and not at the "subject" level.

In [None]:
# Let's see how many people ended up with multiple different scanner models:
# count distinct scanner models per subject, then tally how often each count occurs
mri_tab_fixed.groupby('subjectkey')['mri_info_manufacturersmn'].nunique().value_counts()

So 465 people had scans with multiple different scanner models. If we choose to stratify our sampling by scanner model then we'd have to do it at the "event" level and not at the "subject" level.

Below I do stratification at the subject level only, so for now I ignore site and scanner in the sampling.

In [None]:
# Subject-level columns to stratify the sample on:
# gender identity (demo_gender_id_v2) and baseline age (demo_brthdat_v2).
stratify_by = ['demo_gender_id_v2', 'demo_brthdat_v2']

In [None]:
# Seed NumPy's global RNG so the stratified draw -- and therefore the subject list
# written to disk -- is the same on every run. pandas sampling falls back to this
# global state when it is not given an explicit random_state.
SAMPLING_SEED = 0
np.random.seed(SAMPLING_SEED)

# Draw 0.5% of the subjects within each stratum and save their subject keys.
sample = take_sample_stratified_by(demo_tab_fixed, stratify_by, 0.005)
sample.subjectkey.to_csv('sampled_subjectkeys.csv')

In [None]:
# Read the fmriresults01 table with the same parsing options read_abcd_table uses
# for the table file (tab-separated, first line as header, second line skipped).
table_path = table_path_fstring.format("fmriresults01")
dmri_df = pd.read_csv(
    table_path,
    sep='\t',
    header=0,
    skiprows=[1],
)

In [None]:
# There appears to also be a duplication of rows in the fmriresults01 table.
# 'fmriresults01_id' is included in the key used to fix the duplication.
# The reason for this is that there are sometimes multiple scans for the same subject with the same interview age
# (I believe the reason for this is that the kids
# sometimes had to take a break in the middle of a scan session)
#
# `drop_duplicates` keeps the first row seen for each key. `dropna` + `sort_values`
# reproduce what the earlier groupby(...).apply(lambda x: x.iloc[0]) gave (rows with a
# missing key dropped, rows ordered by key) without relying on `groupby.apply` passing
# the grouping columns through. The result has a plain 0..n-1 index instead of a
# MultiIndex of the keys; the key columns remain ordinary columns.
dmri_key_cols = ['subjectkey', 'interview_age', 'fmriresults01_id']
dmri_df_fixed = (
    dmri_df.dropna(subset=dmri_key_cols)
    .drop_duplicates(subset=dmri_key_cols)
    .sort_values(dmri_key_cols)
    .reset_index(drop=True)
)

In [None]:
# Keep only the rows belonging to sampled subjects, renumbering them from 0
sample_dmri = (
    dmri_df_fixed[dmri_df_fixed['subjectkey'].isin(sample['subjectkey'])]
    .reset_index(drop=True)
)

In [None]:
# Save the sampled fmriresults01 rows; with to_csv's defaults the DataFrame index
# is written as the first column.
sample_dmri.to_csv('sampled_fmriresults01.csv')

In [None]:
# Write the derived_files column of the sampled rows to a text file, one entry per line
with open('sample_derived_files.txt', 'w') as out_file:
    out_file.writelines("%s\n" % entry for entry in sample_dmri.derived_files)