In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Path templates; "{}" is filled in with the table name.
table_path_fstring = "/home/ebrahim/data/abcd/Package_1200530/{}.txt"
dict_path_fstring = "/home/ebrahim/data/abcd/abcd-4.0-data-dictionaries/{}.csv"


def read_abcd_table(table_name):
    """Load one table and its data dictionary by table name.

    Parameters
    ----------
    table_name : str
        Substituted into ``table_path_fstring`` and ``dict_path_fstring``
        to locate the two files.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The table itself, followed by its data dictionary indexed by the
        ``ElementName`` column.
    """
    # The dictionary is read first, then the table (same order as before).
    dictionary = pd.read_csv(
        dict_path_fstring.format(table_name),
        index_col='ElementName',
    )
    # Tab-separated file: line 0 holds the column names, line 1 is skipped
    # (presumably a row of column descriptions -- confirm against the raw file).
    table = pd.read_csv(
        table_path_fstring.format(table_name),
        sep='\t',
        header=0,
        skiprows=[1],
    )
    return table, dictionary

In [3]:
# Load demographic data and mental health data.
# Each call returns the table plus its data dictionary.
demo_tab, demo_dd = read_abcd_table("pdem02")
mh_tab, mh_dd = read_abcd_table("abcd_ksad01")

In [4]:
# Verify they have the exact same number of subjects
# (number of distinct subjectkey values in each table; a NaN key, if any, counts as one value).
mh_tab['subjectkey'].nunique(dropna=False) == demo_tab['subjectkey'].nunique(dropna=False)

True

In [54]:
# The purpose is to verify that any pair of rows with the same subject key have all other
# data also matching (or nan). So there is just duplication for some reason.
# For every subject, count the distinct non-NaN values in each column; the built-in
# groupby nunique is used instead of a Python lambda per column and group, which was slow.
peup = demo_tab.groupby('subjectkey').nunique()
# Every count must be 0 (all NaN) or 1 (a single repeated value).
peup.isin([0, 1]).all().all()

True

In [5]:
# Just take the first of each pair then.
# drop_duplicates keeps the first row seen for each subjectkey without calling a Python
# function per group, and avoids groupby.apply operating on the grouping column
# (deprecated in recent pandas). The dropna / sort_values / reset_index steps reproduce
# what the groupby-based version produced: rows with a missing key are left out, rows are
# ordered by subjectkey, and the index is a fresh 0..n-1 range.
demo_tab_fixed = (
    demo_tab
    .dropna(subset=['subjectkey'])
    .drop_duplicates(subset=['subjectkey'], keep='first')
    .sort_values('subjectkey')
    .reset_index(drop=True)
)

In [6]:
# I've already verified in the notebook "abcd_bipolar_richness" that there's duplication in mh_tab
# So also let's just take the first of each pair (where this time it's both subjectkey and interview age
# that index the rows with unique information).
# drop_duplicates keeps the first row for each (subjectkey, interview_age) pair without a
# per-group Python call; dropna / sort_values / reset_index reproduce the groupby-based
# result (rows with a missing key left out, sorted by key, fresh 0..n-1 index).
mh_key_cols = ['subjectkey', 'interview_age']
mh_tab_fixed = (
    mh_tab
    .dropna(subset=mh_key_cols)
    .drop_duplicates(subset=mh_key_cols, keep='first')
    .sort_values(mh_key_cols)
    .reset_index(drop=True)
)

In [90]:
# Function to help take stratified samples
def take_sample_stratified_by(df, by, frac, random_state=None):
    """Sample the same fraction of rows from every stratum of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    by : str or list of str
        Column name(s) whose value combinations define the strata. Rows with a
        missing value in any of these columns belong to no group (pandas
        groupby default) and are therefore never sampled.
    frac : float
        Fraction of each stratum to draw.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None, optional
        Seed or generator forwarded to pandas. Pass a fixed value to get the
        same sample on every run; the default ``None`` draws from NumPy's
        global random state, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns (including the ``by``
        columns) and their original index labels.
    """
    # GroupBy.sample draws within each group directly, so no per-group Python
    # lambda is needed and the grouping columns are always kept in the result.
    return df.groupby(by).sample(frac=frac, random_state=random_state)

In [103]:
# example taking sample stratified by gender:
# draw 0.5% of each gender group, then count the sampled rows per group
gender_col = 'demo_gender_id_v2'
take_sample_stratified_by(demo_tab_fixed, [gender_col], 0.005)[gender_col].value_counts()

1.0    31
2.0    28
Name: demo_gender_id_v2, dtype: int64

In [128]:
# Let's see how many people selected more than one race
# The race indicator columns are numbered 10..25 plus 77 and 99.
race_binary_cols = [f'demo_race_a_p___{n}' for n in [*range(10, 26), 77, 99]]
# Sum the indicators across columns for each row (vectorised, instead of a Python
# lambda per row), then tabulate how many rows have each total.
demo_tab_fixed[race_binary_cols].sum(axis=1).value_counts()

1    10372
2     1253
3      192
4       30
0       25
5        4
dtype: int64

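Okay, race looks more complicated: well over a thousand participants selected more than one category (and 25 selected none), so it can't be used directly as a single stratification variable. Let's ignore it and just hope random sampling takes care of it.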

I'm also not going to include mental health info from `mh_tab` in the stratification for now, since we don't yet have much of a justification for stratifying by it.

In [139]:
# Columns whose value combinations define the sampling strata:
#   demo_gender_id_v2 -- Gender identity
#   demo_brthdat_v2   -- Baseline age
stratify_by = ['demo_gender_id_v2', 'demo_brthdat_v2']

In [343]:
# Seed NumPy's global random state first: the sampling below draws from it when no
# explicit random_state is given, so without a seed every re-run of the notebook would
# select (and save) a different set of subjects.
np.random.seed(0)
# Draw 0.5% of every stratum defined by `stratify_by`.
sample = take_sample_stratified_by(demo_tab_fixed, stratify_by, 0.005)
# Save the sampled subject keys.
sample.subjectkey.to_csv('sampled_subjectkeys.csv')

In [7]:
# Read the fmriresults01 table directly from the package directory:
# tab-separated, column names on the first line, second line skipped.
table_path = table_path_fstring.format("fmriresults01")
dmri_df = pd.read_csv(
    table_path,
    header=0,
    skiprows=[1],
    sep='\t',
)

In [8]:
# There appears to also be a duplication of rows in the fmriresults01 table
# The following lines include 'fmriresults01_id' in the key to fix the duplication.
# The reason for this is that there are sometimes multiple scans for the same subject with the same interview age
# (I believe the reason for this is that the kids
# sometimes had to take a break in the middle of a scan session)
#
# Keep the first row for each key combination. This is done with drop_duplicates, as a
# plain 0..n-1 indexed frame, rather than groupby(...).apply(...) without as_index=False:
# that form left the three key columns both as index levels and as regular columns
# (ambiguous labels for later groupby/sort calls) and ran a Python function per group.
# dropna / sort_values mirror groupby's handling: rows with a missing key are left out
# and the result is ordered by the key columns.
dmri_key_cols = ['subjectkey', 'interview_age', 'fmriresults01_id']
dmri_df_fixed = (
    dmri_df
    .dropna(subset=dmri_key_cols)
    .drop_duplicates(subset=dmri_key_cols, keep='first')
    .sort_values(dmri_key_cols)
    .reset_index(drop=True)
)

In [356]:
# Keep only the scan rows whose subjectkey appears in the stratified sample,
# and renumber the rows from 0 (the previous index is discarded).
sample_dmri = dmri_df_fixed[
    dmri_df_fixed['subjectkey'].isin(sample['subjectkey'])
].reset_index(drop=True)

In [357]:
# Write the sampled fmriresults01 rows to a CSV file in the working directory
# (the row index is written as the first column, pandas' default).
sample_dmri.to_csv(path_or_buf='sampled_fmriresults01.csv')

In [370]:
# Write the derived_files value of every sampled row to a text file, one per line.
with open('sample_derived_files.txt', 'w') as out_file:
    out_file.writelines("%s\n" % entry for entry in sample_dmri.derived_files)