# Predictive Modeling on the ABCD study dataset: Data Exploration

In this notebook, we explore the Stop Signal Task (SST) data from the ABCD Study. The dataset comes from the ABCD Study: https://abcdstudy.org/

In [2]:
# TODO: Do we not have access to each participant's age at the time of each visit?

### Step 1: Import Python Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib import Path
from ipywidgets import interact
import ipywidgets as widgets

# Visualization parameters
%matplotlib inline
a4_dims = (9.7, 3.27)
plt.rcParams['figure.figsize'] = (7, 5)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 500
plt.rcParams["figure.autolayout"] = True

### Step 2: Load the data

We are interested in the following contrasts:

SST: correct Stop versus correct Go 

SST: incorrect Stop versus correct Go.

We use the column prefix `tfsstabwdp_` to select all of these contrasts, which are reported for the `Destrieux` parcellation atlas.

In [5]:
# Directory holding the raw ABCD tables, relative to this notebook
data = Path("..") / "data"

# Demographics table plus the two SST contrast tables
abcd_demo = pd.read_csv(data / "abcd_p_demo.csv")
df1 = pd.read_csv(data / "mri_y_tfmr_sst_csvcg_dst.csv")
df2 = pd.read_csv(data / "mri_y_tfmr_sst_isvcg_dst.csv")

# Combine both contrast tables into one frame, one row per (subject, event) of df1
merge_keys = ["src_subject_id", "eventname"]
data_df = pd.merge(df1, df2, on=merge_keys, how="left")

# Keep the identifier columns followed by every column whose name contains the SST prefix
sst_columns = data_df.filter(like="tfsstabwdp_")
Destrieux_SST = data_df.loc[:, merge_keys + sst_columns.columns.tolist()]

In [6]:
# Number of distinct subject IDs in the SST table
Destrieux_SST["src_subject_id"].nunique(dropna=False)

11027

### Step 3: Count null values

In [7]:
# Missing values per column; display only the columns that have any, largest count first
null_counts = Destrieux_SST.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

tfsstabwdp_302    9
tfsstabwdp_316    9
tfsstabwdp_324    9
tfsstabwdp_396    9
tfsstabwdp_322    9
                 ..
tfsstabwdp_272    5
tfsstabwdp_198    5
tfsstabwdp_271    5
tfsstabwdp_197    5
tfsstabwdp_154    5
Length: 296, dtype: int64

In [8]:
# Drop every row that has at least one missing value
Destrieux_SST = Destrieux_SST.dropna(axis=0, how="any")

# Re-count missing values per column; the displayed Series should now be empty
new_null_counts = Destrieux_SST.isna().sum()
new_null_counts.loc[new_null_counts.gt(0)]

Series([], dtype: int64)

### Step 4: Add contrasts information

Next, we want to extract the following contrasts:

- SST: correct Stop versus correct Go

- SST: incorrect Stop versus correct Go

The correct Stop versus correct Go features run from `tfsstabwdp_149` to `tfsstabwdp_296`.

The incorrect Stop versus correct Go features run from `tfsstabwdp_297` to `tfsstabwdp_444`.






In [9]:
# Preview the SST table after rows with missing values were dropped (the display is truncated)
Destrieux_SST

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,tfsstabwdp_366,tfsstabwdp_440,tfsstabwdp_367,tfsstabwdp_441,tfsstabwdp_368,tfsstabwdp_442,tfsstabwdp_369,tfsstabwdp_443,tfsstabwdp_370,tfsstabwdp_444
0,NDAR_INV003RTV85,baseline_year_1_arm_1,-0.149605,-0.174828,-0.013256,0.113432,-0.101847,-0.062040,-0.097332,-0.158990,...,-0.255056,-0.327929,-0.112484,-0.052497,-0.014438,-0.211108,0.067975,0.138177,0.029652,-0.102042
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,-0.172440,-0.223380,0.074744,0.060927,0.003457,-0.076963,0.175204,-0.164309,...,-1.888047,-1.416510,-0.398687,-0.636936,-0.466801,-0.354012,-0.325601,-0.256631,-0.057869,-0.414039
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,-0.010771,0.083618,0.055696,-0.003107,0.104688,0.016953,-0.231481,-0.268936,...,-0.170761,-0.178557,0.202939,0.037519,0.417104,0.668192,0.148352,0.095323,0.145479,0.154221
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,-0.053103,-0.004048,0.046030,0.086685,0.063254,0.155009,0.124548,0.067561,...,-0.246516,-0.331469,-0.137232,-0.181366,0.147939,-0.192708,-0.056710,-0.008076,-0.201146,-0.007578
4,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,-0.129385,-0.053957,0.148208,0.106023,0.050831,0.143123,-0.033714,0.112141,...,-0.021743,0.106453,0.074073,0.115022,0.168943,0.162317,0.142121,0.181069,0.188156,0.095772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19927,NDAR_INVZZZ2ALR6,baseline_year_1_arm_1,0.088177,0.158888,0.118404,0.108464,-0.016163,0.078596,-0.094376,0.073550,...,0.124193,0.200333,0.246803,0.383458,-0.024918,0.247412,0.278051,0.491906,0.323814,0.249895
19928,NDAR_INVZZZNB0XC,4_year_follow_up_y_arm_1,0.247949,0.208909,0.343190,0.300724,0.203630,0.267437,0.109268,0.122724,...,-0.628015,-0.330611,-0.131158,-0.122427,0.140082,0.020478,-0.055349,-0.096313,-0.147136,-0.059786
19929,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,-0.148778,-0.091126,0.064281,0.178064,-0.124523,-0.011796,-0.091755,0.034466,...,-0.500629,-0.987296,-0.590315,-0.442530,-0.227099,0.173324,-0.217835,-0.178233,-0.288869,-0.212007
19930,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,-0.087253,-0.075054,-0.059327,-0.064608,-0.023440,0.020002,0.007485,-0.136996,...,0.229794,0.328085,0.250301,0.197008,0.075787,-0.041175,0.100473,0.097594,0.320063,0.276801


In [78]:
### Note: we keep the subjects whose sex is recorded as male/female at baseline and who have
### SST rows at both the baseline and the 2-year follow-up visits.
### This results in 6254 unique records per visit.

DEMO_COLUMNS = ["demo_sex_v2", "demo_brthdat_v2"]
SRC_AND_DEMO_COLUMNS = ["src_subject_id", *DEMO_COLUMNS]

# Row masks over the demographics table
is_baseline_demo = abcd_demo["eventname"] == "baseline_year_1_arm_1"
is_followup_demo = abcd_demo["eventname"] == "2_year_follow_up_y_arm_1"
has_known_sex = abcd_demo["demo_sex_v2"].isin([1, 2])

# Baseline demographics restricted to subjects with a known male/female value
baseline_demo = abcd_demo.loc[is_baseline_demo & has_known_sex, SRC_AND_DEMO_COLUMNS]
b_subject_ids = baseline_demo["src_subject_id"].unique()

# Follow-up demographics for those same subjects.
# NOTE(review): this frame is not used by the rest of this cell - confirm whether it is needed.
followup_demo = abcd_demo.loc[
    is_followup_demo & abcd_demo["src_subject_id"].isin(b_subject_ids),
    SRC_AND_DEMO_COLUMNS,
]

# Split the SST table into the baseline and the two-year follow-up visits
sst_event = Destrieux_SST["eventname"]
baseline_df = Destrieux_SST[sst_event == "baseline_year_1_arm_1"]
followup_df = Destrieux_SST[sst_event == "2_year_follow_up_y_arm_1"]

# Narrow the subject list step by step: known sex -> has baseline SST -> also has follow-up SST.
# The second line must see the reassigned b_subject_ids from the first.
b_subject_ids = baseline_df.loc[
    baseline_df["src_subject_id"].isin(b_subject_ids), "src_subject_id"
].unique()
f_subject_ids = followup_df.loc[
    followup_df["src_subject_id"].isin(b_subject_ids), "src_subject_id"
].unique()

# Keep only those subjects in each visit and attach the demographic columns.
# NOTE(review): both visits are joined with the *baseline* demographics, so the demographic
# values on the follow-up rows are the ones recorded at baseline - confirm this is intended.
baseline_df = pd.merge(
    baseline_df[baseline_df["src_subject_id"].isin(f_subject_ids)],
    baseline_demo,
    on="src_subject_id",
)
followup_df = pd.merge(
    followup_df[followup_df["src_subject_id"].isin(f_subject_ids)],
    baseline_demo,
    on="src_subject_id",
)

In [79]:
# Baseline SST rows for the selected subjects, with the demographic columns appended
baseline_df

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,tfsstabwdp_367,tfsstabwdp_441,tfsstabwdp_368,tfsstabwdp_442,tfsstabwdp_369,tfsstabwdp_443,tfsstabwdp_370,tfsstabwdp_444,demo_sex_v2,demo_brthdat_v2
0,NDAR_INV00CY2MDM,baseline_year_1_arm_1,0.001988,-0.061784,0.025479,0.023070,0.084555,0.120872,-0.072277,0.108144,...,-0.091349,-0.015412,-0.056517,0.008476,0.113112,0.037077,0.363842,0.117866,1.0,10.0
1,NDAR_INV00HEV6HB,baseline_year_1_arm_1,0.156079,0.085460,-0.171020,0.079661,-0.081916,-0.045603,-0.557458,-0.269451,...,0.188224,0.199589,0.022870,0.094763,0.055854,0.096612,0.092752,0.338330,1.0,10.0
2,NDAR_INV00U4FTRU,baseline_year_1_arm_1,0.180306,0.126086,-0.031762,0.027738,-0.043941,-0.041289,0.161269,-0.086785,...,0.089396,0.248830,0.265644,0.288229,0.200084,0.210433,-0.177977,-0.081459,2.0,10.0
3,NDAR_INV00X2TBWJ,baseline_year_1_arm_1,0.122249,-0.082942,0.181219,0.122269,0.045643,0.117616,0.032588,-0.441620,...,-0.650502,-0.329056,0.186446,-0.475751,-0.319254,-0.180275,0.064805,0.310237,2.0,10.0
4,NDAR_INV010ZM3H9,baseline_year_1_arm_1,-0.247205,-0.735447,0.756608,-0.053257,-0.225100,-0.163556,-2.946458,-3.648983,...,-0.010917,0.291357,0.555856,0.354529,0.167112,0.238722,0.074551,-0.116797,1.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,-0.056455,-0.036248,-0.052365,0.030967,0.010967,0.019874,-0.179490,-0.048514,...,0.151839,0.067883,0.085518,0.256401,0.124144,0.081280,0.039077,0.017679,2.0,10.0
6250,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,0.117272,0.081566,0.233194,0.018394,-0.018188,0.082699,-0.595456,-0.438072,...,-0.155276,-0.072306,-0.018330,-0.120625,-0.011435,-0.073285,-0.205149,-0.730087,1.0,9.0
6251,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,0.025210,0.164892,-0.030853,-0.081547,-0.018977,0.031020,0.398041,0.132955,...,-0.021433,-0.046190,0.216747,-0.094131,-0.044741,0.014849,-0.425063,-0.124528,2.0,9.0
6252,NDAR_INVZZPKBDAC,baseline_year_1_arm_1,-0.021217,-0.039354,0.162215,0.197989,0.173467,0.111320,0.173190,0.130388,...,-0.040376,-0.027597,0.046160,-0.053243,-0.019757,0.059940,-0.027809,0.003229,2.0,9.0


In [80]:
# 2-year follow-up SST rows for the selected subjects, with the demographic columns appended
followup_df

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,tfsstabwdp_367,tfsstabwdp_441,tfsstabwdp_368,tfsstabwdp_442,tfsstabwdp_369,tfsstabwdp_443,tfsstabwdp_370,tfsstabwdp_444,demo_sex_v2,demo_brthdat_v2
0,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,-0.129385,-0.053957,0.148208,0.106023,0.050831,0.143123,-0.033714,0.112141,...,0.074073,0.115022,0.168943,0.162317,0.142121,0.181069,0.188156,0.095772,1.0,10.0
1,NDAR_INV00HEV6HB,2_year_follow_up_y_arm_1,-0.424897,-0.308583,-0.177224,-0.155726,-0.273635,-0.255610,-0.526887,-0.206159,...,0.118090,0.090322,0.019427,-0.087042,0.051624,0.032760,0.058516,0.164985,1.0,10.0
2,NDAR_INV00U4FTRU,2_year_follow_up_y_arm_1,0.079677,0.133159,0.087999,-0.002313,0.186444,0.168833,0.138955,0.001202,...,-0.007191,0.264584,0.337145,0.258847,0.068696,0.140752,0.058789,0.101559,2.0,10.0
3,NDAR_INV00X2TBWJ,2_year_follow_up_y_arm_1,0.181363,0.181648,0.196699,0.133660,0.105910,0.042437,0.562238,-0.116366,...,0.070007,0.115758,0.285918,0.142432,0.089174,0.142611,0.166475,0.055976,2.0,10.0
4,NDAR_INV010ZM3H9,2_year_follow_up_y_arm_1,-0.051536,-0.071374,-0.017478,-0.015395,0.049456,0.073359,0.098185,-0.160475,...,-0.510454,-0.369393,-0.236031,-0.491348,-0.292113,-0.116190,-0.208025,-0.561162,1.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,NDAR_INVZZ6ZJ2KY,2_year_follow_up_y_arm_1,-0.000053,0.068060,0.057578,0.131181,0.075422,0.029315,0.090219,0.022789,...,-0.158502,0.130759,-0.155325,-0.027902,-0.081070,0.130289,-0.058614,-0.202098,2.0,10.0
6250,NDAR_INVZZ81LEEV,2_year_follow_up_y_arm_1,0.023479,-0.069686,0.074941,0.051139,0.059338,0.108405,-0.188261,0.114417,...,-0.061758,0.028571,0.036838,0.084608,-0.094601,-0.006263,0.063620,0.049315,1.0,9.0
6251,NDAR_INVZZLZCKAY,2_year_follow_up_y_arm_1,-0.656909,-1.034874,-0.482399,-0.063011,0.232971,0.159990,-11.399078,-13.210508,...,-0.086918,0.004311,0.299082,0.744725,0.014859,0.201640,0.065857,0.058375,2.0,9.0
6252,NDAR_INVZZPKBDAC,2_year_follow_up_y_arm_1,-0.023196,0.048996,0.070226,0.067451,0.008806,0.068069,-0.053043,-0.315634,...,0.061925,0.034149,-0.117123,-0.155142,0.123427,0.121889,-0.096733,-0.009967,2.0,9.0


In [84]:
# Column names per contrast (both ranges written with inclusive upper bounds):
#   correct Stop vs. correct Go   -> tfsstabwdp_149 .. tfsstabwdp_296
#   incorrect Stop vs. correct Go -> tfsstabwdp_297 .. tfsstabwdp_444
c_stop_go_features = [f"tfsstabwdp_{idx}" for idx in range(149, 296 + 1)]
i_stop_go_features = [f"tfsstabwdp_{idx}" for idx in range(297, 444 + 1)]

# Full column selection for each contrast: identifiers, contrast features, demographics
id_columns = ["src_subject_id", "eventname"]
c_stop_go_columns = [*id_columns, *c_stop_go_features, *DEMO_COLUMNS]
i_stop_go_columns = [*id_columns, *i_stop_go_features, *DEMO_COLUMNS]

# One frame per (visit, contrast) combination
baseline_c_stop_go_df = baseline_df[c_stop_go_columns]
baseline_i_stop_go_df = baseline_df[i_stop_go_columns]

followup_c_stop_go_df = followup_df[c_stop_go_columns]
followup_i_stop_go_df = followup_df[i_stop_go_columns]

In [85]:
# Report the (rows, columns) shape of each contrast-specific frame
for frame in (
    baseline_c_stop_go_df,
    baseline_i_stop_go_df,
    followup_c_stop_go_df,
    followup_i_stop_go_df,
):
    print(frame.shape)

(6254, 152)
(6254, 152)
(6254, 152)
(6254, 152)


In [86]:
### Create a "no-touch" 10% dataset that is not looked at until the very end.
# Each frame is sorted by subject ID; the first 90% of rows (in that order) become the dev set
# and the remaining rows become the held-out test set, so the split is repeatable across runs.
# NOTE(review): the four frames appear to contain the same subjects, which would give every
# file the same subject-level split - confirm before relying on it.
dataframes = [baseline_c_stop_go_df, baseline_i_stop_go_df, followup_c_stop_go_df, followup_i_stop_go_df]
filenames = ["baseline_c_stop_go.csv", "baseline_i_stop_go.csv", "followup_c_stop_go.csv", "followup_i_stop_go.csv"]

# Make sure the output directory exists so this cell also runs on a fresh checkout;
# DataFrame.to_csv does not create missing parent directories.
Path("../output").mkdir(parents=True, exist_ok=True)

for filename, df in zip(filenames, dataframes):
    df = df.sort_values("src_subject_id")

    # 90/10 split by position in the sorted frame
    dev_size = int(0.9 * len(df))
    dev_df = df.iloc[:dev_size]
    test_df = df.iloc[dev_size:]

    # Save to files
    dev_filename = f"../output/dev_{filename}"
    dev_df.to_csv(dev_filename, index=False)
    print(f"Saved dev dataframe to {dev_filename}...")

    test_filename = f"../output/test_{filename}"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test dataframe to {test_filename}...\n")

Saved dev dataframe to ../output/dev_baseline_c_stop_go.csv...
Saved test dataframe to ../output/test_baseline_c_stop_go.csv...

Saved dev dataframe to ../output/dev_baseline_i_stop_go.csv...
Saved test dataframe to ../output/test_baseline_i_stop_go.csv...

Saved dev dataframe to ../output/dev_followup_c_stop_go.csv...
Saved test dataframe to ../output/test_followup_c_stop_go.csv...

Saved dev dataframe to ../output/dev_followup_i_stop_go.csv...
Saved test dataframe to ../output/test_followup_i_stop_go.csv...



In [61]:
# Inspect the demographics table (one row per subject and event; the display is truncated)
abcd_demo

Unnamed: 0,src_subject_id,eventname,demoi_p_select_language___1,demo_prim,demo_brthdat_v2,demo_ed_v2,demo_adopt_agex_v2,demo_adopt_agex_v2_bl_dk,demo_sex_v2,demo_gender_id_v2,...,demo_nat_lang_3_yrs_other_p_14,demo_nat_lang_3_p,demo_nat_lang_3_yrs_eng_p___1,demo_nat_lang_3_yrs_eng_p___2,demo_nat_lang_3_yrs_eng_p___3,demo_nat_lang_3_yrs_eng_p___4,demo_nat_lang_3_yrs_eng_p___5,demo_nat_lang_3_yrs_eng_p___6,race_ethnicity,acs_raked_propensity_score
0,NDAR_INV003RTV85,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,1.0,466.092707
1,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,1.0,533.381820
2,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
3,NDAR_INV003RTV85,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,
4,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1.0,10.0,4.0,,,2.0,2.0,...,,,,,,,,,3.0,520.488325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48802,NDAR_INVZZZP87KR,baseline_year_1_arm_1,0,1.0,10.0,5.0,,,2.0,2.0,...,,,,,,,,,2.0,835.604891
48803,NDAR_INVZZZP87KR,1_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,2.0,842.080358
48804,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,0,,,,,,,,...,,,,,,,,,,
48805,NDAR_INVZZZP87KR,3_year_follow_up_y_arm_1,0,,,,,,,,...,,0.0,,,,,,,,


In [65]:
# List the demographics column names at positions 100-199
abcd_demo.columns[100:200]

Index(['fam_roster_3c_v2', 'fam_roster_4c_v2', 'fam_roster_5c_v2',
       'fam_roster_6c_v2', 'fam_roster_7c_v2', 'fam_roster_8c_v2',
       'fam_roster_9c_v2', 'fam_roster_10c_v2', 'fam_roster_11c_v2',
       'fam_roster_12c_v2', 'fam_roster_13c_v2', 'fam_roster_14c_v2',
       'fam_roster_15c_v2', 'demo_child_time_v2', 'demo_child_time2_v2',
       'demo_child_time2_v2_dk', 'demo_child_time3_v2', 'demo_yrs_1',
       'demo_yrs_2', 'demo_yrs_2a_2', 'demo_yrs_2b_2',
       'demo_yrs_2_no_display___1', 'demo_race_a_p___0',
       'demo_l_p_select_language___1', 'demo_prim_l', 'demo_brthdat_v2_l',
       'demo_ed_v2_l', 'demo_gender_id_v2_l', 'demo_nat_lang_l',
       'demo_nat_lang_2_l', 'demo_dual_lang_v2_l',
       'demo_dual_lang_years_p___1', 'demo_dual_lang_years_p___2',
       'demo_dual_lang_years_p___3', 'demo_dual_lang_years_p___4',
       'demo_dual_lang_years_p___5', 'demo_dual_lang_years_p___6',
       'demo_dual_lang_years_p___7', 'demo_dual_lang_years_p___8',
       'demo_

In [68]:
# Check whether the `demo_brthdat_v2` column exists in the demographics table
"demo_brthdat_v2" in abcd_demo.columns

True

In [69]:
# Check whether an `interview_age` column exists in the demographics table
"interview_age" in abcd_demo.columns

False

In [71]:
# Peek at the first 10 values of `demo_brthdat_v2` across all events
abcd_demo["demo_brthdat_v2"].head(10)

0    10.0
1     NaN
2     NaN
3     NaN
4    10.0
5     NaN
6     NaN
7     NaN
8    10.0
9     NaN
Name: demo_brthdat_v2, dtype: float64