# Data Selection

In this notebook, we select the variables relevant to the Stop Signal Task (SST) of the ABCD study. The dataset comes from the ABCD Study: https://abcdstudy.org/.

### Step 1: Import Python Packages

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib import Path
from ipywidgets import interact
import ipywidgets as widgets

# Visualization parameters
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): a4_dims is not referenced by any cell visible in this notebook.
a4_dims = (9.7, 3.27)
# Default figure size (inches) and resolution; saved figures use a higher DPI
# than the on-screen ones.
plt.rcParams['figure.figsize'] = (7, 5)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 500
plt.rcParams["figure.autolayout"] = True

# Silence pandas' chained-assignment warning for the whole notebook.
# NOTE(review): this also hides genuine "assignment on a copy" mistakes.
pd.set_option('mode.chained_assignment', None)

### Step 2: Load the data

We are interested in the following contrasts:

SST: correct Stop versus correct Go 

SST: incorrect Stop versus correct Go.

We use the column prefix `tfsstabwdp_` to select all columns belonging to these contrasts, which are computed on the `Destrieux` parcellation atlas.

In [63]:
# Directory holding the raw ABCD tables
data = Path("..") / "data" / "raw"

# Demographics and longitudinal tracking tables
abcd_demo = pd.read_csv(data / "abcd_p_demo.csv")
abcd_lt = pd.read_csv(data / "abcd_y_lt.csv")

# The two SST contrast tables
df1 = pd.read_csv(data / "mri_y_tfmr_sst_csvcg_dst.csv")
df2 = pd.read_csv(data / "mri_y_tfmr_sst_isvcg_dst.csv")

# Combine both contrast tables, keeping every (subject, event) row of df1
data_df = pd.merge(df1, df2, how="left", on=["src_subject_id", "eventname"])

# Keep the identifier columns plus every column carrying the SST prefix
sst_columns = data_df.filter(like="tfsstabwdp_")
id_columns = data_df.loc[:, ["src_subject_id", "eventname"]]
Destrieux_SST = pd.concat([id_columns, sst_columns], axis="columns")

In [64]:
# Number of distinct subjects in the merged SST table
Destrieux_SST["src_subject_id"].nunique(dropna=False)

11027

### Step 3: Count null values

In [65]:
# Missing values per column; show only the affected columns, worst first
null_counts = Destrieux_SST.isna().sum()
null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

tfsstabwdp_302    9
tfsstabwdp_316    9
tfsstabwdp_324    9
tfsstabwdp_396    9
tfsstabwdp_322    9
                 ..
tfsstabwdp_272    5
tfsstabwdp_198    5
tfsstabwdp_271    5
tfsstabwdp_197    5
tfsstabwdp_154    5
Length: 296, dtype: int64

In [66]:
# Keep only the rows in which no column is missing
Destrieux_SST = Destrieux_SST.dropna(axis=0, how="any")

# Re-count missing values: the displayed Series should now be empty
new_null_counts = Destrieux_SST.isna().sum()
new_null_counts.loc[new_null_counts.gt(0)]

Series([], dtype: int64)

### Step 4: Add contrasts information

Next, we want to extract the following contrasts:

- SST: correct Stop versus correct Go

- SST: incorrect Stop versus correct Go

The columns for the correct Stop versus correct Go contrast run from `tfsstabwdp_149` to `tfsstabwdp_296`.

The columns for the incorrect Stop versus correct Go contrast run from `tfsstabwdp_297` to `tfsstabwdp_444`.






In [67]:
# Inspect the SST table (identifier columns + tfsstabwdp_* columns) after the
# rows containing missing values were dropped.
Destrieux_SST

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,tfsstabwdp_366,tfsstabwdp_440,tfsstabwdp_367,tfsstabwdp_441,tfsstabwdp_368,tfsstabwdp_442,tfsstabwdp_369,tfsstabwdp_443,tfsstabwdp_370,tfsstabwdp_444
0,NDAR_INV003RTV85,baseline_year_1_arm_1,-0.149605,-0.174828,-0.013256,0.113432,-0.101847,-0.062040,-0.097332,-0.158990,...,-0.255056,-0.327929,-0.112484,-0.052497,-0.014438,-0.211108,0.067975,0.138177,0.029652,-0.102042
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,-0.172440,-0.223380,0.074744,0.060927,0.003457,-0.076963,0.175204,-0.164309,...,-1.888047,-1.416510,-0.398687,-0.636936,-0.466801,-0.354012,-0.325601,-0.256631,-0.057869,-0.414039
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,-0.010771,0.083618,0.055696,-0.003107,0.104688,0.016953,-0.231481,-0.268936,...,-0.170761,-0.178557,0.202939,0.037519,0.417104,0.668192,0.148352,0.095323,0.145479,0.154221
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,-0.053103,-0.004048,0.046030,0.086685,0.063254,0.155009,0.124548,0.067561,...,-0.246516,-0.331469,-0.137232,-0.181366,0.147939,-0.192708,-0.056710,-0.008076,-0.201146,-0.007578
4,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,-0.129385,-0.053957,0.148208,0.106023,0.050831,0.143123,-0.033714,0.112141,...,-0.021743,0.106453,0.074073,0.115022,0.168943,0.162317,0.142121,0.181069,0.188156,0.095772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19927,NDAR_INVZZZ2ALR6,baseline_year_1_arm_1,0.088177,0.158888,0.118404,0.108464,-0.016163,0.078596,-0.094376,0.073550,...,0.124193,0.200333,0.246803,0.383458,-0.024918,0.247412,0.278051,0.491906,0.323814,0.249895
19928,NDAR_INVZZZNB0XC,4_year_follow_up_y_arm_1,0.247949,0.208909,0.343190,0.300724,0.203630,0.267437,0.109268,0.122724,...,-0.628015,-0.330611,-0.131158,-0.122427,0.140082,0.020478,-0.055349,-0.096313,-0.147136,-0.059786
19929,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,-0.148778,-0.091126,0.064281,0.178064,-0.124523,-0.011796,-0.091755,0.034466,...,-0.500629,-0.987296,-0.590315,-0.442530,-0.227099,0.173324,-0.217835,-0.178233,-0.288869,-0.212007
19930,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,-0.087253,-0.075054,-0.059327,-0.064608,-0.023440,0.020002,0.007485,-0.136996,...,0.229794,0.328085,0.250301,0.197008,0.075787,-0.041175,0.100473,0.097594,0.320063,0.276801


In [68]:
### NOTE: WE KEEP ONLY SUBJECTS WHOSE SEX IS RECORDED AS MALE OR FEMALE AND WHO
### HAVE SST DATA AT BOTH BASELINE AND THE 2-YEAR FOLLOW-UP.
### THIS RESULTS IN 6254 UNIQUE RECORDS

BASELINE_EVENT = "baseline_year_1_arm_1"
FOLLOWUP_EVENT = "2_year_follow_up_y_arm_1"

# Baseline demographics, restricted to rows where demo_sex_v2 is coded 1 or 2
demo_at_baseline = abcd_demo["eventname"] == BASELINE_EVENT
demo_sex_coded = abcd_demo["demo_sex_v2"].isin([1, 2])
baseline_demo = abcd_demo.loc[demo_at_baseline & demo_sex_coded]
demo_subject_ids = baseline_demo["src_subject_id"].unique()

# SST rows for each of the two visits
baseline_df = Destrieux_SST.loc[Destrieux_SST["eventname"] == BASELINE_EVENT]
followup_df = Destrieux_SST.loc[Destrieux_SST["eventname"] == FOLLOWUP_EVENT]

# Subjects with coded sex and a baseline SST row ...
b_subject_ids = baseline_df.loc[
    baseline_df["src_subject_id"].isin(demo_subject_ids), "src_subject_id"
].unique()
# ... that also have a follow-up SST row
f_subject_ids = followup_df.loc[
    followup_df["src_subject_id"].isin(b_subject_ids), "src_subject_id"
].unique()


def _keep_selected_subjects(frame):
    """Return the rows of `frame` whose subject is in `f_subject_ids`."""
    return frame.loc[frame["src_subject_id"].isin(f_subject_ids)]


# Side tables: restrict to the selected subjects and drop `eventname`
# so that it does not collide with the SST frame's column when merging.
baseline_demo = _keep_selected_subjects(baseline_demo).drop(columns="eventname")
baseline_lt = _keep_selected_subjects(
    abcd_lt.loc[abcd_lt["eventname"] == BASELINE_EVENT]
).drop(columns="eventname")
followup_lt = _keep_selected_subjects(
    abcd_lt.loc[abcd_lt["eventname"] == FOLLOWUP_EVENT]
).drop(columns="eventname")

# Attach baseline demographics to both visits, and each visit's own
# longitudinal-tracking row.
baseline_df = (
    _keep_selected_subjects(baseline_df)
    .merge(baseline_demo, on=["src_subject_id"])
    .merge(baseline_lt, on=["src_subject_id"])
)
followup_df = (
    _keep_selected_subjects(followup_df)
    .merge(baseline_demo, on=["src_subject_id"])
    .merge(followup_lt, on=["src_subject_id"])
)

In [69]:
# Inspect the baseline SST rows after merging in demographics and
# longitudinal-tracking columns.
baseline_df

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,race_ethnicity,acs_raked_propensity_score,site_id_l,rel_family_id,rel_birth_id,school_id,district_id,interview_date,interview_age,visit_type
0,NDAR_INV00CY2MDM,baseline_year_1_arm_1,0.001988,-0.061784,0.025479,0.023070,0.084555,0.120872,-0.072277,0.108144,...,1.0,1433.061575,site20,5355.0,53551.0,,,08/22/2017,130.0,1.0
1,NDAR_INV00HEV6HB,baseline_year_1_arm_1,0.156079,0.085460,-0.171020,0.079661,-0.081916,-0.045603,-0.557458,-0.269451,...,2.0,650.876929,site12,2257.0,22571.0,,,07/08/2017,124.0,1.0
2,NDAR_INV00U4FTRU,baseline_year_1_arm_1,0.180306,0.126086,-0.031762,0.027738,-0.043941,-0.041289,0.161269,-0.086785,...,5.0,1778.916737,site04,2464.0,24641.0,,6815.0,05/19/2018,130.0,1.0
3,NDAR_INV00X2TBWJ,baseline_year_1_arm_1,0.122249,-0.082942,0.181219,0.122269,0.045643,0.117616,0.032588,-0.441620,...,3.0,907.279771,site14,3692.0,36921.0,,13889.0,05/12/2017,130.0,1.0
4,NDAR_INV010ZM3H9,baseline_year_1_arm_1,-0.247205,-0.735447,0.756608,-0.053257,-0.225100,-0.163556,-2.946458,-3.648983,...,1.0,550.058750,site12,1862.0,18621.0,,14043.0,05/05/2018,112.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,-0.056455,-0.036248,-0.052365,0.030967,0.010967,0.019874,-0.179490,-0.048514,...,1.0,375.709776,site06,9345.0,93451.0,,10342.0,01/11/2018,124.0,1.0
6250,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,0.117272,0.081566,0.233194,0.018394,-0.018188,0.082699,-0.595456,-0.438072,...,2.0,802.807890,site11,8433.0,84332.0,91674.0,9674.0,07/11/2017,108.0,1.0
6251,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,0.025210,0.164892,-0.030853,-0.081547,-0.018977,0.031020,0.398041,0.132955,...,1.0,389.141755,site06,9347.0,93471.0,,,08/26/2017,110.0,1.0
6252,NDAR_INVZZPKBDAC,baseline_year_1_arm_1,-0.021217,-0.039354,0.162215,0.197989,0.173467,0.111320,0.173190,0.130388,...,1.0,568.508011,site12,2445.0,24451.0,,,01/20/2018,113.0,1.0


In [70]:
# Inspect the 2-year follow-up SST rows after merging in baseline demographics
# and follow-up longitudinal-tracking columns.
followup_df

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_154,tfsstabwdp_228,tfsstabwdp_155,tfsstabwdp_229,tfsstabwdp_156,tfsstabwdp_230,tfsstabwdp_149,tfsstabwdp_223,...,race_ethnicity,acs_raked_propensity_score,site_id_l,rel_family_id,rel_birth_id,school_id,district_id,interview_date,interview_age,visit_type
0,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,-0.129385,-0.053957,0.148208,0.106023,0.050831,0.143123,-0.033714,0.112141,...,1.0,1433.061575,site20,,,,,06/15/2019,152.0,1.0
1,NDAR_INV00HEV6HB,2_year_follow_up_y_arm_1,-0.424897,-0.308583,-0.177224,-0.155726,-0.273635,-0.255610,-0.526887,-0.206159,...,2.0,650.876929,site12,,,,,08/05/2019,149.0,1.0
2,NDAR_INV00U4FTRU,2_year_follow_up_y_arm_1,0.079677,0.133159,0.087999,-0.002313,0.186444,0.168833,0.138955,0.001202,...,5.0,1778.916737,site04,,,,6815.0,09/02/2020,157.0,3.0
3,NDAR_INV00X2TBWJ,2_year_follow_up_y_arm_1,0.181363,0.181648,0.196699,0.133660,0.105910,0.042437,0.562238,-0.116366,...,3.0,907.279771,site14,,,5949.0,13889.0,05/10/2019,154.0,1.0
4,NDAR_INV010ZM3H9,2_year_follow_up_y_arm_1,-0.051536,-0.071374,-0.017478,-0.015395,0.049456,0.073359,0.098185,-0.160475,...,1.0,550.058750,site12,,,,14043.0,03/08/2020,134.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,NDAR_INVZZ6ZJ2KY,2_year_follow_up_y_arm_1,-0.000053,0.068060,0.057578,0.131181,0.075422,0.029315,0.090219,0.022789,...,1.0,375.709776,site06,,,44827.0,10342.0,02/19/2020,150.0,1.0
6250,NDAR_INVZZ81LEEV,2_year_follow_up_y_arm_1,0.023479,-0.069686,0.074941,0.051139,0.059338,0.108405,-0.188261,0.114417,...,2.0,802.807890,site11,,,91674.0,9674.0,04/29/2019,130.0,1.0
6251,NDAR_INVZZLZCKAY,2_year_follow_up_y_arm_1,-0.656909,-1.034874,-0.482399,-0.063011,0.232971,0.159990,-11.399078,-13.210508,...,1.0,389.141755,site06,,,,,06/14/2019,131.0,1.0
6252,NDAR_INVZZPKBDAC,2_year_follow_up_y_arm_1,-0.023196,0.048996,0.070226,0.067451,0.008806,0.068069,-0.053043,-0.315634,...,1.0,568.508011,site12,,,,,01/11/2020,136.0,1.0


In [71]:
# Count the positions at which the follow-up and baseline frames list the
# same subject; it should equal the number of subjects if both are aligned.
followup_subject_order = followup_df["src_subject_id"].unique()
baseline_subject_order = baseline_df["src_subject_id"].unique()
sum(followup_subject_order == baseline_subject_order)

6254

In [72]:
# Every demographic / longitudinal-tracking column except the merge keys
EXTRA_COLUMNS = list(abcd_demo.columns) + list(abcd_lt.columns)
EXTRA_COLUMNS = [c for c in EXTRA_COLUMNS if c not in ["src_subject_id", "eventname"]]

# Column names of the two contrasts (see the ranges listed in Step 4)
c_stop_go_features = ["tfsstabwdp_" + str(i) for i in range(149, 297)]
i_stop_go_features = ["tfsstabwdp_" + str(i) for i in range(297, 444 + 1)]


def select_contrast(df, features):
    """Return an independent frame holding one contrast of one visit.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit-level frame containing the identifier columns, the contrast
        columns and every column in ``EXTRA_COLUMNS``.
    features : list of str
        Names of the contrast columns to keep.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected columns with ``demo_sex_v2`` recoded as
        0 = Male, 1 = Female.
    """
    contrast_df = df[["src_subject_id", "eventname"] + features + EXTRA_COLUMNS].copy()
    # Assign the recoded column back instead of calling
    # `contrast_df["demo_sex_v2"].replace(..., inplace=True)`: an in-place
    # method on a column selection is a chained update that newer pandas
    # (Copy-on-Write) applies to a temporary object, leaving the frame unchanged.
    contrast_df["demo_sex_v2"] = contrast_df["demo_sex_v2"].replace({1.0: 0, 2.0: 1})
    return contrast_df


# Separate into two dataframes for the different contrasts
baseline_c_stop_go_df = select_contrast(baseline_df, c_stop_go_features)
baseline_i_stop_go_df = select_contrast(baseline_df, i_stop_go_features)

followup_c_stop_go_df = select_contrast(followup_df, c_stop_go_features)
followup_i_stop_go_df = select_contrast(followup_df, i_stop_go_features)

print("Divided dataframe into different contrasts...")

Divided dataframe into different contrasts...


In [73]:
# Row labels that were found to contain missing values.
# NOTE(review): these labels are hard-coded and depend on the row order produced
# by the merges above; re-running this cell raises KeyError because the labels
# are already gone. Confirm which column(s) are missing and consider deriving
# the labels from the data instead.
ROWS_WITH_MISSING_VALUES = [2944, 5046]

# The same labels are removed from all four frames so they stay row-aligned.
baseline_c_stop_go_df = baseline_c_stop_go_df.drop(index=ROWS_WITH_MISSING_VALUES)
baseline_i_stop_go_df = baseline_i_stop_go_df.drop(index=ROWS_WITH_MISSING_VALUES)
followup_c_stop_go_df = followup_c_stop_go_df.drop(index=ROWS_WITH_MISSING_VALUES)
followup_i_stop_go_df = followup_i_stop_go_df.drop(index=ROWS_WITH_MISSING_VALUES)

In [46]:
# Subjects retained for the SST analysis
subject_ids = baseline_c_stop_go_df["src_subject_id"]


def _rows_at_event(frame, event):
    """Return the rows of `frame` recorded at the given `eventname`."""
    return frame[frame["eventname"] == event]


#### MRI
mri = pd.read_csv(data / "mri_y_smr_thk_dsk.csv")
mri = mri[mri["src_subject_id"].isin(subject_ids)]

mri_baseline = _rows_at_event(mri, "baseline_year_1_arm_1")
mri_followup = _rows_at_event(mri, "2_year_follow_up_y_arm_1")

#### MRI VOL
mri_vol = pd.read_csv(data / "mri_y_smr_vol_aseg.csv")
mri_vol = mri_vol[mri_vol["src_subject_id"].isin(subject_ids)]

mri_vol_baseline = _rows_at_event(mri_vol, "baseline_year_1_arm_1")
mri_vol_followup = _rows_at_event(mri_vol, "2_year_follow_up_y_arm_1")

In [47]:
# Sanity check: all per-visit frames should have the same number of rows.
print(baseline_c_stop_go_df.shape)
print(baseline_i_stop_go_df.shape)
print(followup_c_stop_go_df.shape)
print(followup_i_stop_go_df.shape)
# `mri_area_baseline` is not defined anywhere in this notebook (it only worked
# from leftover kernel state); the baseline MRI frame built above is `mri_baseline`.
print(mri_baseline.shape)

(6252, 432)
(6252, 432)
(6252, 432)
(6252, 432)
(6252, 73)


In [75]:
### MERGE INTO C_STOP, I_STOP, MRI, MRI_VOL
# Stack the baseline and follow-up rows of each table.
c_stop_df = pd.concat([baseline_c_stop_go_df, followup_c_stop_go_df])
i_stop_df = pd.concat([baseline_i_stop_go_df, followup_i_stop_go_df])
mri_df = pd.concat([mri_baseline, mri_followup])
mri_vol_df = pd.concat([mri_vol_baseline, mri_vol_followup])

# Attach `smri_vol_scs_suprateialv` to the MRI table by (subject, event) key.
# A plain column assignment would align on row labels, which come from two
# different CSV files and only match if both files list rows in the same order.
# `validate` makes the merge fail loudly if a key is duplicated on either side
# instead of silently multiplying rows.
visit_keys = ["src_subject_id", "eventname"]
mri_df = mri_df.merge(
    mri_vol_df[visit_keys + ["smri_vol_scs_suprateialv"]],
    on=visit_keys,
    how="left",
    validate="one_to_one",
)

print(c_stop_df.shape)
print(i_stop_df.shape)
print(mri_df.shape)
print(mri_vol_df.shape)

(12504, 432)
(12504, 432)
(12504, 74)
(12504, 48)


In [76]:
### CREATE A "NO-TOUCH" 10% DATASET THAT IS NOT LOOKED AT UNTIL THE VERY END
interim = Path(os.path.join("../", "data", "interim"))
# Make sure the output directory exists before writing to it.
interim.mkdir(parents=True, exist_ok=True)

dataframes = [c_stop_df, i_stop_df, mri_df, mri_vol_df]
filenames = ["c_stop_df.csv", "i_stop_df.csv", "mri_df.csv", "mri_vol_df.csv"]

# Fraction of SUBJECTS (not rows) assigned to the development set.
DEV_FRACTION = 0.9

# Split by subject rather than by row position: every subject contributes one
# row per visit, so a positional cut over subject-sorted rows can fall between
# the two visits of one subject and leak that subject into both sets.
# The subject list is built once from all tables so that every file uses
# exactly the same dev/test assignment.
all_subjects = np.sort(
    pd.concat([df["src_subject_id"] for df in dataframes]).unique()
)
n_dev_subjects = int(DEV_FRACTION * len(all_subjects))
dev_subjects = all_subjects[:n_dev_subjects]

for filename, df in zip(filenames, dataframes):
    # Deterministic row order: by subject, then by visit.
    df = df.sort_values(["src_subject_id", "eventname"])

    in_dev = df["src_subject_id"].isin(dev_subjects)
    dev_df = df.loc[in_dev]
    test_df = df.loc[~in_dev]

    # The held-out set must not share any subject with the development set.
    assert set(dev_df["src_subject_id"]).isdisjoint(test_df["src_subject_id"])
    assert len(dev_df) + len(test_df) == len(df)

    # Save to files
    dev_filename = interim/f"dev_{filename}"
    dev_df.to_csv(dev_filename, index=False)
    print(f"Saved dev dataframe to {dev_filename}...")

    test_filename = interim/f"test_{filename}"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test dataframe to {test_filename}...\n")

Saved dev dataframe to ../data/interim/dev_c_stop_df.csv...
Saved test dataframe to ../data/interim/test_c_stop_df.csv...

Saved dev dataframe to ../data/interim/dev_i_stop_df.csv...
Saved test dataframe to ../data/interim/test_i_stop_df.csv...

Saved dev dataframe to ../data/interim/dev_mri_df.csv...
Saved test dataframe to ../data/interim/test_mri_df.csv...

Saved dev dataframe to ../data/interim/dev_mri_vol_df.csv...
Saved test dataframe to ../data/interim/test_mri_vol_df.csv...

