# Data Processing

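Remove the effects of confounders (age, household income, race and, for MRI features, overall brain volume) from the functional and structural measurements of the ABCD study by residualizing each feature against them.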

### Imports

In [2]:
from constants import *

import pandas as pd
import numpy as np
import os
from pathlib import Path
import statsmodels.api as sm

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Data folders, relative to the notebook's working directory: ../data/<stage>
raw_path, interim_path, processed_path = (
    Path("..", "data", stage) for stage in ("raw", "interim", "processed")
)

### Load Dataframes

In [11]:
# Read the interim tables for the development split ...
c_stop, i_stop, mri = (
    pd.read_csv(interim_path / filename)
    for filename in ("dev_c_stop_df.csv", "dev_i_stop_df.csv", "dev_mri_df.csv")
)

# ... and the matching tables for the held-out test split.
test_c_stop, test_i_stop, test_mri = (
    pd.read_csv(interim_path / filename)
    for filename in ("test_c_stop_df.csv", "test_i_stop_df.csv", "test_mri_df.csv")
)

# Report (rows, columns) of every table, one per line.
print("# Dev")
print(*(table.shape for table in (c_stop, i_stop, mri)), sep="\n")

print("# Test")
print(*(table.shape for table in (test_c_stop, test_i_stop, test_mri)), sep="\n")

# Dev
(11253, 432)
(11253, 432)
(11253, 74)
# Test
(1251, 432)
(1251, 432)
(1251, 74)


## Remove Confounders

In [12]:
# One-hot income brackets used as confounders. Bracket 9 (the most common one) is
# the reference level and is therefore left out.
income_features = [f"demo_comb_income_v2_{i}.0" for i in range(1, 11) if i != 9]
demo_features = income_features + ["demo_race_black", "demo_race_other"]


def add_demo(input_df, demo_source=None):
    """Return a copy of ``input_df`` with the demographic confounder columns added.

    Adds one 0/1 indicator per income bracket in ``income_features`` plus the
    ``demo_race_black`` and ``demo_race_other`` indicators, i.e. every column
    listed in ``demo_features``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table to extend. It is not modified.
    demo_source : pandas.DataFrame, optional
        Table holding the raw ``demo_comb_income_v2``, ``demo_race_a_p___10`` and
        ``demo_race_a_p___11`` columns for the SAME rows as ``input_df``. Defaults
        to the global development table ``c_stop`` for backward compatibility;
        always pass the matching table for any other split, otherwise the rows
        would silently receive another split's demographics.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the raw income column replaced by its dummies
        and the race indicators appended.

    Raises
    ------
    ValueError
        If ``demo_source`` and ``input_df`` do not share the same row index, in
        which case index-aligned assignment would mix up or drop subjects.
    """
    source = c_stop if demo_source is None else demo_source
    if not source.index.equals(input_df.index):
        raise ValueError("demo_source and input_df must share the same row index")

    df = input_df.copy()

    # 777 / 999 are treated as missing (presumably the questionnaire's
    # "refused" / "don't know" codes -- TODO confirm). Casting to float keeps the
    # dummy column names in the "<bracket>.0" form that income_features expects.
    income = source["demo_comb_income_v2"].replace({999: np.nan, 777: np.nan})
    df["demo_comb_income_v2"] = income.astype(float)
    df = pd.get_dummies(df, columns=["demo_comb_income_v2"])

    # Drop the reference bracket; tolerate its absence in a small split.
    df = df.drop(columns="demo_comb_income_v2_9.0", errors="ignore")

    # A bracket that never occurs in this split still needs its (all-zero) column
    # so that every split ends up with the same set of confounders.
    for column in income_features:
        if column not in df.columns:
            df[column] = 0

    # "other" = neither ___10 nor ___11 is ticked, so ___10 acts as the reference
    # group (presumably "white" -- TODO confirm against the data dictionary).
    df["demo_race_black"] = source["demo_race_a_p___11"] == 1
    df["demo_race_other"] = (source["demo_race_a_p___10"] != 1) & (source["demo_race_a_p___11"] != 1)

    # Store every indicator as a plain 0/1 integer.
    df[demo_features] = df[demo_features].astype(int)

    return df


# --- Development split ---
# c_stop is transformed last: add_demo replaces its raw income column, which the
# other tables still need to read.
i_stop = add_demo(i_stop, demo_source=c_stop)
mri = add_demo(mri, demo_source=c_stop)

# add age to mri
mri["interview_age"] = c_stop["interview_age"]

# --- Test split ---
# The demographics must come from the test table itself, not from the dev table.
test_i_stop = add_demo(test_i_stop, demo_source=test_c_stop)
test_mri = add_demo(test_mri, demo_source=test_c_stop)

# add age to mri
test_mri["interview_age"] = test_c_stop["interview_age"]

test_c_stop = add_demo(test_c_stop, demo_source=test_c_stop)

c_stop = add_demo(c_stop, demo_source=c_stop)

In [13]:
# (rows, columns) of each development table after adding the demographic columns
print(*(table.shape for table in (c_stop, i_stop, mri)), sep="\n")

(11253, 442)
(11253, 442)
(11253, 86)


In [14]:
### RESIDUALIZE
# Variables regressed out of every feature: age first, then the demographic indicators.
confounders = ["interview_age", *demo_features]

def residualize(input_df, features, mri=False):
    """Regress the confounders out of each feature and return the adjusted table.

    For every column in ``features`` a Gaussian GLM (intercept + confounders) is
    fitted, the fitted confounder effects are subtracted from the column, and the
    result is stored as ``<feature>_prime`` in place of the original column.
    Age (and, in MRI mode, brain volume) is mean-centered before its effect is
    subtracted; the demographic indicators are used as they are.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table containing ``features``, every column in the global ``confounders``
        list and, when ``mri`` is true, ``smri_vol_scs_suprateialv``. Not modified.
    features : iterable of str
        Columns to residualize.
    mri : bool, default False
        When true, ``smri_vol_scs_suprateialv`` is added as a further regressor.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` where each successfully fitted feature is replaced by
        its ``_prime`` column (appended at the end). Features whose model could
        not be fitted are reported and left untouched.
    """
    print("Residualizing...")
    df = input_df.copy()
    volume_col = "smri_vol_scs_suprateialv"

    # The mean is taken before missing confounders are zero-filled, so the fill
    # value does not distort it.
    mean_age = df["interview_age"].mean()
    # NOTE(review): a missing age becomes 0, far from any real age -- confirm that
    # this is intended rather than dropping or imputing those rows.
    df[confounders] = df[confounders].fillna(0)

    demographic = [name for name in confounders if name != "interview_age"]
    regressors = ["interview_age"] + ([volume_col] if mri else []) + demographic

    # The design matrix is identical for every feature, so build it once.
    exog = df[regressors].copy()
    exog.insert(0, "Intercept", 1)

    # Values each fitted coefficient is multiplied with when its effect is
    # removed. Keyed by regressor name so a coefficient can never be paired with
    # the wrong column, whatever the layout of the design matrix.
    effects = {"interview_age": df["interview_age"] - mean_age}
    if mri:
        effects[volume_col] = df[volume_col] - df[volume_col].mean()
    for name in demographic:
        effects[name] = df[name]

    residuals = {}
    for feature in features:
        model = sm.GLM(df[feature], exog, family=sm.families.Gaussian())

        try:
            params = model.fit().params
        except Exception as exc:
            # Best effort: keep the original column but say what was skipped and why.
            print(f"Continuing. Could not residualize {feature!r}: {exc}")
            continue

        adjusted = df[feature]
        for name, values in effects.items():
            adjusted = adjusted - params[name] * values
        residuals[feature] = adjusted

    if residuals:
        primes = pd.DataFrame(residuals, index=df.index).add_suffix("_prime")
        # Swap the originals for their residualized versions in a single step
        # instead of copying the whole table once per feature. Pre-existing
        # *_prime columns are dropped too, so no duplicate labels can appear.
        stale = [column for column in primes.columns if column in df.columns]
        df = pd.concat([df.drop(columns=list(residuals) + stale), primes], axis=1)

    return df

In [16]:
# Structural MRI features to residualize: every "smri" column except the overall
# volume column, which the MRI call to residualize uses as a regressor.
mri_columns = [
    column
    for column in mri.columns
    if column.startswith("smri") and column != "smri_vol_scs_suprateialv"
]

In [17]:
# Development split: each table is residualized with a model fitted on that table.
c_stop_res = residualize(c_stop, features=c_stop_go_features)
i_stop_res = residualize(i_stop, features=i_stop_go_features)
mri_res = residualize(mri, features=mri_columns, mri=True)

# Test split: same procedure, fitted on the test tables themselves.
test_c_stop_res = residualize(test_c_stop, features=c_stop_go_features)
test_i_stop_res = residualize(test_i_stop, features=i_stop_go_features)
test_mri_res = residualize(test_mri, features=mri_columns, mri=True)

Residualizing...
Residualizing...
Residualizing...
Residualizing...
Residualizing...
Residualizing...


In [29]:
# (rows, columns) of each residualized development table
print(*(table.shape for table in (c_stop_res, i_stop_res, mri_res)), sep="\n")

(11253, 442)
(11253, 442)
(11253, 86)


In [18]:
# Write every residualized table to the processed-data folder.
for _filename, _table in (
    ("dev_c_stop_res.csv", c_stop_res),
    ("dev_i_stop_res.csv", i_stop_res),
    ("dev_mri_res.csv", mri_res),
    ("test_c_stop_res.csv", test_c_stop_res),
    ("test_i_stop_res.csv", test_i_stop_res),
    ("test_mri_res.csv", test_mri_res),
):
    _table.to_csv(processed_path / _filename)

print("Saved to processed.")

Saved to processed.
