# Data Processing

Remove confounders for functional and structural measurements on the ABCD study.

### Imports

In [83]:
from constants import *

import pandas as pd
import numpy as np
import os
from pathlib import Path
import statsmodels.api as sm

ImportError: cannot import name 'pinv2' from 'scipy.linalg' (/opt/homebrew/lib/python3.9/site-packages/scipy/linalg/__init__.py)

In [68]:
raw_path = Path(os.path.join("../", "data", "raw"))
interim_path = Path(os.path.join("../", "data", "interim"))
processed_path = Path(os.path.join("../", "data", "processed"))

### Load Dataframes

In [70]:
# Load dataframes
c_stop = pd.read_csv(interim_path/"dev_c_stop_df.csv")
i_stop = pd.read_csv(interim_path/"dev_i_stop_df.csv")
mri = pd.read_csv(interim_path/"dev_mri_df.csv")
mri_vol = pd.read_csv(interim_path/"dev_mri_vol_df.csv")

print(c_stop.shape)
print(i_stop.shape)
print(mri.shape)
print(mri_vol.shape)

(11253, 432)
(11253, 432)
(11253, 73)
(11253, 48)


## Remove Confounders

In [71]:
income_features = [("demo_comb_income_v2_" + str(i) + ".0").strip() for i in range(1, 11) if i != 9]
demo_features = income_features + ["demo_race_black", "demo_race_other"]

def add_demo(input_df):
    df = input_df.copy()
    df["demo_comb_income_v2"] = c_stop["demo_comb_income_v2"].replace({999: None, 777: None})
    df = pd.get_dummies(df, columns=['demo_comb_income_v2'])
    df = df.drop("demo_comb_income_v2_9.0", axis=1) # most common

    df["demo_race_black"] = c_stop["demo_race_a_p___11"] == 1
    df["demo_race_other"] = (c_stop["demo_race_a_p___10"] != 1) & (c_stop["demo_race_a_p___11"] != 1)
    
    for f in demo_features:
        df[f] = df[f].replace({False: 0, True: 1})
    
    return df

i_stop = add_demo(i_stop)
mri = add_demo(mri)
mri_vol = add_demo(mri_vol)
c_stop = add_demo(c_stop)

# add age to mri
mri["interview_age"] = c_stop["interview_age"]
mri_vol["interview_age"] = c_stop["interview_age"]

In [81]:
### RESIDUALIZE
confounders = ["interview_age"] + demo_features

def residualize(input_df, features):
    df = input_df.copy()

    mean_age = np.mean(df["interview_age"])
    df[confounders] = df[confounders].fillna(0)
    
    for feature in features:
        endog = df[feature]
        df["Intercept"] = 1
        
        exog = df[["Intercept"] + confounders]
        md = sm.api.GLM(endog, exog, family=sm.families.Gaussian())
    
        md = md.fit()
        feature_name = feature + '_prime'

        df[feature_name] = df[feature] - (md.params[1] * (df["interview_age"] - mean_age))
        for i in range(1, len(confounders)):
            df[feature_name] = df[feature_name] - (md.params[i + 1] * df[confounders[i]])

    return df

In [82]:
i_stop = residualize(i_stop, i_stop_go_features)

AttributeError: module 'statsmodels' has no attribute 'api'