In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from abcd.local.paths import core_path, output_path
from abcd.data.read_data import get_subjects_events, add_event_vars

In [3]:
# Fetch subjects and events.
# Both results are used by later cells: `events_df` is filtered by its
# "eventname" column and `subjects_df` is only counted.
subjects_df, events_df = get_subjects_events()

In [4]:
# Filter out events to only leave the baselines.
# `subjects_df` and `events_df` are the frames returned by the earlier
# get_subjects_events() call; they are reused here instead of being fetched a
# second time, so the data is loaded only once per run.
BASELINE_EVENT = "baseline_year_1_arm_1"
baseline_events_df = events_df[events_df["eventname"] == BASELINE_EVENT]
print("There are {} baseline events for {} subjects!".format(len(baseline_events_df), len(subjects_df)))

There are 9530 baseline events for 9879 subjects!


In [5]:
# Add mental health outcomes
# Keys are the CBCL column names requested from the mental-health file; values
# are the short labels shown in the box plots further down.
# NOTE(review): "cbcl_scr_syn_withdep_t" is labelled "Depression" here; the
# column name suggests withdrawn/depressed - confirm the label is intended.
MH_SCORES = {
    "cbcl_scr_syn_anxdep_t": "Anxious/Dep.",
    "cbcl_scr_syn_withdep_t": "Depression",
    "cbcl_scr_syn_somatic_t": "Somatic",
    "cbcl_scr_syn_social_t": "Social",
    "cbcl_scr_syn_attention_t": "Attention",
    "cbcl_scr_syn_rulebreak_t": "Rule-breaking",
    "cbcl_scr_syn_aggressive_t": "Aggressive",
    "cbcl_scr_syn_internal_t": "Internalizing",
    "cbcl_scr_syn_external_t": "Externalizing",
}

mental_health_file = os.path.join(core_path, "mental-health", "mh_p_cbcl.csv")
mh_score_columns = list(MH_SCORES)
# Presumably attaches the requested columns to the baseline events - see
# add_event_vars for the exact join semantics.
mh_events_df = add_event_vars(baseline_events_df, mental_health_file, vars=mh_score_columns)
print("From those, there are {} events with mental health scores".format(len(mh_events_df)))

From those, there are 9528 events with mental health scores


  df = pd.read_csv(path, sep=sep)


In [6]:
# Remove missing values.
# To see which columns contain gaps before dropping them, run:
#     mh_events_df.isna().sum().loc[lambda counts: counts > 0]
# dropna() with its default arguments discards every event that has a missing
# value in ANY column, not only in the mental health score columns.
mh_events_df = mh_events_df.dropna()
print("After removing missing values, the df has {} events".format(len(mh_events_df)))

After removing missing values, the df has 9495 events


In [7]:
# Plot distribution of variables
import pygal
from pygal import Config
from abcd.plotting.pygal.rendering import display_html

# One box per entry of MH_SCORES, built from the scores as loaded (the
# normalized columns are only added in a later cell).
plot = pygal.Box()
plot.title = 'Child behavior checklist CBCL syndrome scales'
for score_column, score_label in MH_SCORES.items():
    score_values = list(mh_events_df[score_column])
    plot.add(score_label, score_values)
display_html(plot)


In [18]:
# Normalize in the (0, 1) range
from abcd.data.var_tailoring.normalization import normalize_var

# Every score column is passed to normalize_var together with a target name
# made of the original name plus an "_n" suffix.
# NOTE(review): this runs on the full frame before the site splits are applied;
# if normalize_var derives its scaling from the data, test events contribute to
# it - confirm this is acceptable.
for score_column in MH_SCORES:
    normalized_column = score_column + "_n"
    mh_events_df = normalize_var(mh_events_df, score_column, normalized_column)

# Same box plot as before, now reading the "_n" columns.
plot = pygal.Box()
plot.title = 'Child behavior checklist CBCL syndrome scales (after normalization)'
for score_column, score_label in MH_SCORES.items():
    normalized_values = list(mh_events_df[score_column + "_n"])
    plot.add(score_label, normalized_values)
display_html(plot)

In [19]:
from abcd.data.define_splits import save_restore_sex_fmri_splits
from abcd.data.NETWORKS import CONNECTIONS
from sklearn.linear_model import LinearRegression, SGDRegressor
from abcd.analysis.methods.sklearn_fitting import calculate_regession_results, plot_results_one_site

# Inputs shared by the regression cells below.
# The feature columns are the names exported by abcd.data.NETWORKS.CONNECTIONS.
feature_columns = CONNECTIONS
# NOTE(review): k=5 is presumably the number of folds/splits - confirm against
# save_restore_sex_fmri_splits.
site_splits = save_restore_sex_fmri_splits(k=5)

In [23]:
# Linear Regression results
# Target: the "_n" column created by the normalization cell for the score
# that MH_SCORES labels "Internalizing".
y_column = "cbcl_scr_syn_internal_t_n"
model = LinearRegression()
results = calculate_regession_results(
    model,
    mh_events_df,
    site_splits,
    feature_columns,
    y_column,
)
# Leave the summary as the last expression so the notebook renders it.
results

100%|██████████| 21/21 [00:02<00:00,  7.46it/s]


Unnamed: 0,split,MAE mean,MAE std,Max. error mean,Max. error std
0,Train,0.153322,0.000379,0.697271,0.008307
1,ID Test,0.154966,0.000912,0.667171,0.008141
2,OOD Test,0.154817,0.006641,0.589719,0.058785


In [25]:
# Plot the results for a single site, reusing the model, splits, features and
# target defined in the regression cells above.
internalizing_label = MH_SCORES["cbcl_scr_syn_internal_t"]
single_site_id = "site01"
plot_results_one_site(
    model,
    mh_events_df,
    site_splits,
    feature_columns,
    y_column,
    y_column_name=internalizing_label,
    site_id=single_site_id,
)

In [None]:
#