In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from abcd.local.paths import core_path, output_path
from abcd.data.read_data import get_subjects_events, add_event_vars

In [3]:
# Fetch subjects and events
subjects_df, events_df = get_subjects_events()

In [4]:
# Filter out events to only leave the baselines
subjects_df, events_df = get_subjects_events()
baseline_events_df = events_df[events_df["eventname"] == "baseline_year_1_arm_1"] 
print("There are {} baseline events for {} subjects!".format(len(baseline_events_df), len(subjects_df)))

There are 9530 baseline events for 9879 subjects!


In [5]:
# Add mental health outcomes
CBCL_SCORES = {"cbcl_scr_syn_anxdep_t": "Anxious/Dep.",
             "cbcl_scr_syn_withdep_t": "Depression",
             "cbcl_scr_syn_somatic_t": "Somatic",
             "cbcl_scr_syn_social_t": "Social",
             "cbcl_scr_syn_attention_t": "Attention",
             "cbcl_scr_syn_rulebreak_t": "Rule-breaking",
             "cbcl_scr_syn_aggressive_t": "Aggressive",
             "cbcl_scr_syn_internal_t": "Internalizing",
             "cbcl_scr_syn_external_t": "Externalizing"}    

mental_health_file = os.path.join(core_path, "mental-health", "mh_p_cbcl.csv")
mh_events_df = add_event_vars(baseline_events_df, mental_health_file, vars=list(CBCL_SCORES.keys()))
print("From those, there are {} events with mental health scores".format(len(mh_events_df)))

  df = pd.read_csv(path, sep=sep)


From those, there are 9528 events with mental health scores


In [6]:
# Count missing values
"""
for column in mh_events_df.columns:
    nr_missing_values = mh_events_df[[column]].isna().sum().sum()
    if nr_missing_values > 0:
        print("Column {} has {} missing values".format(column, nr_missing_values))
"""
# Remove missing values
mh_events_df = mh_events_df.dropna()
print("After removing missing values, the df has {} events".format(len(mh_events_df)))

After removing missing values, the df has 9495 events


In [7]:
# Plot distribution of variables
import pygal
from pygal import Config
from abcd.plotting.pygal.rendering import display_html

plot = pygal.Box()
plot.title = 'Child behavior checklist CBCL syndrome scales'
for var_id, var_name in CBCL_SCORES.items():
    plot.add(var_name, list(mh_events_df[var_id]))
display_html(plot)


In [8]:
# Normalize in the (0, 1) range
from abcd.data.var_tailoring.normalization import normalize_var

for var_id in CBCL_SCORES.keys():
    mh_events_df = normalize_var(mh_events_df, var_id, var_id)
    
plot = pygal.Box()
plot.title = 'Child behavior checklist CBCL syndrome scales (after normalization)'    
for var_id, var_name in CBCL_SCORES.items():
    plot.add(var_name, list(mh_events_df[var_id]))
display_html(plot)

In [9]:
from abcd.data.define_splits import save_restore_sex_fmri_splits
from sklearn.linear_model import LinearRegression
from abcd.data.NETWORKS import CONNECTIONS
from abcd.data.define_splits import SITES
from abcd.analysis.methods.sklearn_fitting import set_model_preds, model_results_df, plot_results_one_site

site_splits = save_restore_sex_fmri_splits(k=5)
feature_columns = CONNECTIONS
sites_events_train, sites_events_id_test, sites_events_ood_test = dict(), dict(), dict()

In [10]:
# Linear Regression results: Internalizing
y_column = "cbcl_scr_syn_internal_t"
model = LinearRegression()
for site_id in SITES:
    sites_events_train[site_id], sites_events_id_test[site_id], sites_events_ood_test[site_id] = set_model_preds(model, mh_events_df, site_splits, feature_columns, y_column, site_id)
results = model_results_df(sites_events_train, sites_events_id_test, sites_events_ood_test, y_column)
results  

100%|█████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 366.70it/s]


Unnamed: 0,split,MAE mean,MAE std,Max. error mean,Max. error std
0,Train,0.153322,0.000379,0.697271,0.008307
1,ID Test,0.154966,0.000912,0.667171,0.008141
2,OOD Test,0.154817,0.006641,0.589719,0.058785


In [11]:
plot_results_one_site(sites_events_train, sites_events_id_test, sites_events_ood_test, y_column, y_column_name=CBCL_SCORES[y_column], site_id="site01")

In [16]:
# Now train a fully-connected network
from sklearn.neural_network import MLPRegressor
sites_events_train, sites_events_id_test, sites_events_ood_test = dict(), dict(), dict()
y_column = "cbcl_scr_syn_internal_t"
model = MLPRegressor(solver='adam', learning_rate_init=0.001, alpha=1e-5, activation='relu', hidden_layer_sizes=(50, 10), max_iter=1000, random_state=1, verbose=True)
for site_id in SITES:
    sites_events_train[site_id], sites_events_id_test[site_id], sites_events_ood_test[site_id] = set_model_preds(model, mh_events_df, site_splits, feature_columns, y_column, site_id)
results = model_results_df(sites_events_train, sites_events_id_test, sites_events_ood_test, y_column)
results

Iteration 1, loss = 0.01950107
Iteration 2, loss = 0.01856586
Iteration 3, loss = 0.01832129
Iteration 4, loss = 0.01820127
Iteration 5, loss = 0.01797399
Iteration 6, loss = 0.01788521
Iteration 7, loss = 0.01782256
Iteration 8, loss = 0.01771686
Iteration 9, loss = 0.01771947
Iteration 10, loss = 0.01774086
Iteration 11, loss = 0.01751042
Iteration 12, loss = 0.01736386
Iteration 13, loss = 0.01724060
Iteration 14, loss = 0.01714968
Iteration 15, loss = 0.01707572
Iteration 16, loss = 0.01699607
Iteration 17, loss = 0.01695212
Iteration 18, loss = 0.01687953
Iteration 19, loss = 0.01672484
Iteration 20, loss = 0.01664353
Iteration 21, loss = 0.01652994
Iteration 22, loss = 0.01657122
Iteration 23, loss = 0.01644899
Iteration 24, loss = 0.01627622
Iteration 25, loss = 0.01633504
Iteration 26, loss = 0.01629409
Iteration 27, loss = 0.01614381
Iteration 28, loss = 0.01608089
Iteration 29, loss = 0.01599382
Iteration 30, loss = 0.01581189
Iteration 31, loss = 0.01585987
Iteration 32, los

Iteration 4, loss = 0.01795612
Iteration 5, loss = 0.01798040
Iteration 6, loss = 0.01775994
Iteration 7, loss = 0.01772086
Iteration 8, loss = 0.01758350
Iteration 9, loss = 0.01758292
Iteration 10, loss = 0.01738742
Iteration 11, loss = 0.01735304
Iteration 12, loss = 0.01731188
Iteration 13, loss = 0.01751771
Iteration 14, loss = 0.01721701
Iteration 15, loss = 0.01712723
Iteration 16, loss = 0.01692806
Iteration 17, loss = 0.01679152
Iteration 18, loss = 0.01678169
Iteration 19, loss = 0.01675072
Iteration 20, loss = 0.01675753
Iteration 21, loss = 0.01671300
Iteration 22, loss = 0.01669713
Iteration 23, loss = 0.01649869
Iteration 24, loss = 0.01633898
Iteration 25, loss = 0.01662594
Iteration 26, loss = 0.01633065
Iteration 27, loss = 0.01632038
Iteration 28, loss = 0.01613641
Iteration 29, loss = 0.01591822
Iteration 30, loss = 0.01590441
Iteration 31, loss = 0.01569168
Iteration 32, loss = 0.01567258
Iteration 33, loss = 0.01560086
Iteration 34, loss = 0.01539879
Iteration 35, 

Iteration 18, loss = 0.01679851
Iteration 19, loss = 0.01691483
Iteration 20, loss = 0.01664664
Iteration 21, loss = 0.01652514
Iteration 22, loss = 0.01649425
Iteration 23, loss = 0.01634963
Iteration 24, loss = 0.01627275
Iteration 25, loss = 0.01628723
Iteration 26, loss = 0.01610688
Iteration 27, loss = 0.01608731
Iteration 28, loss = 0.01597523
Iteration 29, loss = 0.01587755
Iteration 30, loss = 0.01598340
Iteration 31, loss = 0.01586011
Iteration 32, loss = 0.01559346
Iteration 33, loss = 0.01554454
Iteration 34, loss = 0.01547119
Iteration 35, loss = 0.01540883
Iteration 36, loss = 0.01524719
Iteration 37, loss = 0.01513574
Iteration 38, loss = 0.01500308
Iteration 39, loss = 0.01486702
Iteration 40, loss = 0.01493579
Iteration 41, loss = 0.01475189
Iteration 42, loss = 0.01465330
Iteration 43, loss = 0.01463449
Iteration 44, loss = 0.01446949
Iteration 45, loss = 0.01442243
Iteration 46, loss = 0.01433384
Iteration 47, loss = 0.01426140
Iteration 48, loss = 0.01407813
Iteratio

Iteration 42, loss = 0.01447073
Iteration 43, loss = 0.01459893
Iteration 44, loss = 0.01429321
Iteration 45, loss = 0.01434695
Iteration 46, loss = 0.01430572
Iteration 47, loss = 0.01401079
Iteration 48, loss = 0.01401230
Iteration 49, loss = 0.01386948
Iteration 50, loss = 0.01386578
Iteration 51, loss = 0.01373369
Iteration 52, loss = 0.01368363
Iteration 53, loss = 0.01375785
Iteration 54, loss = 0.01344999
Iteration 55, loss = 0.01350228
Iteration 56, loss = 0.01340315
Iteration 57, loss = 0.01344304
Iteration 58, loss = 0.01320996
Iteration 59, loss = 0.01308485
Iteration 60, loss = 0.01294221
Iteration 61, loss = 0.01299818
Iteration 62, loss = 0.01277066
Iteration 63, loss = 0.01280006
Iteration 64, loss = 0.01285506
Iteration 65, loss = 0.01266449
Iteration 66, loss = 0.01289157
Iteration 67, loss = 0.01261574
Iteration 68, loss = 0.01278067
Iteration 69, loss = 0.01273665
Iteration 70, loss = 0.01263676
Iteration 71, loss = 0.01232248
Iteration 72, loss = 0.01239820
Iteratio

Iteration 78, loss = 0.01184732
Iteration 79, loss = 0.01197395
Iteration 80, loss = 0.01197420
Iteration 81, loss = 0.01177716
Iteration 82, loss = 0.01162517
Iteration 83, loss = 0.01161356
Iteration 84, loss = 0.01151356
Iteration 85, loss = 0.01155591
Iteration 86, loss = 0.01150108
Iteration 87, loss = 0.01184463
Iteration 88, loss = 0.01146327
Iteration 89, loss = 0.01137408
Iteration 90, loss = 0.01135446
Iteration 91, loss = 0.01143854
Iteration 92, loss = 0.01131482
Iteration 93, loss = 0.01135755
Iteration 94, loss = 0.01126427
Iteration 95, loss = 0.01119778
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.01952294
Iteration 2, loss = 0.01847807
Iteration 3, loss = 0.01819200
Iteration 4, loss = 0.01826325
Iteration 5, loss = 0.01799661
Iteration 6, loss = 0.01777765
Iteration 7, loss = 0.01772701
Iteration 8, loss = 0.01774988
Iteration 9, loss = 0.01759101
Iteration 10, loss = 0.01741986
Iteration 11, loss = 0.

Iteration 100, loss = 0.01114145
Iteration 101, loss = 0.01108858
Iteration 102, loss = 0.01109789
Iteration 103, loss = 0.01106558
Iteration 104, loss = 0.01127206
Iteration 105, loss = 0.01105849
Iteration 106, loss = 0.01115112
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.01944363
Iteration 2, loss = 0.01851040
Iteration 3, loss = 0.01815527
Iteration 4, loss = 0.01810012
Iteration 5, loss = 0.01787100
Iteration 6, loss = 0.01789546
Iteration 7, loss = 0.01761618
Iteration 8, loss = 0.01756906
Iteration 9, loss = 0.01761559
Iteration 10, loss = 0.01764344
Iteration 11, loss = 0.01755694
Iteration 12, loss = 0.01732683
Iteration 13, loss = 0.01729197
Iteration 14, loss = 0.01708851
Iteration 15, loss = 0.01695376
Iteration 16, loss = 0.01692934
Iteration 17, loss = 0.01676302
Iteration 18, loss = 0.01681292
Iteration 19, loss = 0.01665995
Iteration 20, loss = 0.01661383
Iteration 21, loss = 0.01655571
Iteration 22, lo

Iteration 82, loss = 0.01169781
Iteration 83, loss = 0.01172875
Iteration 84, loss = 0.01150864
Iteration 85, loss = 0.01158935
Iteration 86, loss = 0.01162285
Iteration 87, loss = 0.01153326
Iteration 88, loss = 0.01157586
Iteration 89, loss = 0.01142903
Iteration 90, loss = 0.01141909
Iteration 91, loss = 0.01149287
Iteration 92, loss = 0.01127458
Iteration 93, loss = 0.01126408
Iteration 94, loss = 0.01116291
Iteration 95, loss = 0.01115924
Iteration 96, loss = 0.01113355
Iteration 97, loss = 0.01092663
Iteration 98, loss = 0.01100613
Iteration 99, loss = 0.01113099
Iteration 100, loss = 0.01085031
Iteration 101, loss = 0.01085502
Iteration 102, loss = 0.01093805
Iteration 103, loss = 0.01083206
Iteration 104, loss = 0.01070554
Iteration 105, loss = 0.01068902
Iteration 106, loss = 0.01067459
Iteration 107, loss = 0.01072951
Iteration 108, loss = 0.01079499
Iteration 109, loss = 0.01065958
Iteration 110, loss = 0.01048851
Iteration 111, loss = 0.01055418
Iteration 112, loss = 0.0104

Iteration 30, loss = 0.01556115
Iteration 31, loss = 0.01553853
Iteration 32, loss = 0.01539553
Iteration 33, loss = 0.01541367
Iteration 34, loss = 0.01533826
Iteration 35, loss = 0.01517284
Iteration 36, loss = 0.01503708
Iteration 37, loss = 0.01487472
Iteration 38, loss = 0.01484873
Iteration 39, loss = 0.01471804
Iteration 40, loss = 0.01458525
Iteration 41, loss = 0.01447770
Iteration 42, loss = 0.01454491
Iteration 43, loss = 0.01427503
Iteration 44, loss = 0.01436102
Iteration 45, loss = 0.01408287
Iteration 46, loss = 0.01407297
Iteration 47, loss = 0.01402919
Iteration 48, loss = 0.01416267
Iteration 49, loss = 0.01378410
Iteration 50, loss = 0.01372054
Iteration 51, loss = 0.01362627
Iteration 52, loss = 0.01348259
Iteration 53, loss = 0.01385782
Iteration 54, loss = 0.01342785
Iteration 55, loss = 0.01329906
Iteration 56, loss = 0.01318702
Iteration 57, loss = 0.01328253
Iteration 58, loss = 0.01311654
Iteration 59, loss = 0.01291685
Iteration 60, loss = 0.01295667
Iteratio

Iteration 32, loss = 0.01535774
Iteration 33, loss = 0.01524656
Iteration 34, loss = 0.01504486
Iteration 35, loss = 0.01504087
Iteration 36, loss = 0.01471172
Iteration 37, loss = 0.01473821
Iteration 38, loss = 0.01481417
Iteration 39, loss = 0.01449856
Iteration 40, loss = 0.01438433
Iteration 41, loss = 0.01436989
Iteration 42, loss = 0.01427481
Iteration 43, loss = 0.01433215
Iteration 44, loss = 0.01410334
Iteration 45, loss = 0.01394846
Iteration 46, loss = 0.01392031
Iteration 47, loss = 0.01398016
Iteration 48, loss = 0.01389089
Iteration 49, loss = 0.01383191
Iteration 50, loss = 0.01360917
Iteration 51, loss = 0.01385691
Iteration 52, loss = 0.01347444
Iteration 53, loss = 0.01330239
Iteration 54, loss = 0.01335338
Iteration 55, loss = 0.01326466
Iteration 56, loss = 0.01339644
Iteration 57, loss = 0.01331315
Iteration 58, loss = 0.01307303
Iteration 59, loss = 0.01308059
Iteration 60, loss = 0.01299265
Iteration 61, loss = 0.01292575
Iteration 62, loss = 0.01279496
Iteratio

Iteration 4, loss = 0.01822960
Iteration 5, loss = 0.01826894
Iteration 6, loss = 0.01816409
Iteration 7, loss = 0.01802135
Iteration 8, loss = 0.01778664
Iteration 9, loss = 0.01766726
Iteration 10, loss = 0.01757652
Iteration 11, loss = 0.01763421
Iteration 12, loss = 0.01748296
Iteration 13, loss = 0.01732638
Iteration 14, loss = 0.01735590
Iteration 15, loss = 0.01719658
Iteration 16, loss = 0.01725351
Iteration 17, loss = 0.01701744
Iteration 18, loss = 0.01704705
Iteration 19, loss = 0.01703313
Iteration 20, loss = 0.01688348
Iteration 21, loss = 0.01665798
Iteration 22, loss = 0.01659869
Iteration 23, loss = 0.01658187
Iteration 24, loss = 0.01643892
Iteration 25, loss = 0.01637672
Iteration 26, loss = 0.01620755
Iteration 27, loss = 0.01614703
Iteration 28, loss = 0.01607193
Iteration 29, loss = 0.01596026
Iteration 30, loss = 0.01615872
Iteration 31, loss = 0.01594896
Iteration 32, loss = 0.01568207
Iteration 33, loss = 0.01556225
Iteration 34, loss = 0.01543412
Iteration 35, 

100%|█████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 393.45it/s]


Unnamed: 0,split,MAE mean,MAE std,Max. error mean,Max. error std
0,Train,0.116094,0.005684,0.678926,0.108777
1,ID Test,0.185707,0.00657,2.162833,0.635142
2,OOD Test,0.182844,0.014795,0.916481,0.407493


In [22]:
plot_results_one_site(sites_events_train, sites_events_id_test, sites_events_ood_test, y_column, y_column_name=CBCL_SCORES[y_column], site_id="site01")