# Calculate numbers for subject selection flow chart

In [2]:
import numpy as np
import pandas as pd

import src.data.preprocess_data as prep
import src.data.var_names as var_names
from definitions import REPO_ROOT, PROCESSED_DATA_DIR, RAW_DATA_DIR

In [3]:
# Random seed for the one-child-per-family selection; it is passed as
# `random_state` to prep.select_one_child_per_family further down.
# Our study used 77.
seed = 77

In [5]:
# Load the four input tables through the project's preprocessing helpers.
binary_diagnoses_df = prep.create_binary_diagnoses_df(RAW_DATA_DIR)
sri24_df = prep.load_sri24_df(RAW_DATA_DIR)
freesurfer_df = prep.load_freesurfer_df(RAW_DATA_DIR)
sociodem_df = prep.load_sociodem_df(RAW_DATA_DIR)


def _n_complete(frame):
    """Return the number of rows in `frame` with no missing value in any column."""
    return frame.dropna().shape[0]


print("Number of subjects with...")

# Stage 0: SRI24 data only.
abcd_data_df = sri24_df
print(f"   ...complete SRI24 data: {_n_complete(abcd_data_df)}")

# Stages 1-3: inner-merge one additional table at a time on the index
# (presumably the subject ID -- confirm against the loaders) and report how
# many fully populated rows remain. Rows with missing values are only
# excluded from the printed count; they stay in `abcd_data_df`.
merge_stages = (
    (freesurfer_df, "complete SRI24 and Freesurfer data"),
    (sociodem_df, "complete SRI24, Freesurfer, and sociodemographic data"),
    (
        binary_diagnoses_df,
        "complete SRI24, Freesurfer, sociodemographic, and KSADS data",
    ),
)
for stage_df, stage_label in merge_stages:
    abcd_data_df = abcd_data_df.merge(
        stage_df, how='inner', left_index=True, right_index=True
    )
    print(f"   ...{stage_label}: {_n_complete(abcd_data_df)}")

# Final stage: keep a single child per family, using the study's seed.
# NOTE(review): the selection receives the merged frame *before* incomplete
# rows are dropped. If the helper can pick a sibling with missing values,
# that family is excluded from the count below -- confirm this matches the
# study pipeline.
abcd_data_df = prep.select_one_child_per_family(
    abcd_data_path=RAW_DATA_DIR, abcd_df=abcd_data_df, random_state=seed
)
print(f"   ...with only one subject per family: {_n_complete(abcd_data_df)}")

Number of subjects with...
   ...complete SRI24 data: 8670
   ...complete SRI24 and Freesurfer data: 8495
   ...complete SRI24, Freesurfer, and sociodemographic data: 8433
   ...complete SRI24, Freesurfer, sociodemographic, and KSADS data: 8281
   ...with only one subject per family: 6916
