In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from abcd.local.paths import core_path, output_path
from abcd.data.read_data import get_subjects_events, add_event_vars
from abcd.data.define_splits import SITES, save_restore_sex_fmri_splits

In [3]:
# Fetch subjects and events.
# get_subjects_events() returns a pair that is unpacked into subjects_df and
# events_df; events_df is the table the substance-use variables are added to below.
# NOTE(review): presumably one row per subject / per visit event - confirm in abcd.data.read_data.
subjects_df, events_df = get_subjects_events()

In [4]:
# Add substance use behaviour.
# SU_SCORES maps each column requested from su_y_sui.csv to the readable label
# used when the scores are summarised.
SU_SCORES = {
    "tlfb_alc_sip": "Ever tried?",
    "su_isip_1_calc": "Total times 1 sip",
    "isip_1b_yn": "Ever alcohol not in religious ceremony",
    "tlfb_alc_reg": "Regular drinking",
    "tlfb_alc_lt": "Total drinks",
    "xskipout_alc": "Drinking days past 6 months",
}

# Request exactly the SU_SCORES columns from the substance-use file.
# NOTE(review): add_event_vars presumably joins them onto events_df per event - confirm.
su_file = os.path.join(core_path, "substance-use", "su_y_sui.csv")
su_events_df = add_event_vars(events_df, su_file, vars=list(SU_SCORES))

  df = pd.read_csv(path, sep=sep)


In [5]:
# Number of rows in su_events_df right after the substance-use columns were added
# (before any filtering of missing values).
print(len(su_events_df))

19605


In [6]:
# Get summary on variable values: for every score, count the events where the
# value is missing, exactly 0, or at least 1. One record is built per score and
# the table is assembled in a single DataFrame construction.
summary_rows = []
for su_key, su_value in SU_SCORES.items():
    score = su_events_df[su_key]
    summary_rows.append({
        "variable": su_value,
        "# missing values": score.isnull().sum(),
        "# values == 0": len(su_events_df.loc[score == 0]),
        "# values >= 1": len(su_events_df.loc[score >= 1]),
    })

info_summary = pd.DataFrame(summary_rows)
info_summary

Unnamed: 0,variable,# missing values,# values == 0,# values >= 1
0,Ever tried?,10372,7032,2201
1,Total times 1 sip,10082,7328,2195
2,Ever alcohol not in religious ceremony,17404,521,1680
3,Regular drinking,19590,14,1
4,Total drinks,19590,1,14
5,Drinking days past 6 months,19604,0,1


In [7]:
# Filter out missing values from the sufficiently supported scores.
# Only the two scores below are kept; an event survives only when both of
# them are present.
SU_SCORES = {
    "tlfb_alc_sip": "Ever tried?",
    "su_isip_1_calc": "Total times 1 sip",
}
has_alc_sip = su_events_df["tlfb_alc_sip"].notnull()
has_sip_count = su_events_df["su_isip_1_calc"].notnull()
su_events_df = su_events_df[has_alc_sip & has_sip_count]

nr_events_kept = len(su_events_df)
print("There are {} events after filtering out missing values".format(nr_events_kept))

There are 9229 events after filtering out missing values


In [8]:
# Plot distribution of sips
import pygal
from pygal import Config
from abcd.plotting.pygal.rendering import display_html

# Cap the sip counts so a few extreme values do not stretch the box plot.
SIP_CAP = 50

# su_events_df was produced by boolean filtering, so writing a new column into
# it in place triggers pandas' SettingWithCopyWarning. .assign() builds a new
# frame instead, which also makes this cell safe to re-run.
# Events with a missing su_isip_1_calc were dropped earlier in the notebook, so
# clip(upper=...) yields the same values as capping element by element.
su_events_df = su_events_df.assign(
    su_isip_1_calc_capped=su_events_df["su_isip_1_calc"].clip(upper=SIP_CAP)
)

plot = pygal.Box()
plot.title = 'Total times subjects had one sip of an alcoholic drink, capped at {}'.format(SIP_CAP)
plot.add("# times 1 sip", list(su_events_df["su_isip_1_calc_capped"]))
display_html(plot)


In [9]:
# Number of visits reporting having tried alcohol ("tlfb_alc_sip" == 1),
# per site, with one bar series per split.
import numpy as np
from abcd.data.define_splits import SITES, save_restore_sex_fmri_splits
from abcd.plotting.pygal.colors import CAT_COLORS

k = 5
splits = save_restore_sex_fmri_splits(k)

# One colour per split. The palette is derived from k (not a hard-coded 5) so
# the number of colours always matches the number of series added below.
custom_style = pygal.style.Style(
    colors=tuple(CAT_COLORS['split'][str(split_ix)] for split_ix in range(k))
    #,background='transparent'
    )

# Subject id of every event that reports having tried alcohol. This does not
# depend on the site or split, so it is computed once outside the loops.
tried_subject_ids = su_events_df.loc[su_events_df['tlfb_alc_sip'] == 1, "src_subject_id"]

bar_chart = pygal.Bar(x_label_rotation=45, style=custom_style)
bar_chart.title = '# visits reporting trying alcohol on each site'
bar_chart.x_labels = [x.replace("site", "site ") for x in SITES]
for split_ix in range(k):
    per_site_values = []
    for site_id in SITES:
        # splits[site_id][str(split_ix)] holds the subject ids of this site's split
        subjects_in_split = splits[site_id][str(split_ix)]
        # Count the "tried alcohol" events whose subject belongs to this split
        nr_visits = int(tried_subject_ids.isin(subjects_in_split).sum())
        per_site_values.append(nr_visits)
    bar_chart.add(str(split_ix), per_site_values)
display_html(bar_chart)

# Disabled alternative plot ("Never had" vs "Had" per site), kept as a bare
# string literal. Being the last expression of the cell, it is echoed verbatim
# as the cell's output.
"""

per_site_0_values = [len(su_events_df.loc[(su_events_df["tlfb_alc_sip"] == 0) & (su_events_df["site_id_l"] == site_id)]) for site_id in SITES]
per_site_1_values = [len(su_events_df.loc[(su_events_df["tlfb_alc_sip"] == 1) & (su_events_df["site_id_l"] == site_id)]) for site_id in SITES]
bar_chart.add(str("Never had"), per_site_0_values)
bar_chart.add(str("Had"), per_site_1_values)
display_html(bar_chart)
"""

'\n\nper_site_0_values = [len(su_events_df.loc[(su_events_df["tlfb_alc_sip"] == 0) & (su_events_df["site_id_l"] == site_id)]) for site_id in SITES]\nper_site_1_values = [len(su_events_df.loc[(su_events_df["tlfb_alc_sip"] == 1) & (su_events_df["site_id_l"] == site_id)]) for site_id in SITES]\nbar_chart.add(str("Never had"), per_site_0_values)\nbar_chart.add(str("Had"), per_site_1_values)\ndisplay_html(bar_chart)\n'