In [None]:
import csv
import datetime
import itertools
import math
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
import statsmodels.stats.api as sms

from pipeline import data, features, consts

In [None]:
# Load the merged MEMS export.
# The location can be overridden with the MEMS_DATAFILE environment variable so the
# notebook is not tied to one user's Box Sync folder; when the variable is unset the
# original hard-coded path is used, so existing behavior is unchanged.
DEFAULT_DATAFILE = "/mnt/c/Users/anbag/Box Sync/Research/UVA/Medication Adherance/MEMS dataset/final_merged_set_v6.csv"
datafile = Path(os.environ.get("MEMS_DATAFILE", DEFAULT_DATAFILE))

# parse_dates=False: date columns are converted explicitly (pd.to_datetime) in later cells.
df = pd.read_csv(datafile, parse_dates=False)
df.head()

In [None]:
# Instantiate a Dataset class
# Wraps the raw frame in the project's `data.Dataset`; later cells reach the underlying
# DataFrame through `dataset.df`.
dataset = data.Dataset(df)
dataset

# Preprocessing and Feature Engineering

In [None]:
# Specify columns to rename - want this project to be human-readable!
# TODO: Finish renaming from followup medical record extraction form
# Then, normalization, etc

'''
Note - I manually created any needed dummy vars in SPSS and named them what I wanted
Any dummy vars included in this renaming dictionary are vars leftover from Kristi's analysis that
weren't renamed in place 
''' 
# Mapping of original (codebook) column name -> human-readable name, grouped by category.
# The two sub-dictionaries are merged into a single mapping when passed to `dataset.clean` below,
# and their values are reused to seed the feature lists in the static-features cell.
rename_dict = {
    'demographics': {
        'EDU_RECODE':'education', 
        'Country': 'country',
        'A_DEMO1': 'age',
        'A_DEMO31': 'race_white',
        'A_DEMO32': 'race_black',
        'A_DEMO33': 'race_asian',
        'A_DEMO34': 'race_native_hawaiian_pacific_islander',
        'A_DEMO35': 'race_native_american',
        'A_DEMO36': 'race_other',
        'A_DEMO37': 'race_prefer_not_to_answer',
        # NOTE(review): 'race_white' is already the target of A_DEMO31 above, so two source
        # columns map to the same new name - confirm the intended label for A_DEMO38.
        'A_DEMO38': 'race_white',
        'A_DEMO2': 'hispanic_latina',
        'A_DEMO10': 'years_in_usa',
        'A_DEMO11': 'primary_lang',
        'A_DEMO13YN': 'take_meds_regularly',
    },
    'medical': { # From medical record abstraction forms (pre/post)
        'stage_recoded': 'stage',
        'A_MR1': 'pre_insurance_status',
        'A_MR3': 'pre_dx_date',
        
        # --- Treatments received prior to enrollment --- 
        'A_MR4_YN': 'pre_radiation',
        'A_MR5_YN': 'pre_chemo',
        'A_MR6_YN': 'pre_surgery',
        'A_MR7_YN': 'pre_reconstructive_surgery',
         #  --- Drugs prescribed prior to enrollment --- 
        'A_MR9_tamox_YN': 'pre_tamoxifen',
        'A_MR9_ralox_YN': 'pre_raloxifene',
        'A_MR9_torem_YN': 'pre_toremifene',
        'A_MR9_anas_YN': 'pre_anastrazole',
        'A_MR9_exem_YN': 'pre_exemestane',
        'A_MR9_let_YN': 'pre_letrozole',
        'A_MR9_gose_YN': 'pre_goserelin',
        #  --- Comorbid conditions prior to enrollment --- 
        'A_MR10_a': 'pre_myocardial_infarction', 
        'A_MR10_b': 'pre_cong_heart_failure',
        'A_MR10_c': 'pre_peripheral_vascular_disease',
        'A_MR10_d': 'pre_cerebrovascular_disease',
        'A_MR10_e': 'pre_dementia',
        # NOTE(review): "chromic" looks like a typo for "chronic"; left unchanged because the
        # new name may already be referenced elsewhere.
        'A_MR10_f': 'pre_chromic_pulm_disease',
        'A_MR10_g': 'pre_conn_tissue_disease',
        'A_MR10_h': 'pre_peptic_ulcer_disease',
        'A_MR10_i': 'pre_liver_disease_mild',
        'A_MR10_j': 'pre_diabetes',
        'A_MR10_k': 'pre_hemiplegia',
        'A_MR10_l': 'pre_renal_disease',
        'A_MR10_m': 'pre_diabetes_and_organdamage',
        'A_MR10_n': 'pre_leukemia',
        'A_MR10_o': 'pre_lymphoma',
        'A_MR10_p': 'pre_liver_disease_mod_severe',
        'A_MR10_q': 'pre_met_tumor',
        'A_MR10_r': 'pre_aids',
        'A_MR10_s': 'pre_cancer_other',
        'A_MR10_t': 'pre_arthritis',
        'A_MR10_u': 'pre_mental_illness',
        # ---- Procedures recommended / received since enrollment --- 
        'C_MR3': 'post_num_exams_recommended',
        'C_MR4': 'at_least_one_exam',
        'C_MR5': 'post_all_exams_obtained',
        'C_MR6': 'post_num_mammograms_recommended',
        'C_MR7_RECODED': 'post_any_mammograms_received',
        'C_MR7_YN_RECODED': 'post_all_mammograms_received',
        # ---- Drugs prescribed since enrollment ---
        'C_MR9_tamox_YN': 'post_tamoxifen',
        # Note that `evist` and `ralox` prefixes both refer to same med
        # NOTE(review): the six keys below start with `A_` although this is the post-enrollment
        # section and the tamoxifen key above starts with `C_` - TODO confirm the source columns.
        'A_MR9_evist_YN': 'post_raloxifene', 
        # Same note as above; different prefixes used here, but refer to same respective meds
        'A_MR9_fares_YN': 'post_toremifene',
        'A_MR9_arim_YN': 'post_anastrazole',
        'A_MR9_aroma_YN': 'post_exemestane',
        'A_MR9_femara_YN': 'post_letrozole',
        'A_MR9_zola_YN': 'post_goserelin', 
        'C_MR10': 'post_changed_meds',
        'C_MR12': 'post_side_effects',
        'C_MR18': 'post_num_appts_scheduled',
        'C_MR19': 'post_num_appts_canceled_by_patient',
        'C_MR20': 'post_num_appts_missed',
        # C_MR21 through C_MR23 definitely have low variance - not going to bother to rename / include
        'C_MR24_MRI': 'post_received_mri',
        'C_MR24_US': 'post_received_ultrasound',
        'C_MR24_BI': 'post_received_biopsy',
        'C_MR24_CT': 'post_received_ct_scan',
        'C_MR24_BS': 'post_received_bone_scan',
        'C_MR24_AS': 'post_received_addnl_surgery',
        'C_MR24_AR': 'post_received_addnl_radiation',
        'C_MR24_GT': 'post_received_genetic_couns_test',
        'C_MR24_GE': 'post_received_gyn_exam',
        'C_MR24_Otherbreastsurgery': 'post_received_other_breast_surgery',
        # C_MR25 through C_MR27 definitely have low variance - not going to bother to rename / include
    }
}
# Specify columns to drop
# (despite its name, `drop_dict` is a list: every column whose name contains '_Name')
drop_dict = [col for col in dataset.df.columns if '_Name' in col]

# Tidy the dataset
# Passes the flattened rename mapping and the drop list to the project's `Dataset.clean`.
dataset.clean({**rename_dict['demographics'], **rename_dict['medical']}, drop_dict)

# See if it worked
dataset.df.head()

## Static Features

Generate static (non-temporal) features from measures such as validated instruments (e.g., FACTB).

In [None]:
# TODO - retain only rows (columns?) with 80% of data

# Dictionary of candidate features.
# Keys are categories, handy when a whole group of features has to be referenced at once.
# Values are the lists of column names belonging to each category.
# Each list starts with the renamed columns of that category and is extended with every
# `dataset.df` column whose name contains one of the listed substrings (in token order).
feat_cols = {
    'demographics': list(rename_dict['demographics'].values()) + [
        col
        for token in ('education_', 'marital_status', 'employment',
                      'income', 'birth_country', 'primary_lang')
        for col in dataset.df.columns
        if token in col
    ],
    'study_behavior': ['DateEnroll', 'complete_4', 'complete_8', 'memsuse', 'deceased'],
    'medical': list(rename_dict['medical'].values()) + [
        col
        for token in ('insurance_status', 'followup_elsewhere')
        for col in dataset.df.columns
        if token in col
    ] + ['stage', 'early_late', 'diagtoenroll'],
}

# This dataset has several repeated measures for validated instruments, such as the FACTB.
# Columns for repeated measures of the same instrument share a suffix (e.g., '_FACTB'), so a
# regex on those suffixes can populate the per-score categories quickly.
#
# TODO: fix this - add back in

# for k,v in consts.SCORES.items():
#     feat_cols['scores_' + k] = list(
#         dataset.df.filter(regex='^[A-C]' + v['suffix'] + '$').columns
#     )

# Specify the feature column dtypes explicitly, in one place, instead of the one-by-one
# column spec often used with pandas.
# NOTE(review): the split keys off the *category* name, and none of the categories defined in
# this cell contains 'date', so 'datetime' comes out empty here - confirm that is intended.
dtypes = {
    'numeric': [
        col
        for category, cols in feat_cols.items() if 'date' not in category
        for col in cols
    ],
    'datetime': [
        col
        for category, cols in feat_cols.items() if 'date' in category
        for col in cols
    ],
}

# '''Set the features and dtypes ''' 
# dataset.update_features(feat_cols, dtypes)

dataset.df.head()


In [None]:
# Inspect which columns were assigned to the 'numeric' and 'datetime' groups
dtypes

In [None]:
# Derive count / gap features for several demographic and medical variables

# Number of non-null A_DEMO13DRUG* entries per row
demog_drug_cols = [col for col in dataset.df.columns if 'A_DEMO13DRUG' in col]
dataset.df['DEMOG_numdrugs'] = dataset.df[demog_drug_cols].count(axis=1)

# Parse the post-enrollment exam dates (unparseable values become NaT) and count them per row
post_exam_cols = [col for col in dataset.df.columns if 'C_MR5_date' in col]
dataset.df[post_exam_cols] = dataset.df[post_exam_cols].apply(pd.to_datetime, errors='coerce')
dataset.df['C_numexams'] = dataset.df[post_exam_cols].count(axis=1)

# Whole days between each exam-date column and the one listed just before it
n_examcols = len(post_exam_cols)
for prev_col, curr_col in zip(post_exam_cols, post_exam_cols[1:]):
    gap_col = curr_col + '_days_since_last_exam'
    dataset.df[gap_col] = (dataset.df[curr_col] - dataset.df[prev_col]).dt.days
    print(dataset.df[gap_col])
# x.diff().mean().total_seconds() / (60 * 60 * 24) # mean days between exams

# TODO: Dates aren't necessarily in order. Ask Kristi if this is a data entry issue or
# an ordering issue?


In [None]:
# Metadata for each validated instrument in the dataset.
#   semantic_label : human-readable name of the instrument
#   suffix         : column-name suffix shared by that instrument's repeated measures
#   max_val        : maximum attainable score; -1 marks scores whose maximum is still to be calculated
# NOTE(review): `consts.SCORES` is what the other cells iterate over - confirm whether this
# notebook-level copy is still needed.
scores = {
    'BACS': {'semantic_label': 'Barriers to Care Scale', 'suffix': '_BACS', 'max_val': -1},
    'BCPT': {'semantic_label': 'Breast Cancer Prevention Trial Symptom Checklist', 'suffix': '_BCPT', 'max_val': -1},
    'BCSK': {'semantic_label': 'Breast Cancer Survivorship Knowledge', 'suffix': '_BCSK', 'max_val': -1},
    'BMQ': {'semantic_label': 'Beliefs About Medicines Questionnaire', 'suffix': '_BMQ', 'max_val': -1},
    'CASE': {'semantic_label': 'CASE', 'suffix': '_CASE', 'max_val': -1},
    'DECREG': {'semantic_label': 'Decision Regret Scale', 'suffix': '_DECREG', 'max_val': -1},
    'FACIT_SP': {'semantic_label': 'FACIT-SP', 'suffix': '_FACITSP', 'max_val': -1},
    'FACT_B': {'semantic_label': 'FACT-B', 'suffix': '_FACTB', 'max_val': 148},
    'FACT_BC': {'semantic_label': 'FACT Breast Cancer Subscale', 'suffix': '_BC', 'max_val': 40},
    'FACT_G_PWB': {'semantic_label': 'FACT-G Physical Well-Being Subscale', 'suffix': '_PWB', 'max_val': 28},
    # The SWB / EWB / FWB labels previously carried a copy-pasted "Physical".
    'FACT_G_SWB': {'semantic_label': 'FACT-G Social/Family Well-Being Subscale', 'suffix': '_SWB', 'max_val': 28},
    # Emotional Well-Being tops out at 24, not 28: only then do the four subscales add up to
    # the FACT-G maximum of 108 (and FACT-G + breast cancer subscale to the FACT-B maximum of 148).
    'FACT_G_EWB': {'semantic_label': 'FACT-G Emotional Well-Being Subscale', 'suffix': '_EWB', 'max_val': 24},
    'FACT_G_FWB': {'semantic_label': 'FACT-G Functional Well-Being Subscale', 'suffix': '_FWB', 'max_val': 28},
    'FACT_G': {'semantic_label': 'FACT-G', 'suffix': '_FACTG', 'max_val': 108},
    'MDASI': {'semantic_label': 'MD Anderson Symptom Inventory', 'suffix': '_MDASI', 'max_val': -1},
    'MASES': {'semantic_label': 'Medication Adherence Self-Efficacy Scale', 'suffix': '_MASES', 'max_val': -1},
    'MEDAD': {'semantic_label': 'Medication Adherence Scale', 'suffix': '_MEDAD', 'max_val': -1},
    # I think Kristi said don't use this
    'PEARL': {'semantic_label': 'Pearlin Mastery Scale', 'suffix': '_PEARL', 'max_val': -1},
    'PTGI': {'semantic_label': 'Posttraumatic Growth Inventory', 'suffix': '_PTGI', 'max_val': -1},
    'PSAT': {'semantic_label': 'Patient Satisfaction', 'suffix': '_PSAT', 'max_val': -1},
    'PSS': {'semantic_label': 'Perceived Stress Scale', 'suffix': '_PSS', 'max_val': -1},
    'PSUSP': {'semantic_label': 'Perceived Susceptibility Scale', 'suffix': '_PSUSP', 'max_val': -1},
    # Previously `'semantic_label:'` (colon inside the quotes) was followed directly by `'suffix'`;
    # Python joined the two adjacent literals into the single key 'semantic_label:suffix', so
    # this entry had neither a 'semantic_label' nor a 'suffix'.
    'SS': {'semantic_label': 'Krause and Borawski-Clark Social Support Scale', 'suffix': '_SS', 'max_val': -1},
}

In [None]:
'''
Create new columns of metrics for the validated instruments (e.g., mean, std)
''' 
# NOTE(review): this loop reads `dataset.features['scores_<k>']` and `feat_cols['scores_<k>']`,
# but the code that would populate the 'scores_*' keys is commented out in the feature-dictionary
# cell, as is the `dataset.update_features(feat_cols, dtypes)` call there - confirm those keys
# exist before running this cell, otherwise the lookups will presumably raise KeyError.
for k in consts.SCORES.keys():
    score_category = 'scores_' + k
    cols = dataset.features[score_category]
    # The helper returns the updated frame together with the names of the columns it added
    dataset.df, newcols = features.calc_standard_static_metrics(df=dataset.df, cols=cols, col_prefix=k)
    
    # Add new columns to features dictionary
    feat_cols[score_category] = feat_cols[score_category] + newcols

# Update the features
dataset.update_features(feat_cols)


## Dynamic (Temporal) Features

Extract temporal features by converting main dataset's df from wide-form to long-form
If data was already in long form, we wouldn't need all these steps. Alas...

In [None]:
# Reshape the wide MEMS columns (one block of columns per monitored date) into a long frame
# with one row per participant, date and time slot.
id_cols = ['PtID', 'MemsNum', 'DateEnroll']

# Get a list of all date columns (raw string: `\d` is a regex token, not a Python escape)
date_cols = list(dataset.df.filter(regex=r'date\d{3}$').columns)
if not date_cols:
    raise ValueError("No MEMS date columns (names ending in 'date' + 3 digits) found in dataset.df")

# Melt each date column on its own and collect the pieces. They are concatenated once after
# the loop: `DataFrame.append` no longer exists in pandas 2.x, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
long_frames = []
for i, col in enumerate(date_cols):

    # Find all the time cols for that date col
    time_cols = list(dataset.df.filter(
        regex=r'MEMS_{date_col}_time\d{{1}}$'.format(date_col=col)).columns)

    # Per-day summary columns that must travel with every melted row (kept as id_vars).
    # Be sure to include the "within range" column as one of them.
    additional_cols = [
        {
            'original': 'MEMS_' + col + '_withinrange',
            'new': 'adherent_today'
        },
        {
            'original': 'MEMS_' + col + '_numtimes',
            'new': 'num_times_used_today'
        },
    ]
    if i > 0:  # The first date won't have an interval
        additional_cols.append({
            'original': 'MEMS_' + col + '_interval',
            'new': 'interval'
        })

    all_id_cols = id_cols + [col] + [x['original'] for x in additional_cols]

    # Melt so we get MEMS events stratified by patient, then tidy up the column names;
    # 'variable' (the name of the source time column) is not needed afterwards.
    new_names = {col: 'date', 'value': 'time'}
    new_names.update({x['original']: x['new'] for x in additional_cols})
    res = (
        dataset.df[all_id_cols + time_cols]
        .melt(id_vars=all_id_cols)
        .rename(columns=new_names)
        .drop(columns=['variable'])
    )
    long_frames.append(res)

# Finally, stack the per-date results into the new dataframe
df = pd.concat(long_frames, ignore_index=True)

# Create combined datetime column
df['datetime'] = df.apply(features.get_datetime_col, axis=1)
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Fix dtypes
count_cols = ['adherent_today', 'num_times_used_today']
df[count_cols] = df[count_cols].fillna(0).astype(int)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['interval'] = pd.to_timedelta(df['interval'])

# Add binary indicator of any usage (not just number of times used) on a given day
df['used_today'] = (df['num_times_used_today'] > 0).astype(int)

# Drop rows with an empty date column
df = df.dropna(subset=['date'])

# Remove observations that occurred before a subject's enrollment date
# Don't remove empty observations just yet - need to verify original adherence rates are correct
# The CSV was read with parse_dates=False, so DateEnroll is coerced to datetime for the
# comparison only (the column itself is left untouched); unparseable dates become NaT and
# compare as False, i.e. those rows are dropped.
# NOTE(review): the strict `<` also drops events dated on the enrollment day itself - confirm.
enroll_date = pd.to_datetime(df['DateEnroll'], errors='coerce')
df = df.loc[enroll_date < df['date']]

df.head(10)


In [None]:
# Attach the epochs of interest (time of day, weekday, day/month of study, etc) to every event.
# NOTE(review): the bin edges look like hour-of-day boundaries - confirm against features.get_epochs.
time_of_day_props = dict(
    bins=[-1, 6, 12, 18, 24],
    labels=['late_night', 'morning', 'afternoon', 'evening'],
)
df = features.get_epochs(
    df, 'DateEnroll', 'PtID',
    time_of_day_props['bins'], time_of_day_props['labels'],
)
df.head()

In [None]:
# Sanity check: dtype of the `interval` column after the epoch step
df['interval'].dtype

In [None]:
# Restrict to the study period (210 days)
# Kristi's group used only 210 days of data - ask them why
# Epochs are zero-indexed so upper bound is 209
df = df[df['study_day'] <= 209]

# Start our final temporal features dataframe
#
# First, group by our desired epoch and add standard metrics such as mean, std for a given
# datetime column. We want to predict weekly adherence, so we'll group by the study week.
# We could also swap this out for day or month.
groupby_cols = ['PtID', 'MemsNum', 'study_week']

# TODO: Move to consts
SECONDS_IN_HOUR = 3600.0
DAYS_IN_WEEK = 7.0

temporal_feats = features.calc_standard_temporal_metrics(df, groupby_cols, 'datetime')

# Calculate adherence-related metrics
#
# The long frame was built by melting the per-day time-slot columns, so each day appears once
# per time slot and every one of those rows repeats the same `used_today` / `adherent_today`
# flag. Collapse to one row per study day before summing, otherwise each day is counted once
# per time slot and the weekly rates can exceed 1.
# NOTE(review): assumes features.get_epochs kept that one-row-per-slot layout - if it already
# returns one row per day this de-duplication is a no-op.
daily_flags = df.drop_duplicates(subset=groupby_cols + ['study_day'])

# Fraction of the 7 days in each study week with any use / with in-range use.
# (A plain sum + divide replaces `.agg({'name': func})`, which newer pandas rejects as a
# "nested renamer".)
adherence_feats = (
    daily_flags
    .groupby(groupby_cols)[['used_today', 'adherent_today']]
    .sum()
    .div(DAYS_IN_WEEK)
    .rename(columns={'used_today': 'usage_rate', 'adherent_today': 'adherence_rate'})
    .reset_index()
)

temporal_feats = temporal_feats.merge(adherence_feats, on=groupby_cols)

temporal_feats


In [None]:
# TODO - think about other features that are important (e.g., does month of enrollment matter?)
# Add sliding window code and test
# Figure out how to normalize heterogeneous vals
# Ask Kristi how they got the days of use and percentage vals...
# Figure out how to add in cross-validation and tuning, and upsampling in tensorflow
# (These notes used to be a bare string at the end of the cell, which made the string - not
# the preview below - the cell's displayed output.)

# Extract static features (e.g., scores)
id_cols = ['PtID', 'MemsNum']
static_feats = dataset.build_df_from_features(id_cols=id_cols)

# Create featureset from both static and dynamic (temporal) features
all_feats = static_feats.merge(temporal_feats, on=id_cols)
all_feats.head()

In [None]:
# Build the binary modeling label.
# 'percentMEMS8' (adherence percentage for the whole study) still needs to be confirmed:
# the days-of-use numbers computed here differ from those of Kristi's group...hmm
# Until that is resolved, use our own conversion: a week counts as adherent (1) when its
# adherence_rate is strictly above 0.80, otherwise 0 (missing rates also give 0).
all_feats['adherent'] = all_feats['adherence_rate'].gt(.80).astype('int64')

all_feats

# Modeling
Taken from official Tensorflow tutorial on time series data
https://www.tensorflow.org/tutorials/structured_data/time_series

In [None]:
# Column name -> positional index, as used by the windowing code from the tutorial
column_indices = {name: i for i, name in enumerate(all_feats.columns)}

# 70% / 20% / 10% train / validation / test split by row position (no shuffling)
n = len(all_feats)
train_df = all_feats[0:int(n*0.7)]
val_df = all_feats[int(n*0.7):int(n*0.9)]
test_df = all_feats[int(n*0.9):]

# Width of the feature table being modeled. This used to read `df.shape[1]`, but `df` is the
# long-form MEMS events frame in this notebook, not the feature table that is split above.
num_features = all_feats.shape[1]


In [None]:
# TODO Add Normalization

In [None]:
# Window over the weekly feature rows: 24 input steps predicting the label 1 step ahead.
# The label column used to be 'T (degC)', a leftover from the TensorFlow weather tutorial;
# the label built in this notebook is the binary `adherent` column of `all_feats`.
# NOTE(review): input_width=24 is also the tutorial's value - confirm it suits weekly data.
w1 = features.WindowGenerator(input_width=24, label_width=1, shift=1,
                              label_columns=['adherent'])
w1

In [None]:
# Stack three slices, the length of the total window:
# NOTE(review): `tf` (tensorflow) is not imported in the notebook's import cell - add
# `import tensorflow as tf` there before running this cell.
example_window = tf.stack([np.array(train_df[:w1.total_window_size]),
                           np.array(train_df[100:100+w1.total_window_size]),
                           np.array(train_df[200:200+w1.total_window_size])])

# Split with the generator defined above; this used to call `w2`, which is never defined.
example_inputs, example_labels = w1.split_window(example_window)

print('All shapes are: (batch, time, features)')
print(f'Window shape: {example_window.shape}')
print(f'Inputs shape: {example_inputs.shape}')
print(f'labels shape: {example_labels.shape}')


# Visualization

In [None]:
# '''
#  Categorize each participant according to their score(s) and 
#  add category to each observation
# '''
# score_props = {k + '_mean': {'bins': [-1, v['max_val'] / 3.0, 
#                                       2 * v['max_val'] / 3.0, 
#                                       v['max_val']],
#                              'labels': ['low', 'medium', 'high']
#                             }
#                             for k, v in consts.SCORES.items()}

# dataset = features.gen_categories(dataset, score_props)
# dataset.rename({k + '_mean_cat': k + '_cat' for k in consts.SCORES.keys()},
#                 axis=1,
#                 inplace=True
#               )
# for k in consts.SCORES.keys():
#     target_col = k + '_cat'
#     print(target_col)
#     id_cols = ['PtID', 'MemsNum']
#     mems_df = mems_df.merge(dataset[id_cols + [target_col]], on=id_cols)

# mems_df

In [None]:
# Makes no sense - fix
# seaborn is already imported as `sns` in the import cell, so the repeated import was dropped.
# NOTE(review): `mems_df` and a '*_cat' column are only produced by the commented-out
# categorization cell that precedes this one - this plot cannot run until that is restored.
sns.lineplot(x='study_month', y='adherent', hue='FACTB_cat', data=mems_df)

In [None]:
# Visualize adherence frequency
# Count how many rows each participant (PtID) has in `mems_df`.
# NOTE(review): `mems_df` is not defined by any active cell of this notebook - confirm its source.
df = mems_df['PtID'].value_counts().reset_index(name='count')
# Older pandas names the former index column 'index'; normalise it to 'PtID'
df.rename(columns={'index': 'PtID'}, inplace=True)
df

# NOTE(review): `dataset` is the Dataset wrapper; other cells reach the frame via `dataset.df` -
# confirm the wrapper itself exposes `.hist`.
dataset.hist(column='percentMEMS8')

# Most people took their meds :O
# NOTE(review): originally described as "skewed right"; a distribution with its mass at the high
# end and a tail toward low values is usually called left- (negatively) skewed - check the plot.

In [None]:
# Show the id columns plus the demographic, study-behavior and score features for each MEMS row.
# NOTE(review): `mems_df` is not defined by any active cell, and `feat_cols` as built in this
# notebook has no 'scores' key (the commented-out code uses 'scores_<name>' keys) - this cell
# presumably predates the refactor; confirm before relying on it.
mems_df[['PtID', 'MemsNum'] + feat_cols['demographics'] + 
        feat_cols['study_behavior'] + 
        list(itertools.chain(*[v for k,v in feat_cols['scores'].items()]))]


In [None]:
# Participants whose whole-study adherence exceeds 85%.
# `data` is the `pipeline.data` module in this notebook (see the import cell), so it cannot be
# subscripted; the participant-level frame lives in `dataset.df`.
dataset.df[dataset.df['percentMEMS8']*100 > 85] # TODO: Check the lit for preferred threshold for adherence


# Possible research questions
- Can MEMS data (frequency, times of day, etc) be used to predict different types of wellbeing? (e.g., social wellbeing)
- Can demographic + wellbeing data at baseline (does it have to just be at baseline?) be used to predict long-term adherence?

- Worth looking at both of these?

- What are the most important determinants of adherence? wellbeing? demographics?
- What kind of phenotypes emerge?

In [None]:
# Get average scores for assessments that were administered at multiple timepoints
# `data` is the `pipeline.data` module in this notebook, so the participant-level frame is
# accessed through `dataset.df` instead.
# NOTE(review): `feat_cols['scores']` is not created by any active cell - confirm where the
# per-score column lists come from before running this.
for k,v in scores.items():
    newcol = k + '_mean'

    # Filtering is necessary here since the FACT-B and FACT-G item-by-item scores are included in the dataset.
    # The mean column itself is excluded so that re-running the cell does not average in the
    # previously computed mean.
    source_cols = [col for col in feat_cols['scores'][k] if col != newcol]
    dataset.df[newcol] = dataset.df[source_cols].mean(axis=1)

    # Be sure to include this new "mean" column in our list of feature columns for this score
    # (only once, so the cell stays idempotent)
    if newcol not in feat_cols['scores'][k]:
        feat_cols['scores'][k].append(newcol)

In [None]:
# Summarise and plot the distribution of every '*mean*' column.
# `data` is the `pipeline.data` module in this notebook; the frame lives in `dataset.df`.
mean_scores_df = dataset.df[[col for col in dataset.df.columns if 'mean' in col]]

for col in mean_scores_df.columns:
    # Drop missing values once so that every statistic below is computed on the same
    # observations (np.percentile and scipy's describe would otherwise return NaN while
    # Series.median silently skips NaN).
    s = mean_scores_df[col].dropna()
    if s.empty:
        print(col + ': no non-missing values')
        continue

    print(scipy.stats.describe(s))
    print('25th Percentile: ' + str(np.percentile(s, 25)))
    print('75th Percentile: ' + str(np.percentile(s, 75)))
    print('median: ' + str(s.median()))

    # histplot(kde=True, stat='density') is the replacement for the deprecated sns.distplot
    fig, ax = plt.subplots()
    sns.histplot(s, kde=True, stat='density', ax=ax)
    plt.show()
    plt.close(fig)  # free the figure; one is created per column

# Most scores skewed toward higher quality of life. Use Median when dividing them up into 'high/low'
    
# Most scores skewed toward higher quality of life. Use Median when dividing them up into 'high/low'


In [None]:
# TODO: Ask Kristi about any stat results they've already completed

# Number of MEMS rows per participant.
# NOTE(review): `mems_df` is not defined by any active cell of this notebook - confirm its source.
# rename_axis makes the id column come out as 'PtID' regardless of how the pandas version
# names the value_counts index.
df = mems_df['PtID'].value_counts().rename_axis('PtID').reset_index(name='n_mems_events')

# Participant-level mean scores (`data` is the pipeline module here; the frame is `dataset.df`)
df2 = dataset.df[['PtID'] + [col for col in dataset.df.columns if 'mean' in col]].copy()

# Median split of every mean score. The labels are computed on df2 itself and only then
# joined on PtID: assigning them straight into `df` paired rows by positional index, and the
# two frames are indexed differently, so labels ended up on the wrong participant.
# A missing score is labelled 'high', as before (NaN < median is False).
for col in list(df2.columns[1:]):
    median = df2[col].median()  # hoisted: previously recomputed for every element
    df2[col + '_group'] = np.where(df2[col] < median, 'low', 'high')

df = df.merge(df2, on='PtID')

group_cols = [col for col in df.columns if 'group' in col]

# Average number of MEMS events in the low vs. high group of each score
for col in group_cols:
    df3 = df.groupby([col])['n_mems_events'].mean().reset_index(name='avg_n_mems_events')
    print(df[col].value_counts())
    sns.barplot(y='avg_n_mems_events', x=col, data=df3)
    plt.show()

### Demographics

In [None]:
# Get a demographics dataframe with meaningful labels
# `data` is the `pipeline.data` module in this notebook; the frame lives in `dataset.df`.
# .copy() gives an independent frame, so the relabelling below neither triggers
# SettingWithCopyWarning nor risks writing back into dataset.df.
dem_df = dataset.df[feat_cols['demographics']].copy()

# Replace each coded value with its label.
# NOTE(review): `codebook` (presumably column -> {code: label}) is not defined in any cell
# shown here - confirm where it is loaded.
for col in dem_df.columns:
    dem_df[col] = dem_df[col].map(codebook[col])

dem_df

In [None]:
# One pie chart per demographic column, showing the share of participants in each category.
for col in dem_df.columns:

    # Category frequencies, most common first
    df = (
        dem_df
        .groupby(col)
        .size()
        .reset_index(name='N')
        .sort_values(by=['N'], ascending=False)
    )

    pie, ax = plt.subplots(figsize=[10,6])
    plt.pie(x=df['N'], labels=df[col], autopct="%.1f%%", pctdistance=0.5)
    plt.title(col.capitalize(), fontsize=14)
    # pie.savefig('results/figures/demographics_edu.png', bbox_inches="tight")

    # Whole-number percentage per category, kept alongside the counts
    df['percentage'] = (100 * df['N'] / df['N'].sum()).round(0)
    # df.to_csv('results/tables/demographics_edu.csv')