In [None]:
import csv
import itertools
import math
import re
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import statsmodels.stats.api as sms
import tensorflow as tf
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler

from pipeline import data, features, consts

In [None]:
# Load the data.
# The merged MEMS export lives in a per-user Box Sync folder. Try each known
# location in turn instead of commenting paths in and out by hand.
DATA_RELPATH = "Box Sync/Research/UVA/Medication Adherance/MEMS dataset/final_merged_set_v6.csv"
candidate_datafiles = [
    Path("/mnt/c/Users/anbag") / DATA_RELPATH,
    Path("/mnt/c/Users/ab5bt") / DATA_RELPATH,
]
datafile = next((path for path in candidate_datafiles if path.exists()), None)
if datafile is None:
    raise FileNotFoundError(
        "MEMS dataset not found in any known location: "
        + ", ".join(str(path) for path in candidate_datafiles)
    )

# Dates are left unparsed here; dtypes are set explicitly after cleaning.
df = pd.read_csv(datafile, parse_dates=False)
df.head()

# Data Cleaning
Thank you to Jason Brownlee
https://machinelearningmastery.com/basic-data-cleaning-for-machine-learning/

In [None]:
# Wrap the raw frame in the project's Dataset class, passing the
# participant / MEMS-cap columns as its id columns
id_columns = ['PtID', 'MemsNum']
dataset = data.Dataset(df, id_cols=id_columns)
dataset

In [None]:
# Tidy the dataset - get rid of useless or recoded columns
renamings = {}
renamings.update(consts.RENAMINGS['demographics'])
renamings.update(consts.RENAMINGS['medical'])
name_cols = [c for c in dataset.df.columns if '_Name' in c]

dataset.clean(
    to_rename=renamings,
    to_drop=name_cols,
    to_map=consts.CODEBOOK,
    to_binarize=['race_other'],
)

# Set dtypes on the remaining columns.
# For now, naively assume we only have numerics, datetimes, or objects:
# every column without "date" in its name is requested as numeric.
non_date_cols = [c for c in dataset.df.columns if 'date' not in c.lower()]
dtypes_dict = {'numeric': non_date_cols, 'datetime': ['DateEnroll']}
dataset.set_dtypes(dtypes_dict)

# See if it worked
dataset.df.head()

In [None]:
# Split the data into inputs (numeric columns only, missing values filled
# with -1) and the overall-adherence output
target_col = 'percentMEMS8'
X = (
    dataset.df
    .drop(columns=[target_col] + dataset.id_cols)
    .select_dtypes('number')
    .fillna(-1)
    .values
)
y = dataset.df[target_col].values
print(X.shape, y.shape)

# Variance thresholds to check
thresholds = np.arange(0.0, 0.55, 0.05)

# Number of features that survive each threshold
results = []
for t in thresholds:
    transform = VarianceThreshold(threshold=t)
    X_sel = transform.fit_transform(X)
    n_features = X_sel.shape[1]
    print('>Threshold=%.2f, Numeric Features=%d' % (t, n_features))
    results.append(n_features)

# Plot the threshold vs the number of selected features
plt.plot(thresholds, results)
plt.show()

# Note to self - may want to set a high threshold

# Feature Engineering

## Static Features

In [None]:
# Handle the special case of race, which was passed in one-hot-encoded.
# Collapse the indicator columns back into a single 'race' column holding the
# name of the indicator with the largest value in each row, so we can do
# imputation a bit later.
# Make sure to do this before we define our feature columns!

# Skip the combined 'race' column itself so re-running this cell is harmless.
race_cols = [col for col in dataset.df.columns if 'race' in col and col != 'race']

if race_cols:
    dataset.df['race'] = dataset.df[race_cols].idxmax(axis=1)
    # columns= is required: a bare drop(labels) defaults to axis=0 and looks
    # for these names among the row labels, raising KeyError.
    dataset.df.drop(columns=race_cols, inplace=True)

dataset.df['race']

In [None]:
# Show the collapsed race column
dataset.df.loc[:, 'race']

Generate static (non-temporal) features from measures such as validated instruments (e.g., FACTB)

In [None]:
# Organize the candidate features into useful categories for later reference.
# A bit tedious, but helpful.

# Set our excluded features, before anything else
excluded = ['percentMEMS8']  # Overall adherence rate - unlikely to be used since we're building weekly vectors

feat_cols = {
    'demographics': [v for v in consts.RENAMINGS['demographics'].values()
                     if v in dataset.df.columns] + ['race'],  # add the new, single race col
    'study_behavior': [col for col in ['DateEnroll', 'Group', 'complete_4',
                                       'complete_8', 'memsuse', 'deceased',
                                       'day_miss_fromB', 'day_miss_from7', 'total_days_8']
                       if col in dataset.df.columns],
    'medical': [v for v in consts.RENAMINGS['medical'].values() if v in dataset.df.columns] +
               [col for col in ['stage', 'early_late', 'diagtoenroll']
                if col in dataset.df.columns],
}

# This dataset has several repeated measures for validated instruments,
# such as the FACTB.
#
# Columns for repeated measures for the same instrument share a suffix (e.g., '_FACTB').
# Use regex to populate one 'scores_<instrument>' category per instrument, using these suffixes.
for k, v in consts.SCORES.items():
    # Handle special case of BCPT before doing anything else:
    # drop its '...YN' and '...O' companion columns
    if k == 'BCPT':
        bcpt_companion_cols = list(
            dataset.df.filter(regex=r'_BCPT\d*(?:YN|O)$').columns
        )
        dataset.df.drop(columns=bcpt_companion_cols, inplace=True)

    # Some measures weren't precalculated in full for timepoints A, B, and C.
    # Let's fix this.
    if not v['precalculated']:
        # For each timepoint, get the aggregate score and add it to the dataset
        # as a new column
        for prefix in ['A', 'B', 'C']:
            total_col = prefix + v['suffix']
            # The pattern also matches the aggregate column itself; leave it out
            # so the total is never summed into itself (e.g. when this cell is re-run).
            score_cols = [
                col
                for col in dataset.df.filter(
                    regex='^' + prefix + v['suffix'] + r'\d*'
                ).columns
                if col != total_col
            ]
            dataset.df[total_col] = dataset.df[score_cols].sum(axis=1)

    # Now that we've calculated everything, get the aggregate score for each measure,
    # at each timepoint
    feat_cols['scores_' + k] = list(
        dataset.df.filter(regex='^[A-C]' + v['suffix'] + '$').columns
    )

# Create a catch-all category of remaining features, to ensure we got everything
already_categorized = set(itertools.chain.from_iterable(feat_cols.values()))
feat_cols['other'] = [
    col for col in dataset.df.columns
    if col not in already_categorized                # exclude anything already in the list
    and not col.startswith(('A_', 'B_', 'C_'))       # exclude individual score cols
    and 'date' not in col
    and col not in dataset.id_cols
    and col not in excluded
]

In [None]:
# Create new columns for several demographic and medical variables.
# Be sure we update the feature columns dictionary.


def _register_feature(category, column):
    """Add `column` to feat_cols[category] unless it is already listed.

    Keeps the cell re-runnable: running it twice no longer registers the
    same derived column twice.
    """
    if column not in feat_cols[category]:
        feat_cols[category] = feat_cols[category] + [column]


# Number of non-missing 'A_DEMO13DRUG*' entries per participant
demog_drug_cols = [col for col in dataset.df.columns if 'A_DEMO13DRUG' in col]
newcol = 'DEMOG_numdrugs'
dataset.df[newcol] = dataset.df[demog_drug_cols].count(axis=1)
_register_feature('demographics', newcol)

# Parse the 'C_MR5_date*' columns; unparseable values become NaT
post_exam_cols = [col for col in dataset.df.columns if 'C_MR5_date' in col]
dataset.df[post_exam_cols] = dataset.df[post_exam_cols].apply(
    pd.to_datetime, errors='coerce'
)

# Number of valid exam dates per participant
newcol = 'C_numexams'
dataset.df[newcol] = dataset.df[post_exam_cols].count(axis=1)
_register_feature('medical', newcol)

# Thank you @benvc!
# https://stackoverflow.com/questions/54367491/calculate-average-of-days-between-a-list-of-dates
newcol = 'mean_days_betw_exams'
dataset.df[newcol] = dataset.df[post_exam_cols].apply(
    features.mean_days_between_dates,
    axis=1
)
_register_feature('medical', newcol)

# TODO: Dates aren't necessarily in order. Ask Kristi if this is a data entry issue or
# An ordering issue?

dataset.df[['DEMOG_numdrugs', 'C_numexams', 'mean_days_betw_exams']].head()

## Dynamic (Temporal) Features

Extract temporal features by converting main dataset's df from wide-form to long-form.

In [None]:
# Build a long-form frame of MEMS events (one row per participant, day and
# time slot) from the wide-form dataset.
# Note: this rebinds `df`, which held the raw CSV until now.

# Get a list of all date columns
date_cols = list(dataset.df.filter(regex=r'date\d{3}$').columns)
if not date_cols:
    raise ValueError("No 'date###' columns found in dataset.df; nothing to melt")

melted_frames = []
for i, col in enumerate(date_cols):

    # Find all the time cols for that date col
    time_cols = list(dataset.df.filter(
        regex=r'MEMS_{date_col}_time\d{{1}}$'.format(date_col=col)).columns)

    # Perform a melt so we get MEMS events stratified by patient.
    # Be sure to include the "within range" column as one of the id_vars.
    additional_cols = [
        {
            'original': 'MEMS_' + col + '_numtimes',
            'new': 'num_times_used_today'
        }
    ]
    if i > 0:  # The first date won't have an interval or withinrange
        additional_cols.append(
            {
                'original': 'MEMS_' + col + '_interval',
                'new': 'interval'
            }
        )
        additional_cols.append(
            {
                'original': 'MEMS_' + col + '_withinrange',
                'new': 'adherent_today'
            }
        )

    all_id_cols = dataset.id_cols + ['DateEnroll', col] + [x['original'] for x in additional_cols]

    res = dataset.df[all_id_cols + time_cols].melt(id_vars=all_id_cols)

    # Tidy up the resulting dataframe
    res = res.rename(columns={col: 'date', 'value': 'time', 'variable': 'MEMS_day'})

    # 'MEMS_date###_time#' -> integer day number
    res['MEMS_day'] = res['MEMS_day'].apply(
        lambda x: int(re.sub(r'_time\d*$', '', x.split('MEMS_date')[1]))
    )

    res = res.rename(columns={x['original']: x['new'] for x in additional_cols})

    melted_frames.append(res)

# Finally, merge results into the new dataframe.
# One concat at the end replaces DataFrame.append inside the loop, which was
# removed in pandas 2.0 and copied the accumulated frame on every iteration.
# Columns missing from the first date's frame are filled with NaN.
df = pd.concat(melted_frames, ignore_index=True, sort=False)

# Create combined datetime column
df['datetime'] = pd.to_datetime(
    df.apply(features.get_datetime_col, axis=1), errors='coerce'
)

# Fix dtypes
count_cols = ['adherent_today', 'num_times_used_today']
df[count_cols] = df[count_cols].fillna(0).astype(int)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['interval'] = pd.to_timedelta(df['interval'])

# Add binary indicator of any usage (not just number of times used) on a given day
df['used_today'] = (df['num_times_used_today'] > 0).astype(int)

# Drop rows with an empty date column
df = df.dropna(subset=['date'])

# Remove observations that occurred before a subject's enrollment date.
# Don't remove empty observations just yet.
# TODO: verify original adherence rates are correct
df = df.loc[df['DateEnroll'] < df['date']]

df.head()


In [None]:
# Generate epochs of interest (time of day, weekday, day/month of study, etc)
# Bin edges and the label for each bin are handed to features.get_epochs.
time_of_day_props = dict(
    bins=[-1, 6, 12, 18, 24],
    labels=['late_night', 'morning', 'afternoon', 'evening'],
)
df = features.get_epochs(
    df,
    'DateEnroll',
    'PtID',
    time_of_day_props['bins'],
    time_of_day_props['labels'],
)
df.head()

In [None]:
# Restrict to 210 days, per Kristi's documentation
df = df.loc[df['MEMS_day'].le(210)]

In [None]:
# Validate that we calculated days of adherence correctly:
# collapse to one flag per participant-day, count adherent days per
# participant, then put the dataset's own 'total_days_8' alongside.
daily_adherence = (
    df.groupby(dataset.id_cols + ['MEMS_day'])['adherent_today']
    .max()
    .reset_index()
)
df2 = (
    daily_adherence.groupby(dataset.id_cols)['adherent_today']
    .sum()
    .reset_index()
    .merge(dataset.df[dataset.id_cols + ['total_days_8']])
)
df2.head(30)
# Check!

In [None]:
# ---- Start our final temporal features dataframe -----------

# Exclude first month (ramp-up period during which time users were getting used to the MEMS caps)
df = df[df['study_month'] > 0]

# Group by our desired epoch and add standard metrics such as mean, std for a
# given datetime column.
#
# We want to predict weekly adherence, so we'll group by the study week.
# We could also swap this out for day or month.
groupby_cols = dataset.id_cols + ['study_week']

temporal_feats = features.calc_standard_temporal_metrics(df, groupby_cols, 'datetime')


def _weekly_rate(events, group_cols, flag_col, rate_name):
    """Return one row per group with the share of the week on which `flag_col` was set.

    Participants can have multiple observations per day, and binary indicator
    columns ending in '_today' repeat on each of them. The flag is therefore
    collapsed to one value per study day (max is 1 or 0) before the days are
    counted, so no day is counted more than once.

    Uses sum()/rename() rather than agg({'name': func}): that dict-renaming
    form raises SpecificationError on pandas >= 1.0.
    """
    daily = events.groupby(group_cols + ['study_day'])[flag_col].max().reset_index()
    weekly = daily.groupby(group_cols)[flag_col].sum() / consts.DAYS_IN_WEEK
    return weekly.rename(rate_name).reset_index()


# Calculate adherence-related metrics
for flag_col, rate_name in [('used_today', 'usage_rate'),
                            ('adherent_today', 'adherence_rate')]:
    temporal_feats = temporal_feats.merge(
        _weekly_rate(df, groupby_cols, flag_col, rate_name),
        on=groupby_cols,
    )

temporal_feats


## All Features

In [None]:
# Every category except the catch-all 'other' goes into the static feature frame
feats_to_extract = dict(feat_cols)
del feats_to_extract['other']

static_feats = dataset.build_df_from_features(feats_to_extract)

# Attach the static features to each row of temporal features, matching on the id columns
all_feats = temporal_feats.merge(static_feats, on=dataset.id_cols)
all_feats

## Imputation & One-Hot Encoding

### Categoricals

In [None]:
# Mode imputation: fill missing values in each categorical column with that
# column's most frequent value, then one-hot encode.
cat_cols = list(consts.CODEBOOK.keys())
for col in cat_cols:
    modes = all_feats[col].mode()
    if modes.empty:
        # Column is entirely missing, so there is no mode to fill with
        continue
    # Assign the result back instead of fillna(inplace=True) on a selected
    # column: the in-place form acts on an intermediate object and does not
    # update all_feats under pandas copy-on-write.
    all_feats[col] = all_feats[col].fillna(modes.iloc[0])

# NOTE(review): this replaces the categorical columns with dummy columns, so
# the cell cannot be re-run without rebuilding all_feats first.
all_feats = pd.get_dummies(all_feats, columns=cat_cols)
all_feats

### Numerics

In [None]:
# Numeric feature columns: every numeric column except the identifiers, the
# week index and the categoricals.
# Built as an ordered list rather than from a set, so the column order handed
# to the imputer is the same on every run (set order varies with the hash seed).
not_numeric_features = set(dataset.id_cols + ['study_week']) | set(cat_cols)
numeric_cols = [
    col for col in all_feats.select_dtypes('number').columns
    if col not in not_numeric_features
]

# NOTE(review): this list still contains 'usage_rate' and 'adherence_rate', and
# the imputer is fitted on all rows before the train/val/test split - confirm
# that is intended.
imputer = IterativeImputer(random_state=5)
all_feats[numeric_cols] = imputer.fit_transform(all_feats[numeric_cols])
all_feats

In [None]:
# Peek at the binarized race indicator
all_feats.loc[:, ['race_other']]

## Normalization

In [None]:
# Rescale every numeric feature to the [0, 1] range.
# MinMaxScaler is imported with the other dependencies in the first cell.
# NOTE(review): numeric_cols includes 'adherence_rate', so the rate that is
# later thresholded into the label is rescaled here too - confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
all_feats[numeric_cols] = scaler.fit_transform(all_feats[numeric_cols])
all_feats

In [None]:
# Running to-do list. It is a bare string expression, so it has no effect
# beyond being echoed as this cell's output.
'''
TODO - think about other features that are important (e.g., does month of enrollment matter?)
Add sliding window code and test
Figure out how to normalize heterogeneous vals
Ask Kristi how they got the days of use and percentage vals...
Figure out how to add in cross-validation and tuning, and upsampling in tensorflow
''' 

In [None]:
# True if any value anywhere in the feature frame is still missing
np.any(all_feats.isnull().values)

# Modeling
Taken from official Tensorflow tutorial on time series data
https://www.tensorflow.org/tutorials/structured_data/time_series

In [None]:
# Generate labels: 1 when the weekly adherence rate exceeds the threshold, else 0
exceeds_threshold = all_feats['adherence_rate'] > consts.ADHERENCE_THRESHOLD
all_feats['adherent'] = exceeds_threshold.astype(int)

all_feats.head(15)

In [None]:
# Position of each column in the feature frame
column_indices = {name: i for i, name in enumerate(all_feats.columns)}

# Split by row position: first 70% train, next 20% validation, last 10% test
n = len(all_feats)
train_end, val_end = int(n * 0.7), int(n * 0.9)
train_df = all_feats[0:train_end]
val_df = all_feats[train_end:val_end]
test_df = all_feats[val_end:]

# Count the columns of the feature frame being modelled. `df` is the
# long-form MEMS events frame at this point, so df.shape[1] was the wrong count.
num_features = all_feats.shape[1]


In [None]:
# Window settings passed to features.WindowGenerator, with 'adherent' as the label column
window_config = dict(
    input_width=24,
    label_width=1,
    shift=1,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    label_columns=['adherent'],
)
w1 = features.WindowGenerator(**window_config)
w1

In [None]:
# Stack three slices, the length of the total window:
window_size = w1.total_window_size
example_window = tf.stack([
    np.array(train_df[start:start + window_size])
    for start in (0, 100, 200)
])

# Split with w1, the only window generator defined in this notebook
# (the cell previously referenced an undefined `w2`).
example_inputs, example_labels = w1.split_window(example_window)

print('All shapes are: (batch, time, features)')
print(f'Window shape: {example_window.shape}')
print(f'Inputs shape: {example_inputs.shape}')
print(f'labels shape: {example_labels.shape}')


# Visualization

In [None]:
# NOTE(review): this whole cell is disabled. It is the only place in the
# visible notebook that would add the '<score>_cat' columns, and it expects a
# `mems_df` frame that no visible cell creates.
# '''
#  Categorize each participant according to their score(s) and 
#  add category to each observation
# '''
# score_props = {k + '_mean': {'bins': [-1, v['max_val'] / 3.0, 
#                                       2 * v['max_val'] / 3.0, 
#                                       v['max_val']],
#                              'labels': ['low', 'medium', 'high']
#                             }
#                             for k, v in consts.SCORES.items()}

# dataset = features.gen_categories(dataset, score_props)
# dataset.rename({k + '_mean_cat': k + '_cat' for k in consts.SCORES.keys()},
#                 axis=1,
#                 inplace=True
#               )
# for k in consts.SCORES.keys():
#     target_col = k + '_cat'
#     print(target_col)
#     id_cols = ['PtID', 'MemsNum']
#     mems_df = mems_df.merge(dataset[id_cols + [target_col]], on=id_cols)

# mems_df

In [None]:
# Makes no sense - fix
# Line plot of 'adherent' against 'study_month', one line per 'FACTB_cat' value.
# NOTE(review): `mems_df` is not created by any cell visible above, and
# 'FACTB_cat' only appears in the disabled cell before this one - this cell
# presumably fails on a fresh kernel; confirm.
# NOTE(review): seaborn is already imported in the first cell; this import is redundant.
import seaborn as sns
sns.lineplot(x='study_month', y='adherent', hue='FACTB_cat', data=mems_df)

In [None]:
# Visualize adherence frequency
# Count rows per participant in mems_df.
# NOTE(review): `mems_df` is not created by any cell visible above - confirm where it comes from.
# Note that this rebinds `df`, which earlier cells use for the long-form events frame.
df = mems_df['PtID'].value_counts().reset_index(name='count')
df.rename(columns={'index': 'PtID'}, inplace=True)
# A bare expression in the middle of a cell is not displayed; only the last one is.
df

# NOTE(review): `dataset` is a data.Dataset wrapper here, not a DataFrame;
# presumably this should be dataset.df.hist(...) - confirm Dataset exposes hist().
dataset.hist(column='percentMEMS8')

# Skewed right - most people took their meds :O

In [None]:
# Select the id, demographic, study-behavior and score columns from mems_df.
# NOTE(review): feat_cols as built above has one 'scores_<instrument>' list per
# instrument and no nested 'scores' dict, so feat_cols['scores'] presumably
# raises KeyError; `mems_df` is also not created by any visible cell - confirm.
mems_df[['PtID', 'MemsNum'] + feat_cols['demographics'] + 
        feat_cols['study_behavior'] + 
        list(itertools.chain(*[v for k,v in feat_cols['scores'].items()]))]


In [None]:
# Rows whose percentMEMS8 * 100 exceeds 85.
# `data` is the pipeline module imported in the first cell and cannot be
# indexed; the participant-level frame lives on dataset.df.
adherence_pct = dataset.df['percentMEMS8'] * 100
dataset.df[adherence_pct > 85]  # TODO: Check the lit for preferred threshold for adherence


# Possible research questions
- Can MEMS data (frequency, times of day, etc) be used to predict different types of wellbeing? (e.g., social wellbeing)
- Can demographic + wellbeing data at baseline (does it have to just be at baseline?) be used to predict long-term adherence?

- Worth looking at both of these?

- What are the most important determinants of adherence? wellbeing? demographics?
- What kind of phenotypes emerge?

In [None]:
# Get average scores for assessments that were administered at multiple timepoints.
# Uses the names the cells above define: consts.SCORES for the instruments,
# dataset.df for the participant frame and feat_cols['scores_<instrument>'] for
# each instrument's aggregate columns. (`scores`, a DataFrame called `data` and
# feat_cols['scores'] do not exist in this notebook.)
for k in consts.SCORES:
    newcol = k + '_mean'
    category = 'scores_' + k

    # Average only the aggregate columns, never a '<k>_mean' column left over
    # from an earlier run of this cell
    timepoint_cols = [col for col in feat_cols[category] if col != newcol]
    dataset.df[newcol] = dataset.df[timepoint_cols].mean(axis=1)

    # Be sure to include this new "mean" column in our list of feature columns for this score
    if newcol not in feat_cols[category]:
        feat_cols[category].append(newcol)

In [None]:
# Summarize and plot the distribution of every column with 'mean' in its name.
# `data` is the pipeline module, so the columns are read from dataset.df.
mean_cols = [col for col in dataset.df.columns if 'mean' in col]
mean_scores_df = dataset.df[mean_cols]

for col in mean_scores_df.columns:
    # Drop missing values first: np.percentile returns nan if any value is NaN,
    # whereas Series.median() already skips them.
    s = mean_scores_df[col].dropna()
    if s.empty:
        continue

    fig, ax = plt.subplots()
    print(scipy.stats.describe(s))
    print('25th Percentile: ' + str(np.percentile(s, 25)))
    print('75th Percentile: ' + str(np.percentile(s, 75)))
    print('median: ' + str(s.median()))
    # Draw on the axes created above rather than relying on the implicit current axes
    sns.distplot(s, ax=ax)
    plt.show()
    
# Most scores skewed toward higher quality of life. Use Median when dividing them up into 'high/low'


In [None]:
# TODO: Ask Kristi about any stat results they've already completed

# Number of MEMS events per participant.
# NOTE(review): `mems_df` is not created by any cell visible above - confirm where it comes from.
# groupby().size() yields the same counts as value_counts() and gives a 'PtID'
# column on every pandas version, so no rename is needed.
df = mems_df.groupby('PtID').size().reset_index(name='n_mems_events')

# Per-participant columns with 'mean' in their name. `data` is the pipeline
# module, so they are read from dataset.df.
df2 = dataset.df[['PtID'] + [col for col in dataset.df.columns if 'mean' in col]].copy()

# Label each value low/high relative to its column's median. The labels are
# added to df2 itself so they stay attached to the right PtID; assigning them
# straight onto `df` lined rows up by position across two differently ordered frames.
mean_cols = list(df2.columns[1:])
for col in mean_cols:
    col_median = df2[col].median()
    df2[col + '_group'] = np.where(df2[col] < col_median, 'low', 'high')

df = df.merge(df2, on='PtID')

group_cols = [col for col in df.columns if 'group' in col]

# Average number of MEMS events in the low and high group of each score
for col in group_cols:
    df3 = df.groupby([col])['n_mems_events'].mean().reset_index(name='avg_n_mems_events')
    print(df[col].value_counts())
    sns.barplot(y='avg_n_mems_events', x=col, data=df3)
    plt.show()

### Demographics

In [None]:
# Get a demographics dataframe with meaningful labels
# NOTE(review): `data` is the pipeline module imported in the first cell, and no
# `codebook` variable is defined in the visible cells; presumably these should be
# dataset.df and consts.CODEBOOK - confirm.
# NOTE(review): dataset.clean() was already given to_map=consts.CODEBOOK, so the
# codes may already be labels and mapping again could yield NaN - verify before changing.
dem_df = data[feat_cols['demographics']]
for col in dem_df.columns:
    # Replace each value with its entry in that column's codebook mapping
    dem_df[col] = dem_df[col].map(codebook[col])

dem_df

In [None]:
# One pie chart (and matching percentage table) per demographic column
for col in dem_df.columns:

    # Participants per category, most common first
    df = (
        dem_df.groupby(col)
        .size()
        .reset_index(name='N')
        .sort_values(by=['N'], ascending=False)
    )

    pie, ax = plt.subplots(figsize=[10, 6])
    plt.pie(x=df['N'], autopct="%.1f%%", labels=df[col], pctdistance=0.5)
    plt.title(col.capitalize(), fontsize=14)
    # pie.savefig('results/figures/demographics_edu.png', bbox_inches="tight")

    # Share of participants in each category, rounded to whole percent
    df['percentage'] = round(100 * df['N'] / df['N'].sum(), 0)
    # df.to_csv('results/tables/demographics_edu.csv')