# Breast Cancer Project

## Setup

In [None]:
# Credit to Tyler Spears and Sonia Baee, who started the original pipeline code (e.g., reading in files)

# imports
import sys
import os
import functools
import pathlib
import glob
import collections
import itertools
import re
import random
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

import numpy as np
import pandas as pd
import pipeline

from sklearn import datasets
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import scipy
from scipy.spatial.distance import cdist

# visualization libraries
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import seaborn as sns
import matplotlib.pyplot as plt
# Figure defaults for the whole notebook, applied in a single update:
# automatic layout adjustment and an RGBA (1, 1, 1, 1) figure background.
plt.rcParams.update({
    'figure.autolayout': True,
    'figure.facecolor': [1.0, 1.0, 1.0, 1.0],
})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2
    

In [None]:
# Load the processed study tables. The result is indexed by table name in the
# cells below (e.g. 'pre_post', 'app_launch', 'blsurvey', 'wklysurvey').
master_dataset = pipeline.import_processed_files()
# Pull the 'features' entry of the file dictionary built for the 'features'
# directory. NOTE(review): not referenced again in the visible cells — confirm
# whether it is still needed.
master_featureset = pipeline.create_file_dictionary('features')['features']

### Demographic & Mood Features 

In [None]:
# Initialize featureset to be the Pre- and Post- measures
features = master_dataset['pre_post']
# `features` is the same object as the master table, so this drop mutates it.
# errors='ignore' keeps the cell re-runnable once 'dataset' is already gone
# (previously a second run raised KeyError).
features.drop('dataset', axis=1, inplace=True, errors='ignore')

# Add helper columns denoting who has app data and who dropped out.
# `isin` does the membership test in one vectorised pass instead of scanning
# a Python list once per participant.
app_users = master_dataset['app_launch']['pid'].unique()
features['has_app_data'] = features['pid'].isin(app_users)
features['dropped'] = [
    pipeline.match_value(master_dataset['dropouts'], 'pid', pid, 'dropped')
    for pid in features['pid']
]
features

#### Baseline PHQ-4 & PROMIS Scores

In [None]:
# Denote that the baseline was the PHQ4, specifically.
# (index=str also converts the row labels to strings, as before.)
features.rename(index=str, columns={'phq_bl': 'phq4_bl', 'phq_post': 'phq4_post'}, inplace=True)

# Create new columns for phq4 subscales
df = master_dataset['blsurvey']

for subscale, descriptors in pipeline.PHQ4_SCORING['subscales'].items():
    cols = descriptors['cols']

    # Recode the raw survey answers to their numeric codes.
    # NOTE: this rewrites the master 'blsurvey' table in place, so re-running
    # the cell without re-importing the data maps already-coded values again.
    for col in cols:
        df[col] = df[col].map(descriptors['codes'])

    # Subscale score = sum of its item codes. A column-wise sum replaces the
    # former row-by-row apply and yields the same totals (NaNs are skipped).
    new_col = 'phq4_' + str(subscale) + '_bl'
    df[new_col] = df[cols].sum(axis=1)

    # Left-join the new score onto the feature table by participant id.
    subscale_scores = df.loc[:, ['pid', new_col]]
    features = features.set_index('pid').join(subscale_scores.set_index('pid')).reset_index()


# Divide users into baseline 'low' or 'high' depressed and 'low' or 'high' anxious groups,
# based on PHQ4 scoring and PROMIS t-score cut points.
# A participant is 'high' when either measure reaches its threshold; a missing
# score compares as False, so it only counts as 'high' via the other measure.
high_dep = (
    (features['phq4_depression_bl'] >= pipeline.PHQ4_THRESH)
    | (features['promis_dep_bl'] >= pipeline.PROMIS_THRESH)
)
high_anx = (
    (features['phq4_anxiety_bl'] >= pipeline.PHQ4_THRESH)
    | (features['promis_anx_bl'] >= pipeline.PROMIS_THRESH)
)
features['trait_dep_group'] = np.where(high_dep, 'high', 'low')
features['trait_anx_group'] = np.where(high_anx, 'high', 'low')

features.head(25)

#### Weekly Mood Scores

In [None]:
# Associate weekly survey responses with weeks of the study.
# The commented-out statements below are the one-off step that derived the
# `weekofstudy` column and wrote data/processed/wklysurvey_processed.csv;
# they are kept for reference only.
# df = master_dataset['wklysurvey']
# df['enddate'] = pd.to_datetime(df['enddate'])

# Parse the reminder timestamps' 'date' column to datetimes. `timestamps` is
# the table stored in master_dataset, so the conversion is applied to it directly.
timestamps = master_dataset['wklysurvey_timestamps']
timestamps['date'] = pd.to_datetime(timestamps['date'])

# df['weekofstudy'] = [pipeline.find_week_by_timestamp(master_dataset['wklysurvey_timestamps'], 
#                                                      row['pid'], row['enddate']) 
#                      for index, row in df.iterrows()]
# pd.set_option('display.max_rows', 180)

# df.to_csv('data/processed/wklysurvey_processed.csv')
# df[['pid', 'startdate', 'enddate','weekofstudy']]

# IMPORTANT: If NaNs in weekofstudy col, fix manually and re-import dataset!

In [None]:
# # Run only after manual correction, if needed
# master_dataset['wklysurvey'] = pd.read_csv('data/processed/wklysurvey_processed.csv')
# Display the id, survey dates and week-of-study columns so the week
# assignment can be checked by eye.
master_dataset['wklysurvey'][['pid', 'startdate', 'enddate','weekofstudy']]

In [None]:
# Response rates for the weekly surveys, per reminder method, saved for later
# analysis: completed surveys divided by sent surveys.
by_method = timestamps.groupby('method')

# Number of timestamp rows per method (stored under the name 'num_users').
df = by_method['method'].count().reset_index(name='num_users')

completed_total = by_method['completed'].sum()
sent_total = by_method['sent'].sum()
response_rates = (completed_total / sent_total).reset_index(name="response_rate")

df = df.merge(response_rates, on="method")
df.to_csv('features/response_rates.csv')
df

In [None]:
# Construct weekly mood scores (anxiety and depression).
# For each affect this adds columns w2_<affect> .. w7_<affect> from the weekly
# survey, followed by w1_<affect> from the first-week surveys.

wkly_survey = master_dataset['wklysurvey']
first_week_surveys = master_dataset['fwsurveys']

for c in ['anx', 'dep']:

    # Weeks 2-7: one column per week, left-joined so that participants
    # without a response for that week keep a NaN.
    for i in range(2, 8):
        week_col = 'w' + str(i) + '_' + c
        mood_df = wkly_survey.loc[wkly_survey['weekofstudy'] == i, ['pid', c]]
        mood_df = mood_df.rename(columns={c: week_col})
        features = features.set_index('pid').join(mood_df.set_index('pid')).reset_index()

    # Week 1: rounded mean of the participant's first-week survey scores
    # (the original note describes these as 7 daily mood scores).
    df = first_week_surveys.groupby('pid')[c].mean().round(0).reset_index(name='w1' + '_' + c)

    # Left merge, consistent with the weekly joins above: the previous inner
    # merge silently dropped every participant lacking first-week surveys.
    features = features.merge(df, on="pid", how="left")

features

In [None]:
# Save the participant-level feature table (pre/post measures, trait groups
# and weekly mood columns) to CSV; the DataFrame index is written as well.
# NOTE(review): no separator is inserted after ALL_USERS_DIR, so the constant
# presumably ends with '/' — confirm in pipeline.
features.to_csv('features/' + pipeline.ALL_USERS_DIR + 'pre_post.csv')

### App Usage Features

Before we compute features related to app usage, we should tidy up the app_launch data table.

In [None]:
app_launch = master_dataset['app_launch']

# Keep only launches with a positive duration.
# NOTE(review): this step was previously described as removing launches
# shorter than 5 seconds, but the threshold in code is 0 — confirm which one
# is intended before changing it.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the master table.
app_launch = app_launch[app_launch['duration'] > 0].copy()

app_launch['date'] = pd.to_datetime(app_launch['date'], errors='coerce')
app_launch['day'] = app_launch['date'].dt.day
# NOTE: `.dt.week` is deprecated in recent pandas releases; switch to
# `.dt.isocalendar().week` when the environment is upgraded.
app_launch['week'] = app_launch['date'].dt.week
app_launch['hour'] = app_launch['date'].dt.hour

# Identify the participant's start date
app_launch['startdate'] = [
    pipeline.match_value(master_dataset['blsurvey'], 'pid', x, 'startdate')
    for x in app_launch['pid']
]

# Identify the time-of-day epoch (bins and labels come from pipeline.EPOCHS)
# during which each app launch occurred
app_launch['epoch'] = pd.cut(app_launch['hour'], pipeline.EPOCHS['bins'], labels=pipeline.EPOCHS['labels'])

# Determine the day and week of study, for each observation,
# based on the participant's start date
app_launch['timeelapsed'] = app_launch['date'] - pd.to_datetime(app_launch['startdate'])

# Rows whose date could not be parsed (coerced to NaT) or that have no start
# date cannot be placed in a study week. They would fail the week filter below
# anyway, and left in place they make the integer cast raise, so drop them here.
app_launch = app_launch[app_launch['timeelapsed'].notna()].copy()

elapsed_days = app_launch['timeelapsed'].dt.days
# NOTE(review): `.dt.days` is already a whole number of days, so launches in
# the first 24 hours get week 0 and are removed by the filter below — confirm
# that excluding the start day is intended.
app_launch['weekofstudy'] = np.ceil(elapsed_days / 7.0).astype(int)
app_launch['dayofstudy'] = np.ceil(elapsed_days)
app_launch['dayofweek'] = app_launch['date'].dt.dayofweek

# Keep only the data from the weeks of the study
app_launch = app_launch[(1 <= app_launch['weekofstudy']) & (app_launch['weekofstudy'] <= 7)]

# Extract list of apps
apps = list(app_launch['package'].unique())
app_launch

In [None]:
# Save the tidied launch table out to csv, to save time later
# (the DataFrame index is written as the first column).
app_launch.to_csv('data/processed/app_launch_processed.csv')

#### Weekly

In [None]:
# Weekly anxiety/depression survey scores per participant and study week;
# passed to add_affect() as the table merged on ['pid', 'weekofstudy'].
wkly_moods = master_dataset['wklysurvey'][['pid', 'weekofstudy', 'anx', 'dep']]

def add_affect(df, features, timediv, wkly_affect_df=None):
    """Return `df` with affect columns attached.

    Parameters
    ----------
    df : DataFrame with a 'pid' column (and 'weekofstudy' when
        ``timediv == 'wkly'``).
    features : table handed to ``pipeline.match_value`` to look up each
        participant's trait group by 'pid'.
    timediv : 'wkly' to also merge the weekly affect scores; any other value
        skips that merge.
    wkly_affect_df : weekly affect table keyed by ['pid', 'weekofstudy'];
        required when ``timediv == 'wkly'``.

    Returns
    -------
    A new DataFrame; the frame passed in is left unmodified.

    Raises
    ------
    ValueError
        If ``timediv == 'wkly'`` and no ``wkly_affect_df`` is given.
    """
    if timediv == 'wkly':
        if wkly_affect_df is None:
            raise ValueError("wkly_affect_df is required when timediv == 'wkly'")
        # Record wkly affect by week (left merge keeps every row of df)
        df = pd.merge(df, wkly_affect_df, on=['pid', 'weekofstudy'], how="left")
    else:
        # Work on a copy so the caller's frame (possibly a slice of another
        # table) is not modified by the column assignments below.
        df = df.copy()

    # Record trait affect groups for anxiety and depression: one column per
    # key of pipeline.TRAIT_AFFECT_GROUPS, looked up per participant.
    for group in pipeline.TRAIT_AFFECT_GROUPS:
        df[group] = df['pid'].apply(lambda x: pipeline.match_value(features, 'pid', x, group))

    return df

In [None]:
# Weekly usage features: launch frequency, distinct days of use and duration
# statistics per study week, first per app and then aggregated over all apps.

# --- App level: one row per participant, app and study week ---
applevel_keys = ['pid', 'package', 'weekofstudy']

launch_counts = app_launch.groupby(['pid', 'package'])['weekofstudy'].value_counts()
ind_applevel = launch_counts.reset_index(name='frequency')

df = app_launch.groupby(applevel_keys)['dayofweek'].nunique().reset_index(name='daysofuse')
ind_applevel = pd.merge(ind_applevel, df, on=list(df.columns)[:-1], how="outer")

# All but the last seven columns of the duration table are used as merge keys.
df = pipeline.calc_duration_noepoch(app_launch, groupbycols=['pid', 'package', 'weekofstudy'])
ind_applevel = pd.merge(ind_applevel, df, on=list(df.columns)[:-7], how="outer")

# --- Aggregate: one row per participant and study week ---
agg_keys = ['pid', 'weekofstudy']

launch_counts = app_launch.groupby('pid')['weekofstudy'].value_counts()
ind_agg = launch_counts.reset_index(name='frequency')

df = app_launch.groupby(agg_keys)['dayofweek'].nunique().reset_index(name='daysofuse')
ind_agg = pd.merge(ind_agg, df, on=list(df.columns)[:-1], how="outer")

df = pipeline.calc_duration_noepoch(app_launch, groupbycols=['pid', 'weekofstudy'])
ind_agg = pd.merge(ind_agg, df, on=list(df.columns)[:-7], how="outer")

# Round, attach weekly mood scores and trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'wkly', wkly_moods)
ind_agg = add_affect(ind_agg.round(0), features, 'wkly', wkly_moods)

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_applevel.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_agg.csv')


#### Weekly x Time of Day
**IMPORTANT NOTE:** Throughout the code, I've referred to the time of day as the "epoch". However, note that a week is also considered an epoch, by our own definition in the paper. Probably need to change this terminology, at some point. - Anna

In [None]:
# Weekly usage features broken down by time-of-day epoch.
# pipeline.weekly_epoch_breakdown / calc_duration_has_epoch return tables
# that are merged back together on `merge_cols`.

# --- App level ---
merge_cols = ['pid', 'package', 'weekofstudy']

epoch_counts = app_launch.groupby(['pid', 'package', 'weekofstudy'])['epoch'].value_counts()
ind_applevel = pipeline.weekly_epoch_breakdown(epoch_counts, 'frequency', merge_cols)

epoch_days = app_launch.groupby(['pid', 'package', 'weekofstudy', 'epoch'])['dayofweek'].nunique()
df = pipeline.weekly_epoch_breakdown(epoch_days, 'daysofuse', merge_cols)
ind_applevel = pd.merge(ind_applevel, df, on=merge_cols, how="outer")

df = pipeline.calc_duration_has_epoch(
    app_launch, groupbycols=['pid', 'package', 'weekofstudy', 'epoch']
)
ind_applevel = pd.merge(ind_applevel, df, on=merge_cols, how="outer")

# --- Aggregate (all apps together) ---
merge_cols = ['pid', 'weekofstudy']

epoch_counts = app_launch.groupby(['pid', 'weekofstudy'])['epoch'].value_counts()
ind_agg = pipeline.weekly_epoch_breakdown(epoch_counts, 'frequency', merge_cols)

epoch_days = app_launch.groupby(['pid', 'weekofstudy', 'epoch'])['dayofweek'].nunique()
df = pipeline.weekly_epoch_breakdown(epoch_days, 'daysofuse', merge_cols)
ind_agg = pd.merge(ind_agg, df, on=['pid', 'weekofstudy'], how="outer")

df = pipeline.calc_duration_has_epoch(app_launch, groupbycols=['pid', 'weekofstudy', 'epoch'])
ind_agg = pd.merge(ind_agg, df, on=merge_cols, how="outer")

# Round, attach weekly mood scores and trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'wkly', wkly_moods)
ind_agg = add_affect(ind_agg.round(0), features, 'wkly', wkly_moods)

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_epoch_applevel.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_epoch_agg.csv')

#### Weekly x Time of Day (Long Form?)

In [None]:
def _weekly_epoch_long_form(launches, keys):
    """Build a long-form table with frequency, daysofuse and duration per `keys`.

    `keys` is the full grouping (its last entry is 'epoch'); launch counts are
    taken per epoch within the remaining keys, and the three measures are
    outer-merged on `keys`.
    """
    table = launches.groupby(keys[:-1])['epoch'].value_counts().reset_index(name="frequency")

    days_used = launches.groupby(keys)['dayofweek'].nunique().reset_index(name='daysofuse')
    table = pd.merge(table, days_used, on=keys, how="outer")

    total_duration = launches.groupby(keys)['duration'].sum().reset_index(name='duration')
    return pd.merge(table, total_duration, on=keys, how="outer")


# Applevel: one row per participant, app, study week and epoch
groupbycols = ['pid', 'package', 'weekofstudy', 'epoch']
ind_applevel = _weekly_epoch_long_form(app_launch, groupbycols)

# Aggregate: one row per participant, study week and epoch
groupbycols = ['pid', 'weekofstudy', 'epoch']
ind_agg = _weekly_epoch_long_form(app_launch, groupbycols)

# Round, attach weekly mood scores and trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'wkly', wkly_moods)
ind_agg = add_affect(ind_agg.round(0), features, 'wkly', wkly_moods)

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_epoch_applevel_lf.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'wkly_epoch_agg_lf.csv')

#### Entire Study

In [None]:
# Applevel
ind_applevel = app_launch.groupby('pid')['package'].value_counts().reset_index(name='frequency')
df = app_launch.groupby(['pid','package'])['dayofstudy'].nunique().reset_index(name='daysofuse')

df = pipeline.calc_duration_noepoch(app_launch, groupbycols = ['pid','package'])
ind_applevel = pd.merge(
    ind_applevel, 
    df, 
    on = list(df.columns)[:-7],
    how="outer"
)

# Aggregate
ind_agg = app_launch['pid'].value_counts().reset_index(name="frequency")
ind_agg.rename(columns={'index': 'pid'}, inplace=True)

df = app_launch.groupby('pid')['dayofstudy'].nunique().reset_index(name='daysofuse')
ind_agg = pd.merge(
    ind_agg, 
    df, 
    on = list(df.columns)[:-1],
    how="outer"
)

df = pipeline.calc_duration_noepoch(app_launch, groupbycols = ['pid'])
ind_agg = pd.merge(
    ind_agg, 
    df, 
    on = list(df.columns)[:-7],
    how="outer"
)

ind_applevel = ind_applevel.round(0)
ind_agg = ind_agg.round(0)


ind_applevel = add_affect(ind_applevel.round(0), features, 'study')
ind_agg =  add_affect(ind_agg.round(0), features, 'study')
                
ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_applevel.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_agg.csv')


#### Entire Study (Long form?)

In [None]:
# Whole-study usage in long form: frequency, daysofuse and total duration,
# per participant and app, then per participant.

# (source column, aggregation, output column) for the per-group measures
measures = (
    ('dayofstudy', 'nunique', 'daysofuse'),
    ('duration', 'sum', 'duration'),
)

# Applevel
groupbycols = ['pid', 'package']

launch_counts = app_launch.groupby(groupbycols[:-1])[groupbycols[-1]].value_counts()
ind_applevel = launch_counts.reset_index(name="frequency")

for source_col, how_agg, out_col in measures:
    df = app_launch.groupby(groupbycols)[source_col].agg(how_agg).reset_index(name=out_col)
    ind_applevel = pd.merge(ind_applevel, df, on=groupbycols, how="outer")

# Aggregate
groupbycols = ['pid']

ind_agg = app_launch['pid'].value_counts().reset_index(name='frequency')
# Older pandas names the key column 'index'; otherwise this is a no-op.
ind_agg.rename(columns={'index': 'pid'}, inplace=True)

for source_col, how_agg, out_col in measures:
    df = app_launch.groupby(groupbycols)[source_col].agg(how_agg).reset_index(name=out_col)
    ind_agg = pd.merge(ind_agg, df, on=groupbycols, how="outer")

# Round, attach the trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'study')
ind_agg = add_affect(ind_agg.round(0), features, 'study')

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_applevel_lf.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_agg_lf.csv')

#### Study x Time of Day

In [None]:
# Whole-study usage features broken down by time-of-day epoch.
# pipeline.weekly_epoch_breakdown / calc_duration_has_epoch return tables
# that are merged back together on `merge_cols`.

# --- App level ---
merge_cols = ['pid', 'package']

epoch_counts = app_launch.groupby(['pid', 'package'])['epoch'].value_counts()
ind_applevel = pipeline.weekly_epoch_breakdown(epoch_counts, 'frequency', merge_cols)

epoch_days = app_launch.groupby(['pid', 'package', 'epoch'])['dayofstudy'].nunique()
df = pipeline.weekly_epoch_breakdown(epoch_days, 'daysofuse', merge_cols)
ind_applevel = pd.merge(ind_applevel, df, on=merge_cols, how="outer")

df = pipeline.calc_duration_has_epoch(app_launch, groupbycols=['pid', 'package', 'epoch'])
ind_applevel = pd.merge(ind_applevel, df, on=merge_cols, how="outer")

# --- Aggregate (all apps together) ---
merge_cols = ['pid']

epoch_counts = app_launch.groupby('pid')['epoch'].value_counts()
ind_agg = pipeline.weekly_epoch_breakdown(epoch_counts, 'frequency', merge_cols)

epoch_days = app_launch.groupby(['pid', 'epoch'])['dayofstudy'].nunique()
df = pipeline.weekly_epoch_breakdown(epoch_days, 'daysofuse', merge_cols)
ind_agg = pd.merge(ind_agg, df, on=['pid'], how="outer")

df = pipeline.calc_duration_has_epoch(app_launch, groupbycols=['pid', 'epoch'])
ind_agg = pd.merge(ind_agg, df, on=merge_cols, how="outer")

# Round to whole numbers (rounding is applied again at the add_affect call).
ind_applevel = ind_applevel.round(0)
ind_agg = ind_agg.round(0)

# Attach the trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'study')
ind_agg = add_affect(ind_agg.round(0), features, 'study')

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_epoch_applevel.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_epoch_agg.csv')


In [None]:
# Display the aggregate table built in the previous cell for inspection.
ind_agg

#### Study x Time of Day (Long Form?)

In [None]:
def _study_epoch_long_form(launches, keys):
    """Build a long-form table with frequency, daysofuse and duration per `keys`.

    Launch counts are taken for the last key within the remaining keys; the
    distinct study days and summed duration are grouped by all of `keys`, and
    the three measures are outer-merged on `keys`.
    """
    table = launches.groupby(keys[:-1])[keys[-1]].value_counts().reset_index(name="frequency")

    days_used = launches.groupby(keys)['dayofstudy'].nunique().reset_index(name='daysofuse')
    table = pd.merge(table, days_used, on=keys, how="outer")

    total_duration = launches.groupby(keys)['duration'].sum().reset_index(name='duration')
    return pd.merge(table, total_duration, on=keys, how="outer")


# Applevel: one row per participant, app and epoch
groupbycols = ['pid', 'package', 'epoch']
ind_applevel = _study_epoch_long_form(app_launch, groupbycols)

# Aggregate: one row per participant and epoch
groupbycols = ['pid', 'epoch']
ind_agg = _study_epoch_long_form(app_launch, groupbycols)

# Round, attach the trait groups, then write both tables.
ind_applevel = add_affect(ind_applevel.round(0), features, 'study')
ind_agg = add_affect(ind_agg.round(0), features, 'study')

ind_applevel.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_epoch_applevel_lf.csv')
ind_agg.to_csv('features/' + pipeline.APP_USERS_DIR + 'study_epoch_agg_lf.csv')

In [None]:
# Preview the first rows of the app-level table built in the previous cell.
ind_applevel.head()

In [None]:
# List the table names available in the loaded dataset.
master_dataset.keys()

### Construct Weekly Feature Vectors

In [None]:
# Build the weekly feature vectors from the tidied launch table and the weekly
# survey table; the 'wkly' argument selects the time division.
# NOTE(review): the layout of the returned table is defined in pipeline — see
# construct_feature_vectors for the column set.
wkly_vectors = pipeline.construct_feature_vectors(app_launch, master_dataset['wklysurvey'], 'wkly')
wkly_vectors

In [None]:
# Write the weekly feature vectors to CSV (index included).
# NOTE(review): other cells append file names to APP_USERS_DIR without a '/',
# so the constant presumably ends with one and this path contains '//' —
# harmless, but worth confirming.
wkly_vectors.to_csv('features/' + pipeline.APP_USERS_DIR + '/all_ind_wkly.csv')

### Entire Study Stats (Added 9/8 as Additional Feature Gen)

In [None]:
# Mean / std / var of each whole-study usage measure, per trait affect group.
# Result columns: trait_affect, trait_group, measure, mean, std, var.

# NOTE: this rebinds the notebook-level `features` name to the study_agg table.
features = pd.read_csv('features/' + pipeline.APP_USERS_DIR + '/study_agg.csv')
cols = ['frequency', 'daysofuse', 'duration']

# Rescale duration by 1/60 exactly once (presumably seconds -> minutes).
# This used to run inside the trait-group loop, so every group after the
# first was divided by 60 again.
features['duration'] = features['duration'] / 60.0

frames = []
for trait_group in pipeline.TRAIT_AFFECT_GROUPS:
    trait_affect = 'Anxiety' if 'anx' in trait_group else 'Depression'

    for col in cols:
        # One row per group value with the three summary statistics.
        df = features.groupby(trait_group)[col].agg(['mean', 'std', 'var']).reset_index()

        df.rename(columns={trait_group: 'trait_group'}, inplace=True)
        df['trait_group'] = df['trait_group'].apply(lambda x: x.capitalize())

        df.insert(0, 'trait_affect', trait_affect)
        # Use the display label from WKLY_AFFECT when the column has one.
        df.insert(2, 'measure', pipeline.WKLY_AFFECT.get(col, col))

        frames.append(df)

# DataFrame.append is gone from current pandas; a single concat builds the
# same table and works on old and new versions alike.
stats = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
stats

In [None]:
# Write the whole-study trait-group statistics to CSV (index included).
stats.to_csv('features/' + pipeline.APP_USERS_DIR + '/trait_study_stats.csv')

### Trait Affect x Wkly Stats (Added 9/8 as Additional Feature Gen)

In [None]:
# Mean / std / var of each weekly affect score and weekly usage measure, per
# trait affect group.
# Result columns: trait_affect, trait_group, measure, mean, std, var.

# Read the weekly aggregate table from where it is written
# ('features/' + APP_USERS_DIR + 'wkly_agg.csv'); the previous path
# 'features/wkly_agg.csv' is not produced by any cell in this notebook.
# NOTE: this rebinds the notebook-level `features` name to that table.
features = pd.read_csv('features/' + pipeline.APP_USERS_DIR + '/wkly_agg.csv')
cols = list(pipeline.WKLY_AFFECT.keys()) + ['frequency', 'daysofuse', 'duration']

# Rescale duration by 1/60 exactly once (presumably seconds -> minutes).
# This used to run inside the trait-group loop, so every group after the
# first was divided by 60 again.
features['duration'] = features['duration'] / 60.0

frames = []
for trait_group in pipeline.TRAIT_AFFECT_GROUPS:
    trait_affect = 'Anxiety' if 'anx' in trait_group else 'Depression'

    for col in cols:
        # One row per group value with the three summary statistics.
        df = features.groupby(trait_group)[col].agg(['mean', 'std', 'var']).reset_index()

        df.rename(columns={trait_group: 'trait_group'}, inplace=True)
        df['trait_group'] = df['trait_group'].apply(lambda x: x.capitalize())

        df.insert(0, 'trait_affect', trait_affect)
        # Use the display label from WKLY_AFFECT when the column has one.
        df.insert(2, 'measure', pipeline.WKLY_AFFECT.get(col, col))

        frames.append(df)

# DataFrame.append is gone from current pandas; a single concat builds the
# same table and works on old and new versions alike.
stats = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
stats

In [None]:
# Write the weekly trait-group statistics to CSV (index included).
# The path previously started with 'features' and no '/', which glued the
# directory name onto APP_USERS_DIR; it now matches the other output paths.
stats.to_csv('features/' + pipeline.APP_USERS_DIR + '/trait_wkly_stats.csv')

In [None]:
# Weekly state scores (anx/dep) per participant and week, with the trait
# group columns attached.
# .copy() detaches the column selection from the master table so that adding
# columns to it does not write into a slice.
df = master_dataset['wklysurvey'][['pid','weekofstudy','anx', 'dep']].copy()
# NOTE(review): `features` here is whatever the last cell bound it to (the
# stats cells reload it from CSV) — confirm it still covers all participants.
df = add_affect(df, features, 'study')
df

In [None]:
# Write the weekly trait/state table to CSV (index included).
# NOTE(review): the directory is hard-coded here, whereas other cells build it
# from pipeline.ALL_USERS_DIR — confirm the two agree.
df.to_csv('features/all_users/wkly_trait_state.csv')