In [None]:
"""
@author: abaglione and lihuacai

Credit to Tyler Spears and Sonia Baee, who developed the precursor
to this preprocessing script
"""

# imports
import sys
import os
import functools
import pathlib
import glob
import collections
import itertools
import re
import random
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

import numpy as np
import pandas as pd
import copy
import pipeline

from sklearn import impute
from sklearn import datasets
from sklearn import svm, linear_model, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import scipy
from scipy.spatial.distance import cdist

# visualization libraries
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2

In [None]:
# Load the weekly feature vectors from disk
wkly_df = pd.read_csv('features/all_ind_wkly.csv')

# Remember the column names of the raw frame; later cells filter on them
featnames = wkly_df.columns.tolist()

wkly_df

In [None]:
# Impute missing survey data -- weekly data.
# Survey columns are imputed on their own; every other column of the weekly
# frame is carried over untouched and re-joined at the end.
impfeats = ['cope_alcohol_tob', 'physical_pain', 'connected', 'receive_support', 'anx',
            'dep', 'active', 'support_others', 'healthy_food']
featnames_app = [name for name in featnames if name not in impfeats]
feats_app = wkly_df.drop(columns=impfeats)
feats_napp = wkly_df[impfeats].copy()

# Recode the -1 placeholder as NaN so the imputer sees it as missing.
# NOTE(review): relies on pipeline.fillmissing modifying feats_napp in place -- confirm.
pipeline.fillmissing(feats_napp, impfeats, -1, np.nan)

# I think Lee was imputing the app features and survey features separately...hmm
imputer = IterativeImputer(max_iter=50, random_state=1008, add_indicator=True)
imputer.fit(feats_napp)

# With add_indicator=True scikit-learn appends an indicator column ONLY for the
# features that contained missing values during fit, so the indicator names
# must be derived from the fitted imputer rather than assumed to cover every
# survey feature (otherwise the column count can mismatch and the DataFrame
# construction fails).
flagged_feats = [impfeats[pos] for pos in imputer.indicator_.features_]
impfeats_ind = [name + '_ind' for name in impfeats]
impfeats_c = impfeats + impfeats_ind
feats_napp = pd.DataFrame(
    np.round(imputer.transform(feats_napp)),
    columns=impfeats + [name + '_ind' for name in flagged_feats],
    index=feats_napp.index)  # keep the original row labels so the concat below aligns

# Features with nothing missing get an all-zero indicator, so every
# '<feature>_ind' column exists (in a fixed order) for the cells below.
feats_napp = feats_napp.reindex(columns=impfeats_c, fill_value=0.0)

all_feats = pd.concat([feats_app, feats_napp], copy=True, axis=1)
all_feats

In [None]:
# Bucket the raw column names by the engagement metric they mention
frequency_feats = [col for col in featnames if 'frequency' in col]
reg_feats = [col for col in featnames if 'daysofuse' in col]
lau_dur_feats = [col for col in featnames if 'betweenlaunch' in col]
dur_feats = [col for col in featnames
             if 'betweenlaunch' not in col and 'duration' in col]

# Correction for missing data: the -1 placeholder becomes 0 for most features.
# Between-launch durations instead get 3600*24*7 (the number of seconds in a
# week -- presumably the unit of those columns; confirm).
for cols, replacement in [(frequency_feats, 0),
                          (reg_feats, 0),
                          (dur_feats, 0),
                          (lau_dur_feats, 3600*24*7)]:
    pipeline.fillmissing(all_feats, cols, -1, replacement)

# Constant column serving as the intercept of the linear mixed model
all_feats['intercept'] = 1

# Ideas for transforming the outcomes (anx, dep):
#   - week-to-week change as the outcome
#   - change relative to the baseline level as the outcome
#   - ratio (rather than difference) between the weekly value and the baseline
#   - subtracting the global average

for outcome in ('anx', 'dep'):
    all_feats[outcome].hist()

# Binary classification outcomes derived from the survey scores
all_feats['dep_cat'] = np.where(all_feats['dep'].ge(4), 1, 0)
all_feats['anx_cat'] = np.where(all_feats['anx'].ge(3), 1, 0)

In [None]:
# ------ Feature Set Spec -----------

APPS = ['aspire', 'boostme', 'dailyfeats', 'icope', 'mantra', 'messages',
        'moveme', 'relax', 'slumbertime', 'thoughtchallenger', 'worryknot']
# 'daysofuse' is the spelling used everywhere else in this notebook (the
# missing-value correction and the overall / most-used-app featuresets), so it
# has to be listed here for the per-app days-of-use columns to be picked up.
# 'days_of_use' is kept in case any column really is spelled that way.
ENGAGEMENT_METRICS = ['frequency', 'duration',
                      'betweenlaunch_duration', 'days_of_use', 'daysofuse']
TIMES_OF_DAY = ['morning', 'afternoon', 'evening', 'late_night']

# Survey Features Only (imputed values plus their missingness indicators)
survey_fs_cols = ['cope_alcohol_tob', 'physical_pain', 'connected', 'receive_support', 'active',
                  'support_others', 'healthy_food', 'cope_alcohol_tob_ind', 'physical_pain_ind',
                  'connected_ind', 'receive_support_ind', 'active_ind', 'support_others_ind',
                  'healthy_food_ind']

# App Features - All Apps
app_overall_fs_cols = ['weekofstudy', 'frequency', 'daysofuse', 'duration', 'duration_mean',
                       'duration_std', 'duration_min', 'duration_max', 'betweenlaunch_duration_mean',
                       'betweenlaunch_duration_std', 'num_apps_used']

# App Features - Individual Apps
# A column qualifies when its name mentions an app AND an engagement metric
# but no time-of-day bucket (matching is by substring).
app_ind_fs_cols = ['weekofstudy'] + [
    col for col in all_feats.columns
    if any(app in col for app in APPS)
    and any(metric in col for metric in ENGAGEMENT_METRICS)
    and not any(tod in col for tod in TIMES_OF_DAY)
]

app_ind_fs_cols

In [None]:
# Add one last feature - an indicator of which app was used most often.
# The app name is taken as the second '_'-separated token of the per-app
# frequency column with the largest value in each row.
# NOTE(review): idxmax returns the first column on ties, so a row with identical
# frequencies for every app is attributed to the first app -- confirm this is intended.
app_freq = all_feats[[col for col in app_ind_fs_cols if 'frequency' in col]]
all_feats['most_used_app'] = app_freq.idxmax(axis=1).str.split('_').str[1]

# Dummitize the most_used_app column.
# The prefix makes the columns stored in all_feats match the names recorded in
# mua_dummy_cols; without it the dummies are named after the bare app and any
# later selection by mua_dummy_cols raises a KeyError.
mua_dummy_df = pd.get_dummies(all_feats['most_used_app'], prefix='most_used_app')
mua_dummy_cols = list(mua_dummy_df.columns)
# Drop dummies left over from a previous run of this cell so re-running it
# does not create duplicated columns.
all_feats = pd.concat(
    [all_feats.drop(columns=mua_dummy_cols, errors='ignore'), mua_dummy_df], axis=1)


def rename_mapper(colname, app=None):
    """Return `colname` with the '_<app>' part removed.

    Parameters
    ----------
    colname : str
        Per-app column name.
    app : str, optional
        App name to strip. When omitted, the legacy behaviour is kept: the app
        is looked up from row `i` of all_feats['most_used_app'], where `i` is a
        notebook-level variable that must already exist.
    """
    if app is None:
        app = all_feats['most_used_app'][i]
    return colname.replace('_' + app, '')


# Create a subset with survey features + only features from the most used app.
# Rows are handled one app at a time: keep that app's columns and strip the
# app name so all rows share the same generic column names.
mua_dfs = []
for app, app_rows in all_feats.groupby('most_used_app', sort=False):
    app_cols = [col for col in app_ind_fs_cols if app in col]
    mua_dfs.append(
        app_rows[app_cols].rename(columns=functools.partial(rename_mapper, app=app)))

# Restore the original row order after the per-app grouping
app_mua_feats = pd.concat(mua_dfs, sort=False).reindex(all_feats.index)

# 'intercept' is included because the mixed-model cell selects it from this frame
survey_app_mua_feats = pd.concat(
    [all_feats[['pid', 'weekofstudy', 'anx', 'dep', 'anx_cat', 'dep_cat',
                'most_used_app', 'intercept']],
     all_feats[survey_fs_cols],
     app_mua_feats],
    axis=1, copy=True
)

survey_app_mua_feats

In [None]:
# Create last featureset
# App Features - Only Features from the Most Used App for a Given Observation (Row)
app_mua_fs_cols = ['weekofstudy', 'frequency', 'daysofuse',
                   'duration', 'duration_mean', 'duration_std',
                   'duration_min', 'duration_max', 'betweenlaunch_duration_mean',
                   'betweenlaunch_duration_std']

# Add new dummy columns to other featuresets.
# New lists are built (skipping names already present) instead of extending in
# place, so running this cell more than once does not duplicate column names.
app_overall_fs_cols = app_overall_fs_cols + [
    col for col in mua_dummy_cols if col not in app_overall_fs_cols]
app_ind_fs_cols = app_ind_fs_cols + [
    col for col in mua_dummy_cols if col not in app_ind_fs_cols]

app_ind_fs_cols

In [None]:
# Assemble the featureset lookup: name -> list of column names.
# The app-only featuresets are declared once, then each one is also combined
# with the survey columns under a 'survey_'-prefixed name.
app_featuresets = {
    'app_overall_fs': app_overall_fs_cols,
    'app_ind_fs': app_ind_fs_cols,
    'app_mua_fs': app_mua_fs_cols,
}

featuresets = {'survey_fs': survey_fs_cols}
featuresets.update(app_featuresets)
featuresets.update({
    'survey_' + name: survey_fs_cols + cols
    for name, cols in app_featuresets.items()
})
featuresets

In [None]:
# Regression tasks on the 1-5 scale (cut off on both 1 (floor) and 5 (ceiling))
# using a lasso linear mixed effect model, fitted once per
# (alpha, featureset, target) combination.
alpha_list = np.arange(0.1, 0.81, 0.1)

# Make sure the output folder exists before anything expensive runs
os.makedirs('results', exist_ok=True)

# Export each featureset's columns once. This does not depend on alpha, so it
# is done up front rather than being rewritten on every alpha iteration.
# NOTE(review): assumes pipeline.genMixedLM does not modify these columns in place -- confirm.
for fs_name, fs_cols in featuresets.items():
    # 'mua' featuresets live in the most-used-app frame
    df = survey_app_mua_feats if 'mua' in fs_name else all_feats
    df[['intercept'] + fs_cols].to_csv('features/%s.csv' % fs_name)

lmm_res_parts = []
for alpha in alpha_list:
    print('alpha: {0}'.format(alpha))
    for fs_name, fs_cols in featuresets.items():
        df = survey_app_mua_feats if 'mua' in fs_name else all_feats
        for target in ['anx', 'dep']:
            res = pipeline.genMixedLM(df, target, ['intercept'] + fs_cols,
                                      'pid', fs_name, alpha=alpha)
            lmm_res_parts.append(res.copy())

# Stack the per-model results and persist them
lmm_res = pd.concat(lmm_res_parts, copy=True, ignore_index=True, sort=False)
lmm_res.to_csv('results/lmm_res.csv', index=False)

In [None]:
# Prediction: run every classifier on every (featureset, target) combination
id_col = 'pid'
target_cols = ['anx_cat', 'dep_cat']

# Return values are kept so they are not overwritten on each iteration.
# NOTE(review): what pipeline.classifyMood returns is not visible from here -- confirm.
clf_res = []

for fs_name, fs_cols in featuresets.items():
    # 'mua' featuresets are the special case in which we want data only from
    # the most used app; everything else comes from the full frame
    df = survey_app_mua_feats if 'mua' in fs_name else all_feats

    X = df[[id_col] + fs_cols].copy()
    print('{0}: {1} columns'.format(fs_name, X.shape[1]))

    # One-hot-encoded columns from the most_used_app feature
    # (empty for featuresets that do not include them)
    mua_onehots = [col for col in X.columns if 'most_used_app' in col]

    # Get categorical feature indices - will be used with SMOTENC later
    nominal_idx = sorted(X.columns.get_loc(col) for col in [id_col] + mua_onehots)

    # y: one binary outcome series per target name
    targets = {
        name: df[col].copy()
        for name, col in zip(('anxiety', 'depression'), target_cols)
    }

    for target_name, target_col in targets.items():
        for method in ['RF', 'XGB']:
            res = pipeline.classifyMood(X=X, y=target_col, id=id_col, target=target_name,
                                        nominal_idx=nominal_idx, fs=fs_name, method=method)
            clf_res.append(res)
