# Exploratory Analysis

In [None]:
# imports
import sys
import os
import functools
import pathlib
import glob
import collections
import itertools
import re
import random
import math
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# from entropy import *
import numpy as np
import pandas as pd
import pipeline

from sklearn import impute
from sklearn import datasets
from sklearn import svm, linear_model, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import abline_plot

import scipy
from scipy.spatial.distance import cdist
from scipy import stats
from scipy.stats import pearsonr

import shap 

# visualization libraries
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2

In [None]:
# Read in data: the processed datasets plus the dictionary of feature tables
master_dataset = pipeline.import_processed_files()
feature_files = pipeline.create_file_dictionary('features')
master_featureset = feature_files['features']

# Obtain baseline features; the bare name on the last line renders the table
all_users_features = master_featureset[pipeline.ALL_USERS_DIR]
features_bl = all_users_features['pre_post']
features_bl

Establish common variables

In [None]:
# Rotate the epoch labels one position to the left (first label moves to the
# end) so that they align with their correct columns
epoch_labels = pipeline.EPOCHS['labels']
epoch_bins = epoch_labels[1:] + [epoch_labels[0]]
epoch_bin_labels = [raw.capitalize().replace("_", " ") for raw in epoch_bins]

app_launch = master_dataset['app_launch']

# Force convert the date column, since pandas seems to struggle with date
#   parsing for this project
app_launch['date'] = pd.to_datetime(app_launch['date'])

# Number of distinct users (pid) per app package, most-used apps first
users_per_app = app_launch.groupby('package')['pid'].nunique()
apps_nusers = (
    users_per_app
    .reset_index(name="num_users")
    .sort_values(by='num_users', ascending=False)
)
# df.to_csv("features/app_usercounts.csv")

# All app packages, in the order of apps_nusers
# apps = apps_nusers[apps_nusers['num_users'] > 6]
apps = list(apps_nusers['package'].unique())

## Feature Importance Graphs (Supplementary Material)

### Anxiety

In [None]:
# XGB x FS4, in paper
# Load the saved SHAP values and the matching test-set features, then draw and
# save the SHAP summary plot for the anxiety model.
# NOTE: pickle runs arbitrary code while loading; only load trusted result files.
with open('results/pred/feature_importance/shap_XGB_anxiety_sr_app_mua_fs.ob', 'rb') as shap_file:
    # The context manager closes the file handle once the values are loaded
    shap_scores = pickle.load(shap_file)
X_test = pd.read_pickle('results/pred/feature_importance/X_test_XGB_anxiety_sr_app_mua_fs.ob')

# Turn snake_case feature names into readable axis labels
X_test.columns = [x.replace('_', ' ').capitalize() for x in X_test.columns]

# show=False so the figure can still be resized and saved below
shap.summary_plot(shap_scores, X_test, show=False)
fig = plt.gcf()
fig.set_size_inches(12.5, 8.5)
# txt="I need the caption to be present a little below X-axis"
# plt.figtext(0.53, -0.01, txt, wrap=True, horizontalalignment='center', fontsize=14)
plt.savefig('results/pred/feature_importance/anx.png')
plt.show()

### Depression

In [None]:
# RF x FS3, in paper
# Load the saved SHAP values and the matching test-set features, then draw and
# save the SHAP summary plot for the depression model.
# NOTE: pickle runs arbitrary code while loading; only load trusted result files.
with open('results/pred/feature_importance/shap_RF_depression_sr_app_overall_fs.ob', 'rb') as shap_file:
    # The context manager closes the file handle once the values are loaded
    shap_scores = pickle.load(shap_file)
X_test = pd.read_pickle('results/pred/feature_importance/X_test_RF_depression_sr_app_overall_fs.ob')

# Turn snake_case feature names into readable axis labels
X_test.columns = [x.replace('_', ' ').capitalize() for x in X_test.columns]

# Only element 1 of the saved SHAP values is plotted
# NOTE(review): presumably the positive class of the classifier — confirm
shap.summary_plot(shap_scores[1], X_test, show=False)
fig = plt.gcf()
fig.set_size_inches(12.5, 8.5)
plt.savefig('results/pred/feature_importance/dep.png')
plt.show()

## Statistical Tests

In [None]:
# Feature table stored under APP_USERS_DIR / 'wkly_agg'; the bare name on the
# last line renders it as the cell output
df = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg']
df

### T-test?

In [None]:
# Too sparse for t-test as-is. Let Lee know.

df = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg']

# Quick look at distributions: split the rows by trait-anxiety group
anx_group = df['trait_anx_group']
low_df = df.loc[anx_group == 'low']
high_df = df.loc[anx_group == 'high']

# Overlay the 'anx' histograms of both groups on the current axes
plt.hist(x=low_df['anx'])
plt.hist(x=high_df['anx'])

In [None]:
# Independent two-sample t-test on 'frequency' between the low and high groups
stats.ttest_ind(low_df['frequency'], high_df['frequency'])

In [None]:
# For every trait-affect grouping, t-test each engagement metric between the
# 'low' and 'high' subgroups of df and print the result
for group, group_label in pipeline.TRAIT_AFFECT_GROUPS.items():
    membership = df[group]
    low_df = df[membership == 'low']
    high_df = df[membership == 'high']

    for metric in pipeline.FILENAME_MAPPINGS['metric']:
        low_metric = low_df[metric]
        high_metric = high_df[metric]
        print(f'Group: {group_label}, Metric: {metric}')
        result = stats.ttest_ind(low_metric, high_metric)
        print(f'Result: {result}')

## Correlation Heatmaps 

In [None]:
# Render the table stored under ALL_USERS_DIR / 'study_agg' as the cell output
master_featureset[pipeline.ALL_USERS_DIR]['study_agg']

In [None]:
# Correlation heatmap of the study-level engagement metrics against the
# trait-affect subscale scores; only significant cells are displayed.
sns.set(font_scale=3)

# Work on a copy so the score columns added below do not leak into the shared
# master_featureset table
feats_allmetrics = master_featureset[pipeline.ALL_USERS_DIR]['study_agg'].copy()

subscale_names = list(pipeline.TRAIT_AFFECT_SUBSCALES.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

# Add the subscale scores to feats_allmetrics, looked up per pid in features_bl
for score in subscale_names:
    feats_allmetrics[score] = [pipeline.match_value(features_bl, 'pid', pid, score) for pid in feats_allmetrics['pid']]

fig, ax = plt.subplots(figsize=(20, 10))
# Dedicated axes, right of the heatmap, that will hold the colorbar
cbar_ax = fig.add_axes([1.01, .3, .03, .4])

fig.suptitle('Trait Affect Correlation with Engagement Metrics',
             y=1.05,
             fontsize=50)

df = feats_allmetrics[metric_names + subscale_names]

# Correlate the baseline scores with the engagement metrics
res = df.corr().round(decimals=2)

# Identify significant correlations, and create a mask so that we only display
# those: a cell stays visible only when it is in the lower triangle and p < 0.05
# NOTE(review): corr_sig receives the rounded correlation matrix rather than
#   the underlying data — confirm this is what pipeline.corr_sig expects
p_values = pipeline.corr_sig(res)
mask = np.invert(np.tril(p_values<0.05))

# Construct corr plot as seaborn heatmap. The colorbar is drawn into cbar_ax;
# previously cbar_ax was created but never used, which left an empty axes in
# the saved figure.
sns.heatmap(res,
            vmax=1, vmin=-1,
            cmap='YlGnBu',
            annot=True,
            ax=ax,
            mask=mask,
            cbar=True,
            cbar_ax=cbar_ax)

# Tidy up the plot
xlabels = ax.xaxis.get_majorticklabels()
ylabels = ax.yaxis.get_majorticklabels()

plt.setp(xlabels, rotation=45,fontsize=34)
plt.setp(ylabels, rotation=0)

# NOTE(review): the 'Epoch' axis caption looks copied from the per-epoch
#   figures; this frame contains no epoch columns — confirm the intended label
fig.text(0.52, 0, "Trait Affect Subscale Score", ha='center')
fig.text(-0.01, 0.5, 'Epoch', va='center', rotation='vertical')

# NOTE(review): later cells in this notebook also write 'results/cor/cor_study.png'
plt.savefig('results/cor/cor_study.png', bbox_inches = "tight")
plt.show()

### Weekly x Epoch and Study x Epoch Trait Affect x Engagement

In [None]:
# One figure per time division ('wkly', 'study'), each with one correlation
# heatmap per engagement metric (epoch-level features vs. trait-affect
# subscale scores). Only significant lower-triangle cells are displayed.
sns.set(font_scale=3)

# Column groups shared by every figure; built once instead of per iteration
subscale_names = list(pipeline.TRAIT_AFFECT_SUBSCALES.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

for timediv in ['wkly', 'study']:

    # Get all individual epoch-level features. Copy so that the score columns
    # added below do not modify the shared master_featureset table.
    # NOTE(review): other cells index master_featureset by a users directory
    #   first (e.g. pipeline.ALL_USERS_DIR) — confirm this top-level key exists
    feats_allmetrics = master_featureset[timediv + '_epoch_agg'].copy()

    # Add the subscale scores to feats_allmetrics, looked up per pid in features_bl
    for score in subscale_names:
        feats_allmetrics[score] = [pipeline.match_value(features_bl, 'pid', pid, score) for pid in feats_allmetrics['pid']]

    fig, axs = plt.subplots(1,3, sharex=True, sharey=False, figsize=(70, 25))
    # Shared colorbar axes, right of the panels; filled by the first heatmap
    cbar_ax = fig.add_axes([1.01, .3, .03, .4])

    fig_title = 'Trait Affect Correlation with'
    if timediv == 'wkly':
        fig_title = fig_title + ' Weekly'
    fig_title = fig_title + ' Metrics by Epoch'
    fig.suptitle(fig_title,
                 y=1.05,
                 fontsize=50)

    for i, ax in enumerate(axs.flat):
        metric = metric_names[i]

        # Get the subset of features for the current metric (columns whose
        # name contains the metric name) plus the subscale scores
        metric_cols = [col for col in feats_allmetrics.columns if metric in col]
        df = feats_allmetrics[metric_cols + subscale_names]

        if metric == "duration":
            # Keep only the first five columns of the duration subset
            # NOTE(review): presumably drops extra duration-derived columns — confirm
            df = df[list(df.columns)[0:5] + subscale_names]

        # Correlate the baseline scores with the epoch-level features
        res = df.corr().round(decimals=2)

        # Identify significant correlations, and create a mask so that we only
        # display those (lower triangle with p < 0.05)
        p_values = pipeline.corr_sig(res)
        mask = np.invert(np.tril(p_values<0.05))

        # Construct corr plot as seaborn heatmap
        title = 'Days of Use' if metric == 'daysofuse' else metric.capitalize()

        ax.set_title(title, fontsize=42)
        ax.set(xlabel= '(' + pipeline.SUBPLOT_LABELS[i] + ')',
               ylabel='')

        # Only the first panel draws the colorbar, into the shared cbar_ax
        sns.heatmap(res,
                    vmax=1, vmin=-1,
                    cmap='YlGnBu',
                    annot=True,
#                     xticklabels=pipeline.TRAIT_AFFECT_SUBSCALES.values(),
#                     yticklabels=epoch_bin_labels,
                    ax=ax,
                    mask=mask,
                    cbar=(i==0),
                    cbar_ax=None if i else cbar_ax)

        # Tidy up the plot
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45,fontsize=34)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)

    fig.text(0.52, 0, "Trait Affect Subscale Score", ha='center')
    fig.text(-0.01, 0.5, 'Epoch', va='center', rotation='vertical')

    plt.savefig('results/cor/cor_' + timediv + '.png', bbox_inches = "tight")
    # Show each figure as soon as it is finished rather than after the loop
    plt.show()

In [None]:
# # Get all individual epoch/applevel features
# feats_allmetrics = master_featureset[pipeline.ALL_USERS_DIR]['wkly_epoch_applevel']


# # Add pipeline.TRAIT_AFFECT_SUBSCALES.keys()   (scores) to feats_allmetrics
# for score in pipeline.TRAIT_AFFECT_SUBSCALES.keys()  :
#     feats_allmetrics[score] = [pipeline.match_value(features_bl, 'pid', pid, score) for pid in feats_allmetrics['pid']]

# sns.set(font_scale=3)
# # Iterate through metrics (i.e., frequency, daysofuse, and duration)
# for metric in pipeline.FILENAME_MAPPINGS['metric'].keys():
    
#     # Get the subset of features for the current metric
#     features_metric = feats_allmetrics[['package'] + [col for col in feats_allmetrics.columns if metric in col] + 
#                                        list(pipeline.TRAIT_AFFECT_SUBSCALES.keys()  )]
    
#     # Find the abbreviation for the given metric (e.g. 'frequency' is denoted
#     #   with 'loy')
#     abbrev = pipeline.FILENAME_MAPPINGS['metric'][metric]

#     # Init figure and axes
#     fig, axs = plt.subplots(2, 4, sharex=True, sharey=True,figsize=(35,15))
    
    
#     fig.suptitle('Trait Affect Correlation with Weekly ' + metric.capitalize() + ' by Time of Day', 
#                  y=1.05, 
#                  fontsize=50)

#     cbar_ax = fig.add_axes([1.01, .3, .03, .4])

#     # Iterate over the axes, creating one corr plot per app
#     for i, ax in enumerate(axs.flat):
#         app = apps[i]
#         df = features_metric.loc[features_metric['package'] == app]
        
#         if metric == "duration":
#             df = df[list(df.columns)[0:5] + list(pipeline.TRAIT_AFFECT_SUBSCALES.keys()  )]

#         # Correlate the baseline scores (pipeline.TRAIT_AFFECT_SUBSCALES.keys()  ) with app-specific features
#         #  (e.g., correlate PHQ4 baseline depression with frequency for the 
#         #   "Worrknot" app)
#         res = df.corr().round(decimals=2)[pipeline.TRAIT_AFFECT_SUBSCALES.keys()  ]
        
# #         print(res)
        
#         # Identify significant correlations, and create a mask so that we only display those
#         p_values = pipeline.corr_sig(res)
#         mask = np.invert(np.tril(p_values<0.05))

#         # Construct corr plot as seaborn heatmap
#         ax.set(xlabel= '(' + pipeline.SUBPLOT_LABELS[i] + ')', 
#                ylabel='',
#                title= app.capitalize())
    
#         sns.heatmap(res.iloc[:4], 
#                     vmax=1, vmin=-1, 
#                     cmap='YlGnBu', 
#                     annot=True, 
#                     #TODO: Fix display names
#                     xticklabels=pipeline.TRAIT_AFFECT_SUBSCALES.values(),
#                     yticklabels=epoch_bin_labels,
#                     ax=ax,
#                     mask=mask,
#                     cbar=(i==0),
#                     cbar_ax=None if i else cbar_ax)
        
#         # Tidy up the plot

#         xlabels = ax.xaxis.get_majorticklabels()
#         ylabels = ax.yaxis.get_majorticklabels()
        
#         plt.setp(xlabels, rotation=45, fontsize=34)
#         plt.setp(ylabels, rotation=0)

#     fig.text(0.5, 0, "Baseline Score", ha='center')
#     fig.text(-0.01, 0.5, 'Epoch', va='center', rotation='vertical')

#     plt.savefig('results/cor/wkly_' + abbrev + '.png', bbox_inches = "tight")

## Boxplots

In [None]:
# Side-by-side boxplots (outliers hidden) of the three core engagement
# metrics over the study, one panel per metric.
features = master_featureset[pipeline.APP_USERS_DIR]['study_agg'].copy()

# Convert duration to minutes, for graphing
features['duration'] = features['duration'].div(60.0)

# Get only the core engagement vars, reshaped to long form
# (columns: pid, variable, value)
features = features.melt(id_vars=['pid'],
                         value_vars=['frequency', 'daysofuse', 'duration'])

sns.set(font_scale=1.25)

metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

with sns.axes_style("whitegrid"):
    fig, axs = plt.subplots(1,3, figsize=(7, 4))
    fig.suptitle('Comparison of Metrics', y=1.05, fontsize=20)

    for i, ax in enumerate(axs):
        metric = metric_names[i]
        abbrev = pipeline.FILENAME_MAPPINGS['metric'][metric]

        # Rows of the long-form frame that belong to this metric
        df = features[features['variable'] == metric]

        g = sns.boxplot(data=df,
                        x='variable',
                        y='value',
                        showfliers=False,
                        ax=ax)

        # The panel title already names the metric, so blank the tick label
        g.set_xticklabels('')

        if metric == 'daysofuse':
            title = 'Days of Use'
        elif metric == "duration":
            title = metric.capitalize() + ' (Minutes)'
        else:
            title = metric.capitalize()

        panel_label = '(' + pipeline.SUBPLOT_LABELS[i] + ')'
        ax.set(title=title, xlabel=panel_label, ylabel='')

#     plt.savefig('results/box/box_study.png', bbox_inches = "tight")
    plt.show()

In [None]:
# Render the current `features` frame as the cell output
# (long-form pid/variable/value if the boxplot cell above ran last)
features

### Study Trait Affect Engagement Metrics

In [None]:
# Grid of boxplots: one row per trait-affect grouping, one column per
# engagement metric, comparing the 'low' and 'high' subgroups over the study.
features = master_featureset[pipeline.APP_USERS_DIR]['study_agg'].copy()
sns.set(font_scale=1.25)

group_names = list(pipeline.TRAIT_AFFECT_GROUPS.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())
n_groups = len(group_names)
n_metrics = len(metric_names)

# Convert duration to minutes exactly once. The conversion used to sit inside
# the plotting loops, so the column was divided by 60 again for every group
# row and the second row showed duration / 3600.
if 'duration' in metric_names:
    features['duration'] = features['duration'] / 60.0

with sns.axes_style("whitegrid"):
    fig, axs = plt.subplots(n_groups,n_metrics, sharex=True, sharey=False, figsize=(10, 8))
    fig.suptitle('Comparison of Engagement Metrics \nAcross Study Lifetime',
                 y=1.15)

    for i, group in enumerate(group_names):
        group_label = pipeline.TRAIT_AFFECT_GROUPS[group]

        # Row caption on the left edge; the heights are tuned for two rows
        row_y = 0.65 if i == 0 else 0.15
        fig.text(-.05, row_y, group_label.replace(' Group', ''), weight='bold', ha='center', rotation = 90)

        for j, metric in enumerate(metric_names):
            ax = axs[i,j]

            g = sns.boxplot(x=features[group],
                            y=features[metric],
                            hue=features[group],
                            palette={'high': 'darkorange', 'low': 'royalblue'},
                            showfliers = False,
                            dodge=False,
                            data=features,
                            ax=ax)

            # Fliers are hidden in the plot, so print them for reference
            outliers_low = [flier for stat in boxplot_stats(features[features[group] == 'low'][metric]) for flier in stat['fliers']]
            outliers_high = [flier for stat in boxplot_stats(features[features[group] == 'high'][metric]) for flier in stat['fliers']]
            print(outliers_low)
            print(outliers_high)

            g.set_xticklabels("")

            title = metric.capitalize()
            if metric == "duration":
                title = title + ' (Minutes)'
            if metric == 'daysofuse':
                title = 'Days of Use'

            # Only the top row carries metric titles; the second row continues
            # the panel lettering three labels further on
            if i == 0:
                ax.set(title=title, xlabel= '(' + pipeline.SUBPLOT_LABELS[j] + ')', ylabel='')
            else:
                ax.set(title="", xlabel= '(' + pipeline.SUBPLOT_LABELS[j+3] + ')', ylabel='')

            # Keep a single legend, on the top-right panel
            if i == 0 and j == 2:
                g.legend(title='Trait Group',
                         bbox_to_anchor=(1.05, 1), ncol=1)
                legend = g.get_legend()
#                 plt.setp(legend.get_texts(), fontsize='28') # for legend text
#                 plt.setp(legend.get_title(), fontsize='30') # for legend title
#                 plt.setp(legend.get_lines()[1:], linewidth='6')
            else:
                g.legend_.remove()

#     plt.savefig('results/box/box_study.png', bbox_inches = "tight")
    plt.show()

In [None]:
# Grid of point plots: one row per trait-affect grouping, one column per
# engagement metric, comparing the 'low' and 'high' subgroups over the study.
features = master_featureset[pipeline.APP_USERS_DIR]['study_agg'].copy()
sns.set(font_scale=1.25)

group_names = list(pipeline.TRAIT_AFFECT_GROUPS.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())
n_groups = len(group_names)
n_metrics = len(metric_names)

# Convert duration to minutes exactly once. The conversion used to sit inside
# the plotting loops, so the column was divided by 60 again for every group
# row and the second row showed duration / 3600.
if 'duration' in metric_names:
    features['duration'] = features['duration'] / 60.0

with sns.axes_style("whitegrid"):
    fig, axs = plt.subplots(n_groups,n_metrics, sharex=True, sharey=False, figsize=(10, 7))
    fig.suptitle('Comparison of Engagement Metrics (Mean and Std) \nAcross Study Lifetime',
                 y=1.15)

    for i, group in enumerate(group_names):
        group_label = pipeline.TRAIT_AFFECT_GROUPS[group]

        # Row caption on the left edge; the heights are tuned for two rows
        row_y = 0.65 if i == 0 else 0.15
        fig.text(-.05, row_y, group_label.replace(' Group', ''), weight='bold', ha='center', rotation = 90)

        for j, metric in enumerate(metric_names):
            ax = axs[i,j]

            # NOTE(review): confirm `connect` is accepted by the installed
            #   seaborn; the documented spellings for unjoined points are
            #   `join=False` (older) / `linestyle='none'` (newer)
            g = sns.pointplot(x=features[group],
                y=features[metric],
                hue=features[group],
                ci = 68,
                palette={'high': 'darkorange', 'low': 'royalblue'},
                dodge=False,
                connect = False,
                ax=ax)

            g.set_xticklabels("")

            title = metric.capitalize()
            if metric == "duration":
                title = title + ' (Minutes)'
            if metric == 'daysofuse':
                title = 'Days of Use'

            # Only the top row carries metric titles; the second row continues
            # the panel lettering three labels further on
            if i == 0:
                ax.set(title=title, xlabel= '(' + pipeline.SUBPLOT_LABELS[j] + ')', ylabel='')
            else:
                ax.set(title="", xlabel= '(' + pipeline.SUBPLOT_LABELS[j+3] + ')', ylabel='')

            # Keep a single legend, on the top-right panel
            if i == 0 and j == 2:
                g.legend(title='Trait Group',
                         bbox_to_anchor=(1.05, 1), ncol=1)
                legend = g.get_legend()
#                 plt.setp(legend.get_texts(), fontsize='28') # for legend text
#                 plt.setp(legend.get_title(), fontsize='30') # for legend title
#                 plt.setp(legend.get_lines()[1:], linewidth='6')
            else:
                g.legend_.remove()

    plt.savefig('results/box/mean_std_study.png', bbox_inches = "tight")
    plt.show()

In [None]:
# Correlation heatmap of the study-level engagement metrics against the
# trait-affect subscale scores; only significant cells are displayed.
sns.set(font_scale=3)

# Work on a copy so the score columns added below do not leak into the shared
# master_featureset table
feats_allmetrics = master_featureset[pipeline.ALL_USERS_DIR]['study_agg'].copy()

subscale_names = list(pipeline.TRAIT_AFFECT_SUBSCALES.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

# Add the subscale scores to feats_allmetrics, looked up per pid in features_bl
for score in subscale_names:
    feats_allmetrics[score] = [pipeline.match_value(features_bl, 'pid', pid, score) for pid in feats_allmetrics['pid']]

fig, ax = plt.subplots(figsize=(20, 10))
# Dedicated axes, right of the heatmap, that will hold the colorbar
cbar_ax = fig.add_axes([1.01, .3, .03, .4])

fig.suptitle('Trait Affect Correlation with Engagement Metrics',
             y=1.05,
             fontsize=50)

df = feats_allmetrics[metric_names + subscale_names]

# Correlate the baseline scores with the engagement metrics
res = df.corr().round(decimals=2)

# Identify significant correlations, and create a mask so that we only display
# those: a cell stays visible only when it is in the lower triangle and p < 0.05
# NOTE(review): corr_sig receives the rounded correlation matrix rather than
#   the underlying data — confirm this is what pipeline.corr_sig expects
p_values = pipeline.corr_sig(res)
mask = np.invert(np.tril(p_values<0.05))

# Construct corr plot as seaborn heatmap. The colorbar is drawn into cbar_ax;
# previously cbar_ax was created but never used, which left an empty axes in
# the saved figure.
sns.heatmap(res,
            vmax=1, vmin=-1,
            cmap='YlGnBu',
            annot=True,
            ax=ax,
            mask=mask,
            cbar=True,
            cbar_ax=cbar_ax)

# Tidy up the plot
xlabels = ax.xaxis.get_majorticklabels()
ylabels = ax.yaxis.get_majorticklabels()

plt.setp(xlabels, rotation=45,fontsize=34)
plt.setp(ylabels, rotation=0)

# NOTE(review): the 'Epoch' axis caption looks copied from the per-epoch
#   figures; this frame contains no epoch columns — confirm the intended label
fig.text(0.52, 0, "Trait Affect Subscale Score", ha='center')
fig.text(-0.01, 0.5, 'Epoch', va='center', rotation='vertical')

# NOTE(review): earlier cells in this notebook also write
#   'results/cor/cor_study.png', so this overwrites their output
plt.savefig('results/cor/cor_study.png', bbox_inches = "tight")
plt.show()

### Study x Epoch Trait Affect Engagement Metrics

In [None]:
# TODO: Get Study x Epoch Longform df

# One figure per trait-affect grouping: boxplots of each engagement metric by
# epoch, split into the 'low' and 'high' subgroups.
features = master_featureset[pipeline.APP_USERS_DIR]['study_epoch_agg_lf']
sns.set(font_scale=3)

metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

for group, group_label in pipeline.TRAIT_AFFECT_GROUPS.items():

    # Short group code used in the output file name
    group_abbrev = pipeline.FILENAME_MAPPINGS['group'][group]

    with sns.axes_style("whitegrid"):
        fig, axs = plt.subplots(1,3, sharex=True, sharey=False, figsize=(25, 9))
        fig.suptitle(group_label + ' Comparison of Metrics by Epoch',
                     y=1.05,
                     fontsize=50)

        for i, ax in enumerate(axs):
            metric = metric_names[i]

            # Convert duration to minutes, for display; other metrics as-is
            y_values = features[metric] / 60.0 if metric == 'duration' else features[metric]

            g = sns.boxplot(x=features['epoch'],
                            y=y_values,
                            hue=features[group],
                            palette={'high': 'sandybrown', 'low': 'cornflowerblue'},
                            showfliers = False,
                            data=features,
                            ax=ax)

            g.set_xticklabels(epoch_bin_labels, rotation=45)

            title = metric.capitalize()
            if metric == "duration":
                title = title + ' (Minutes)'

            if metric == 'daysofuse':
                title = 'Days of Use'

            ax.set(title=title, xlabel= '(' + pipeline.SUBPLOT_LABELS[i] + ')', ylabel='')

            # Keep a single legend, on the duration panel. The legend entries
            # of a boxplot are the hue levels themselves, so none is blanked
            # (matches the weekly-by-epoch cell).
            if metric == "duration":
                g.legend(title='Trait Group',
                         bbox_to_anchor=(1.05, 1), ncol=1)
                legend = g.get_legend()
                plt.setp(legend.get_texts(), fontsize='28') # for legend text
                plt.setp(legend.get_title(), fontsize='30') # for legend title
                plt.setp(legend.get_lines()[1:], linewidth='6')
            else:
                g.legend_.remove()

    fig.text(0.52, 0, 'Epoch', ha='center')

    #                 ax.legend(title="Baseline Group")

#             plt.setp(ax.get_legend().get_lines(), linewidth='4')

#         plt.gcf().subplots_adjust(bottom=0.15)

    plt.savefig('results/box/box_study_epoch_' + group_abbrev + '.png', bbox_inches = "tight")
    plt.show()

### Weekly x Epoch Trait Affect Engagement Metrics

In [None]:
# One figure per trait-affect grouping: boxplots of each weekly engagement
# metric by epoch, split into the 'low' and 'high' subgroups.
features = master_featureset[pipeline.APP_USERS_DIR]['wkly_epoch_agg_lf']
sns.set(font_scale=3)

group_palette = {'high': 'sandybrown', 'low': 'cornflowerblue'}

for group, group_label in pipeline.TRAIT_AFFECT_GROUPS.items():

    # Short group code used in the output file name
    group_abbrev = pipeline.FILENAME_MAPPINGS['group'][group]

    with sns.axes_style("whitegrid"):
        fig, axs = plt.subplots(1,3, sharex=True, sharey=False, figsize=(25, 9))
        fig.suptitle(group_label + ' Comparison of Weekly Metrics by Epoch',
                     y=1.05,
                     fontsize=50)

        for i, ax in enumerate(axs):
            metric = list(pipeline.FILENAME_MAPPINGS['metric'].keys())[i]
            abbrev = pipeline.FILENAME_MAPPINGS['metric'][metric]
            is_duration = metric == 'duration'

            # Convert duration to minutes, for display; other metrics as-is
            if is_duration:
                y_values = features[metric] / 60.0
            else:
                y_values = features[metric]

            g = sns.boxplot(x=features['epoch'],
                            y=y_values,
                            hue=features[group],
                            palette=group_palette,
                            showfliers=False,
                            data=features,
                            ax=ax)

            g.set_xticklabels(epoch_bin_labels, rotation=45)

            if metric == 'daysofuse':
                title = 'Days of Use'
            elif is_duration:
                title = metric.capitalize() + ' (Minutes)'
            else:
                title = metric.capitalize()

            ax.set(title=title, xlabel= '(' + pipeline.SUBPLOT_LABELS[i] + ')', ylabel='')

            # Keep a single legend, on the duration panel
            if not is_duration:
                g.legend_.remove()
            else:
                g.legend(title='Trait Group',
                         bbox_to_anchor=(1.05, 1), ncol=1)
                legend = g.get_legend()
                plt.setp(legend.get_texts(), fontsize='28') # for legend text
                plt.setp(legend.get_title(), fontsize='30') # for legend title
                plt.setp(legend.get_lines()[1:], linewidth='6')


    fig.text(0.52, 0, 'Epoch', ha='center')

    #                 ax.legend(title="Baseline Group")

#             plt.setp(ax.get_legend().get_lines(), linewidth='4')

#         plt.gcf().subplots_adjust(bottom=0.15)

    plt.savefig('results/box/box_wkly_epoch_' + group_abbrev + '.png', bbox_inches = "tight")
    plt.show()

## Line Plots

In [None]:
# Long-form view of the weekly engagement metrics
# (columns: pid, weekofstudy, variable, value)
weekly_features = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg'].copy()

features = (
    weekly_features
    # Convert duration to minutes, for graphing
    .assign(duration=lambda frame: frame['duration'] / 60.0)
    # Get only the core engagement vars
    .melt(id_vars=['pid', 'weekofstudy'],
          value_vars=['frequency', 'daysofuse', 'duration'])
)
features

In [None]:
# Line plot of each core engagement metric across the weeks of the study,
# one panel per metric.
features = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg'].copy()

# Convert duration to minutes, for graphing
features['duration'] = features['duration'].div(60.0)

# Get only the core engagement vars, reshaped to long form
features = features.melt(id_vars=['pid','weekofstudy'],
                         value_vars=['frequency', 'daysofuse', 'duration'])

sns.set(font_scale=1.25)

metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())

with sns.axes_style("whitegrid"):
    fig, axs= plt.subplots(1,3, figsize=(12, 4))
    fig.suptitle('Comparison of Metrics', y=1.05, fontsize=20)

    for i, ax in enumerate(axs):
        metric = metric_names[i]
        abbrev = pipeline.FILENAME_MAPPINGS['metric'][metric]

        # Rows of the long-form frame that belong to this metric
        df = features[features['variable'] == metric]

        g = sns.lineplot(data=df,
                         x='weekofstudy',
                         y='value',
                         ci=68,
                         ax=ax)

        if metric == 'daysofuse':
            title = 'Days of Use'
        elif metric == "duration":
            title = metric.capitalize() + ' (Minutes)'
        else:
            title = metric.capitalize()

        panel_label = '(' + pipeline.SUBPLOT_LABELS[i] + ')'
        ax.set(title=title, xlabel=panel_label, ylabel='')

    # Shared x-axis caption below the panels
    fig.text(0.5, 0, 'Week of Study', ha='center')
    plt.savefig('results/line/line_study.png', bbox_inches = "tight")
    plt.show()

### Weekly trait affect group comparison of engagement metrics

In [None]:
# Rebind df to the table stored under APP_USERS_DIR / 'wkly_agg' (not a copy)
df = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg']


In [None]:
# Grid of line plots: one row per trait-affect grouping, one column per
# engagement metric, showing the weekly trend of the 'low' and 'high' subgroups.
features = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg'].copy()
# Convert duration to minutes, for graphing
features['duration'] = features['duration'] / 60.0

# Fixed y-axis range per metric
ylims = {
    'frequency': [0, 55],
    'daysofuse': [0, 7.5],
    'duration': [0, 35]
}

group_names = list(pipeline.TRAIT_AFFECT_GROUPS.keys())
metric_names = list(pipeline.FILENAME_MAPPINGS['metric'].keys())
n_groups = len(group_names)
n_metrics = len(metric_names)

sns.set(font_scale=1.5)

with sns.axes_style("whitegrid"):
    fig, axs = plt.subplots(n_groups,n_metrics, sharex=True, sharey=False, figsize=(16, 8))
#     fig.suptitle(group_label+ ' Comparison of Weekly Engagement Metrics',
#                  y=1.15)

    for i, group in enumerate(group_names):
        group_label = pipeline.TRAIT_AFFECT_GROUPS[group]

        # Distinct participants per subgroup; only referenced by the disabled
        # undersampling step below
        df2 = features.groupby([group])['pid'].nunique().reset_index(name='n_participants')

#       Addition of random undersampling
#         subgroup = df2.iloc[df2['n_participants'].argmax(), :][group]
#         pids = list(df[df[group]==subgroup]['pid'].unique())
#         n_to_exclude = df2['n_participants'].argmax() - df2['n_participants'].argmin()
#         pids_to_exclude = random.sample(pids, n_to_exclude)
#         features = features[~features['pid'].isin(pids_to_exclude)]

        # Row caption on the left edge; the heights are tuned for two rows
        row_y = 0.6 if i == 0 else 0.1

        # Correction from Phil: Trait -> Baseline (needs to be done throughout)
        fig.text(-.01, row_y, group_label.replace('Trait', 'Baseline').replace(' Group', ''), weight='bold', ha='center', rotation = 90)

        for j, metric in enumerate(metric_names):
            ax = axs[i,j]

            g = sns.lineplot(x='weekofstudy',
                             y= metric,
                             hue=group,
                             palette={'high': 'darkorange', 'low': 'royalblue'},
                             ci=68,
                             data=features,
                             ax=ax)

            ax.set_ylim(ylims[metric])

            title = metric.capitalize()
            if metric == "duration":
                title = title + ' (Minutes)'
            if metric == 'daysofuse':
                title = 'Days of Use'

            # Only the top row carries metric titles
            if i > 0:
                title = ''

            ax.set(title=title, adjustable='datalim', xlim=(0,8),
                   xlabel= '', ylabel='')

            # Keep a single legend, on the top-right panel
            if i == 0 and j == 2:
                # NOTE(review): blanking texts[0] and skipping the first line
                #   handle assume the first legend entry is a pseudo-title
                #   (hue variable name) — confirm for the installed seaborn
                g.legend(title='Baseline Group',
                         bbox_to_anchor=(1.05, 1), ncol=1).texts[0].set_text('')
                legend = g.get_legend()
#                 plt.setp(legend.get_texts(), fontsize='28') # for legend text
#                 plt.setp(legend.get_title(), fontsize='30') # for legend title
                plt.setp(legend.get_lines()[1:], linewidth='6')
            else:
                g.legend_.remove()

    # Panel labels, drawn once per figure. They used to be re-drawn on top of
    # themselves in every iteration of the inner loop.
    for col, x_pos in enumerate((0.165, 0.475, 0.75)):
        # Bottom row
        fig.text(x_pos, -0.05, '(' + pipeline.SUBPLOT_LABELS[col + 3] + ')', ha='center')
        # Top row
        fig.text(x_pos, 0.49, '(' + pipeline.SUBPLOT_LABELS[col] + ')', ha='center')

    fig.text(0.475, 0, 'Week of Study', ha='center')
    plt.savefig('results/line/line_eng_68ci_nounder.png', bbox_inches = "tight")
    plt.show()

### Weekly trait affect group comparison of weekly affect change

In [None]:
# For each trait-affect group, draw one figure with a panel per weekly affect
# measure, comparing the 'high' and 'low' subgroups across the study weeks and
# overlaying a first-degree least-squares trend line for each subgroup.
# One PNG per group is written to results/line/.
features = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg'].copy()
sns.set(font_scale=3)

# Single source of truth for subgroup colours: used by the seaborn lines and
# by the fitted trend lines so the two always match.
subgroup_palette = {'high': 'sandybrown', 'low': 'cornflowerblue'}

# (column name, panel title) pairs; the two panels use the first two entries.
wkly_affect_items = list(pipeline.WKLY_AFFECT.items())

for group, group_label in pipeline.TRAIT_AFFECT_GROUPS.items():
    group_abbrev = pipeline.FILENAME_MAPPINGS['group'][group]

    with sns.axes_style("whitegrid"):
        fig, axs = plt.subplots(1, 2, sharex=True, sharey=False, figsize=(20, 8))

        fig.suptitle(group_label + ' Comparison of Weekly Affect',
                     x=0.5,
                     y=1.05,
                     fontsize=50)

        for i, ax in enumerate(axs.flat):
            wkly_affect, title = wkly_affect_items[i]

            g = sns.lineplot(x='weekofstudy',
                             y=wkly_affect,
                             hue=group,
                             palette=subgroup_palette,
                             data=features,
                             err_style="bars",
                             ci=68,
                             ax=ax)

            ax.set(adjustable='datalim',
                   xlim=(0, 8),
                   ylim=(0, 5.5),
                   xlabel='(' + pipeline.SUBPLOT_LABELS[i] + ')',
                   ylabel='',
                   title=title)

            # Keep a single legend per figure, attached to the 'dep' panel.
            if wkly_affect == 'dep':
                # Blank the first legend text entry (presumably seaborn's
                # hue-variable header -- confirm against the installed
                # seaborn version).
                g.legend(title='Trait Group',
                         bbox_to_anchor=(1, 1.05), ncol=1).texts[0].set_text('')
                legend = g.get_legend()
                plt.setp(legend.get_texts(), fontsize='20')  # legend text
                plt.setp(legend.get_title(), fontsize='22')  # legend title
                plt.setp(legend.get_lines()[1:], linewidth='6')
            else:
                g.legend_.remove()

            # Thicken every line drawn so far in one call. This runs before
            # the trend lines are added, so those keep the default width.
            plt.setp(ax.lines, linewidth=4)

            # Linear trend per subgroup, fitted on finite (x, y) pairs only.
            for subgroup in ('low', 'high'):
                subset = features[features[group] == subgroup]
                x = subset['weekofstudy']
                y = subset[wkly_affect]
                finite = np.isfinite(x) & np.isfinite(y)
                trend = np.poly1d(np.polyfit(x[finite], y[finite], 1))
                ax.plot(x, trend(x), c=subgroup_palette[subgroup])

        fig.text(0.44, 0, 'Week of Study', ha='center')
        plt.savefig('results/line/line_' + group_abbrev + '_wklyaffect.png', bbox_inches="tight")
        plt.show()

In [None]:
# Single two-panel figure: one panel per trait-affect group, each showing the
# weekly affect score that matches the group ('anx' for groups whose key
# contains 'anx', otherwise 'dep') for the 'high' and 'low' subgroups, with a
# first-degree least-squares trend line per subgroup.
features = master_featureset[pipeline.APP_USERS_DIR]['wkly_agg'].copy()
sns.set(font_scale=1.5)

# Single source of truth for subgroup colours (seaborn lines + trend lines).
subgroup_palette = {'high': 'sandybrown', 'low': 'cornflowerblue'}

with sns.axes_style("whitegrid"):
    fig, axs = plt.subplots(1, 2, sharex=True, sharey=False, figsize=(12, 5))

    # Fixed title: the figure spans the trait-affect groups, so it must not be
    # built from a per-group label left over from another cell.
    fig.suptitle('Trait Group Comparison of Weekly Affect',
                 x=0.5,
                 y=1.05,
                 fontsize=20)

    trait_groups = list(pipeline.TRAIT_AFFECT_GROUPS.keys())

    for i, ax in enumerate(axs.flat):
        group = trait_groups[i]
        wkly_affect = 'anx' if 'anx' in group else 'dep'

        g = sns.lineplot(x='weekofstudy',
                         y=wkly_affect,
                         hue=group,
                         palette=subgroup_palette,
                         data=features,
                         err_style="bars",
                         ci=68,
                         ax=ax)

        ax.set(adjustable='datalim',
               xlim=(0, 8),
               ylim=(0, 5.5),
               title='',
               xlabel='',
               ylabel=pipeline.WKLY_AFFECT[wkly_affect] + ' Score')

        # Keep a single legend for the figure, attached to the 'dep' panel.
        if wkly_affect == 'dep':
            # Blank the first legend text entry (presumably seaborn's
            # hue-variable header -- confirm against the installed seaborn
            # version).
            g.legend(title='Trait Group',
                     bbox_to_anchor=(1, 1.05), ncol=1).texts[0].set_text('')
            legend = g.get_legend()
            plt.setp(legend.get_texts(), fontsize='14')  # legend text
            plt.setp(legend.get_title(), fontsize='16')  # legend title
            plt.setp(legend.get_lines()[1:], linewidth='6')
        else:
            g.legend().remove()

        # Thicken every line drawn so far in one call. This runs before the
        # trend lines are added, so those keep the default width.
        plt.setp(ax.lines, linewidth=3)

        # Linear trend per subgroup, fitted on finite (x, y) pairs only.
        for subgroup in ('low', 'high'):
            subset = features[features[group] == subgroup]
            x = subset['weekofstudy']
            y = subset[wkly_affect]
            finite = np.isfinite(x) & np.isfinite(y)
            trend = np.poly1d(np.polyfit(x[finite], y[finite], 1))
            ax.plot(x, trend(x), c=subgroup_palette[subgroup])

# Panel captions below each subplot, then the shared x-axis caption.
fig.text(0.249, -0.10, '(' + pipeline.SUBPLOT_LABELS[0] + ')', ha='center')
fig.text(0.675, -0.10, '(' + pipeline.SUBPLOT_LABELS[1] + ')', ha='center')

fig.text(0.455, 0, 'Week of Study', ha='center')
plt.savefig('results/line/line_wklyaffect.png', bbox_inches="tight")
plt.show()

In [None]:
# Display the mapping from trait-affect group key to the abbreviation that the
# plotting cells insert into output filenames.
pipeline.FILENAME_MAPPINGS['group']

In [None]:
# Indicator (step) function showing the net group change in affect by week.
# For instance, in the high trait anxious group, say three people have affect
# score changes of +1, +3 and -2 points between two given weeks.
# 1 + 3 - 2 = 2, so the indicator is set to 1, marking a net positive change
# between those weeks (-1 marks a net negative change, 0 no net change).
#
# Work on a copy so the derived '*_change' columns are not written back into
# the shared master_featureset entry.
features = master_featureset[pipeline.ALL_USERS_DIR]['wkly_trait_state'].copy()

sns.set(font_scale=1.5)

# (subplot row, subgroup value, y-axis label, line colour)
subgroup_rows = (
    (0, 'low', 'Low Group', 'cornflowerblue'),
    (1, 'high', 'High Group', 'sandybrown'),
)

# The two subplot columns use the first two weekly affect measures.
affect_columns = list(pipeline.WKLY_AFFECT.keys())[:2]

for group, group_label in pipeline.TRAIT_AFFECT_GROUPS.items():
    group_abbrev = pipeline.FILENAME_MAPPINGS['group'][group]
    with sns.axes_style("whitegrid"):
        fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10, 6))
        fig.suptitle(group_label + ' Comparison of Change in Weekly Affect',
                     x=0.5,
                     y=1.05,
                     fontsize=20)

        for j, affect in enumerate(affect_columns):
            affect_change = affect + '_change'

            # Row-to-row difference over the whole frame.
            # NOTE(review): shift(1) is not applied per participant, so the
            # first row of each participant is differenced against the
            # previous participant's last row (assuming rows are ordered by
            # participant, then week). Confirm the row ordering and consider
            # a per-participant diff.
            features[affect_change] = features[affect] - features[affect].shift(1)

            # Net change per (subgroup, week), collapsed to its sign.
            net_change = (features
                          .groupby([group, 'weekofstudy'])[affect_change]
                          .sum()
                          .reset_index())
            net_change[affect_change] = np.sign(net_change[affect_change])

            for row, subgroup, ylabel, colour in subgroup_rows:
                ax = axs[row, j]
                # Set the y label before plotting, as the original cell did.
                ax.set(ylabel=ylabel)

                g = sns.lineplot(x='weekofstudy',
                                 y=affect_change,
                                 hue=group,
                                 palette={subgroup: colour},
                                 drawstyle='steps-pre',
                                 data=net_change[net_change[group] == subgroup],
                                 ax=ax)
                # Each row shows a single subgroup, so no legend is kept.
                g.legend_.remove()

            axs[0, j].set(title='Weekly ' + affect.capitalize())

        for i, ax in enumerate(axs.flat):
            ax.set(adjustable='datalim',
                   xlim=(0, 8),
                   xlabel='(' + pipeline.SUBPLOT_LABELS[i] + ')')
            plt.setp(ax.lines, linewidth=3)

        # Shared x-axis caption, added to every group's figure.
        fig.text(0.5, 0, 'Week of Study', ha='center')

        # Figure export is currently disabled; uncomment to save one PNG per group.
        # plt.savefig('results/line/line_' + group_abbrev + '_affect_indicator.png', bbox_inches="tight")
        plt.show()
        
        