In [None]:
"""
@author: abaglione and lihuacai

Credit to Tyler Spears and Sonia Baee, who developed the precursor
to this preprocessing script
"""

# imports
import sys
import os
import functools
import pathlib
import glob
import collections
import itertools
import re
import random
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

import numpy as np
import pandas as pd
import pipeline

from sklearn import impute
from sklearn import datasets
from sklearn import svm, linear_model, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import scipy
from scipy.spatial.distance import cdist

# visualization libraries
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import seaborn as sns
import matplotlib.pyplot as plt
# Figure defaults for the whole notebook: automatic layout adjustment and an
# opaque white (RGBA) figure background
plt.rcParams.update({
    'figure.autolayout': True,
    'figure.facecolor': [1.0, 1.0, 1.0, 1.0],
})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2

## Loading & Cleaning

In [None]:
# Gather the cleaned data files; the mapping pairs each file path with the
# dataset name used to label it in the cleaning loop
datafiles = pipeline.gather_files(pipeline.DATA_CLEANED_DIR)

# Snapshot the (path, name) pairs to iterate over. The cleaned dataframes are
# collected in master_dataset, keyed by dataset name.
items = [*datafiles.items()]
master_dataset = dict()

In [None]:
# Clean and standardize the data
# For every (path, name) pair gathered earlier: load the CSV, normalize column
# names and participant ids, apply the dataset-specific fixes, write the result
# to the processed-data directory, and keep the frame in `master_dataset`.

# Columns that may hold the participant id, and the name they are unified under
part_id_names = ['eim_id', 'pid_code', 'id']
possible_irrelevant = 'pid'
standard_part_id_name = 'pid'

# Substrings that mark a column as holding dates
date_keywords = ['timestamp', 'date']

for f, fname in items:
    df = pd.read_csv(f, parse_dates=True)

    # Record which file (dataset) this dataframe came from
    df.insert(0, 'dataset', fname)

    # Make colnames lowercase (thank you, Chris Albon!)
    df.columns = [col.lower() for col in df.columns]

    # Remove irrelevant PID cols - special case:
    # when a dedicated id column exists, a pre-existing 'pid' column is treated
    # as irrelevant and dropped before the rename below.
    part_id_cols = [col for col in df.columns if col in part_id_names]

    if part_id_cols:
        if possible_irrelevant in df.columns:
            df = df.drop(columns=possible_irrelevant)

        # Standardize participant ids: the right-most id column becomes 'pid'.
        # NOTE: index=str also casts the row labels to strings; kept so the
        # resulting frames match what this cell has always produced.
        id_change = part_id_cols[-1]
        df = df.rename(index=str, columns={id_change: standard_part_id_name})

    if standard_part_id_name not in df.columns:
        # Same exception type as the bare df['pid'] lookup, but says which file
        raise KeyError(
            f"No participant id column found in dataset '{fname}' ({f})"
        )

    # Remove 'EIM' prefix from pid, then keep only rows whose id is numeric
    df['pid'] = pd.to_numeric(
        df['pid'].replace("^(EIM)", "", regex=True), errors='coerce'
    )
    # .copy() yields an independent frame, so the assignments further down do
    # not write into a slice of the pre-dropna frame (SettingWithCopyWarning)
    df = df.dropna(subset=['pid']).copy()
    df['pid'] = df['pid'].astype(int)

    # Correction of PHQ4_change col in pre_post scores sheet
    if fname == 'pre_post':
        df['phq4_change'] = df['phq_post'] - df['phq_bl']

    # Fix PHQ-4 column for baseline survey: 'sc0' replaces the existing 'phq4'
    if fname == 'blsurvey':
        df = df.drop(columns='phq4').rename(columns={'sc0': 'phq4'})

    if fname == 'app_launch':
        # Standardize app names
        df['package'] = df['package'].replace('^(edu.northwestern.cbits.intellicare.)', '', regex=True)

        # Filter out push notifications
        df = df[df['package'] != 'conductor']

        # Remove outliers: keep launches whose duration is within 3 population
        # standard deviations of the mean. Duration stays in its original units.
        # The z-score is computed with pandas, which skips missing values when
        # taking the mean/std. scipy.stats.zscore propagates NaN by default, so
        # a single missing duration used to turn every score into NaN and
        # silently drop ALL rows. Rows with a missing duration are still
        # excluded, because NaN < 3 is False.
        duration = df['duration']
        duration_z = (duration - duration.mean()) / duration.std(ddof=0)
        df = df[duration_z.abs() < 3].copy()

    # Standardize date columns
    date_cols = [col for col in df.columns if any(keyword in col for keyword in date_keywords)]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col])

    # Except for fwsurveys, drop timestamp columns unless their name contains 'new'
    if fname != 'fwsurveys':
        to_drop = [col for col in date_cols if 'timestamp' in col and 'new' not in col]
        df = df.drop(columns=to_drop)

    # Drop columns related to enrollment date that Lee generated, for weekly surveys
    if fname == 'wklysurvey':
        to_drop = [col for col in df.columns if 'enroll' in col]
        df = df.drop(columns=to_drop)

    if fname == 'wklysurvey_timestamps':
        df = df.rename(index=str, columns={'sent (0/1)': 'sent',
                                           'completed? (0/1)': 'completed'})

    if fname in ('fwsurveys', 'wklysurvey'):
        # Rename columns for easier reading
        df = df.rename(index=str, columns=pipeline.CODEBOOK_MAPPINGS)

        # Convert the depression mood to its reverse scale to be consistent with anxiety mood
        # The higher the anxiety and depression mood scores are, the worse they are (e.g., more anxious or depressed)
        # Values outside 1-5 are not in the mapping and become NaN.
        df['state_dep'] = df['state_dep'].map({1: 5, 2: 4, 3: 3, 4: 2, 5: 1})

    # Save out to CSVs
    # We'll call these the 'processed' files, since the ones we read in were already somewhat clean
    # Path(...) accepts the directory as either a str or a Path
    fpath = pathlib.Path(pipeline.DATA_PROCESSED_DIR) / (fname + '_processed.csv')
    df.to_csv(fpath)

    # Collect the cleaned dataframes in one dict, keyed by dataset name
    master_dataset[fname] = df
