# Preprocessing 

In [1]:
import pandas as pd
import math
import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw momentary data from CSV and preview the first rows.
momentary_data = pd.read_csv(filepath_or_buffer="Momentary_Data.csv")
momentary_data.head(n=5)

Unnamed: 0,ID,date,study_day,school,TextSent.morning,TextReceived.morning,Stressed.morning,Anxious.morning,Depressed.morning,CallsMade.morning,...,Depressed.night,CallsMade.night,CallsReceived.night,mean_EMA_Stressed,mean_EMA_Anxious,mean_EMA_Depressed,mean_EMA_TextSent,mean_EMA_TextReceived,mean_EMA_CallsMade,mean_EMA_CallsReceived
0,1001,5/2/16,1,1,,,,,,1.0,...,,0.0,1.0,4.307692,3.730769,2.544872,0.749141,1.197595,1.355039,1.268217
1,1001,5/3/16,2,1,0.0,1.0,,,,0.0,...,,2.0,1.0,4.307692,3.730769,2.544872,0.749141,1.197595,1.355039,1.268217
2,1001,5/4/16,3,1,0.0,2.0,,,,1.0,...,,3.0,0.0,4.307692,3.730769,2.544872,0.749141,1.197595,1.355039,1.268217
3,1001,5/5/16,4,1,,,,,,4.0,...,,2.0,0.0,4.307692,3.730769,2.544872,0.749141,1.197595,1.355039,1.268217
4,1001,5/6/16,5,1,,,,,,,...,,2.0,3.0,4.307692,3.730769,2.544872,0.749141,1.197595,1.355039,1.268217


In [3]:
# Load the raw monthly data from CSV and preview the first rows.
monthly_data = pd.read_csv(filepath_or_buffer='Monthly_Data.csv')
monthly_data.head(n=5)

Unnamed: 0,ID,Month.Number,school,age,date,Number.of.days,EPISODICNUM,EPISODICSEV,EPISODICAVG,EPISODICTOT,...,pcila.pmc,pcila.pmc.past,tola,tola.gmc,tola.pmc,tola.pmc.past,tila,tila.gmc,tila.pmc,tila.pmc.past
0,1001,0,1,16.88,,,1.0,3.0,3.0,3.0,...,,,,-1042.290125,,,,-1169.617826,,
1,1001,1,1,16.88,6/7/16,36.0,1.0,3.0,3.0,3.0,...,53.127273,,73.5,-1042.290125,-14.045455,,163.9,-1169.617826,52.181818,
2,1001,2,1,16.88,7/16/16,39.0,1.0,2.0,2.0,2.0,...,-7.772727,53.127273,68.1,-1042.290125,-19.445455,-14.045455,90.6,-1169.617826,-21.118182,52.181818
3,1001,3,0,16.88,8/13/16,28.0,1.0,2.0,2.0,2.0,...,-18.372727,-7.772727,48.8,-1042.290125,-38.745455,-19.445455,79.1,-1169.617826,-32.618182,-21.118182
4,1001,4,0,16.88,9/6/16,24.0,2.0,4.0,3.0,6.0,...,66.027273,-18.372727,18.9,-1042.290125,-68.645455,-38.745455,32.4,-1169.617826,-79.318182,-32.618182


In [4]:
# Column store for the model-ready frame: every output column starts out as
# its own empty list and is filled in by the person-month cell below.
data_storage = {
    column_name: []
    for column_name in (
        'new_id',
        'day',
        'date',
        'Anxious.morning',
        'Anxious.afternoon',
        'Anxious.night',
        'gad7_total',
        'month_start_date',
        'month_end_date',
    )
}

In [5]:
# Reformat data into "person-months", where a month's worth of data for each person is treated as a separate individual
# Size of months = 28 days; 28 days * 3 anxiety measurements/day = 84 maximum possible anxiety data points
# Each row will also have the GAD-7 score that was measured at the end of the month
#
# How a person-month `ind` is delimited (rows of the participant's monthly data, 0-based):
#   * ind == 0            : from the first momentary date to the date of monthly row 1
#   * ind == last row - 1 : from the date of monthly row `ind` to the last momentary date
#                           (the last monthly row never has a date)
#   * anything in between : from the date of monthly row `ind` to the date of row `ind + 1`
# A person-month is kept only when that span is at least 28 days; the rows emitted are the
# final 28 days of the span, labelled with the GAD-7 total of monthly row `ind + 1`.

PERSON_MONTH_DAYS = 28  # every person-month covers exactly this many consecutive days
MONTHLY_DATE_FORMAT = '%m/%d/%y'  # format of the date strings in monthly_data['date']
EMA_COLUMNS = ['Anxious.morning', 'Anxious.afternoon', 'Anxious.night']


def _parse_monthly_date(value):
    """Parse one entry of the monthly 'date' column.

    Returns a pd.Timestamp, or None when the entry is missing (NaN) or is not
    a '/'-separated date string.
    """
    if not isinstance(value, str) or '/' not in value:
        return None
    return pd.Timestamp(datetime.datetime.strptime(value, MONTHLY_DATE_FORMAT))


def _person_month_window(ind, monthly_dates, first_study_date, last_study_date):
    """Return (start_date, end_date) of person-month `ind`, or None if it must be skipped.

    monthly_dates     -- list of this participant's monthly 'date' entries, in row order
    first_study_date  -- date of the participant's first momentary row
    last_study_date   -- date of the participant's last momentary row

    The window is skipped when a required monthly date is missing/unparseable
    or when the span it closes is shorter than PERSON_MONTH_DAYS.
    """
    if ind == 0:
        span_start = first_study_date
        month_end = _parse_monthly_date(monthly_dates[1])
    elif ind == len(monthly_dates) - 2:
        span_start = _parse_monthly_date(monthly_dates[ind])
        month_end = last_study_date
    else:
        span_start = _parse_monthly_date(monthly_dates[ind])
        month_end = _parse_monthly_date(monthly_dates[ind + 1])

    if span_start is None or month_end is None:
        return None
    # Written as `not (... >= ...)` rather than `<` so that a NaT span
    # (missing momentary date) is skipped too, as `NaT >= x` is False.
    if not (month_end - span_start >= pd.Timedelta(PERSON_MONTH_DAYS, unit="d")):
        return None
    return month_end - pd.Timedelta(PERSON_MONTH_DAYS - 1, unit="d"), month_end


def _ema_for_window(ema_by_date, window_dates):
    """Return the EMA_COLUMNS values for each date in `window_dates`, in order.

    Dates with no momentary row yield NaN in every column (the same marker the
    raw data uses for an unanswered prompt). More than one momentary row for a
    date inside the window is ambiguous and raises ValueError.
    """
    in_window = ema_by_date.loc[ema_by_date.index.isin(window_dates)]
    if in_window.index.has_duplicates:
        raise ValueError('more than one momentary row for a single date in a person-month')
    return in_window.reindex(window_dates)


unique_ids = sorted(set(monthly_data['ID']))
new_ids = []
days = []
dates = []
anxious_mornings = [] # morning anxiety EMA data points
anxious_afternoons = [] # afternoon anxiety EMA data points
anxious_nights = [] # night anxiety EMA data points
gad7_totals = [] # GAD-7 score at end of current month
month_start_dates = []
month_end_dates = []

for un_id in unique_ids:
    # Work on per-participant frames with a fresh 0..n-1 index. Every date
    # lookup below goes through these frames, never through the global
    # monthly_data, whose row labels belong to other participants.
    curr_person_monthly = monthly_data.loc[monthly_data['ID'] == un_id].reset_index(drop=True)
    curr_person_momentary = (
        momentary_data.loc[momentary_data['ID'] == un_id]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .reset_index(drop=True)
    )
    if curr_person_momentary.empty:
        # No EMA rows at all: there is nothing to anchor a person-month on.
        continue

    first_study_date = curr_person_momentary['date'].iloc[0]
    last_study_date = curr_person_momentary['date'].iloc[-1]
    ema_by_date = curr_person_momentary.set_index('date')[EMA_COLUMNS]

    monthly_dates = curr_person_monthly['date'].tolist()
    monthly_gads = curr_person_monthly['gad7_total'].tolist()

    # The last monthly row only supplies the GAD-7 score of the month before
    # it, so it never starts a person-month of its own.
    for ind in range(len(monthly_dates) - 1):
        window = _person_month_window(ind, monthly_dates, first_study_date, last_study_date)
        if window is None:
            continue
        start_date, end_date = window

        window_dates = pd.date_range(start=start_date, periods=PERSON_MONTH_DAYS, freq='D')
        window_ema = _ema_for_window(ema_by_date, window_dates)

        new_ids.extend([str(un_id) + '_' + str(ind)] * PERSON_MONTH_DAYS)
        days.extend(range(PERSON_MONTH_DAYS))
        dates.extend(window_dates)
        anxious_mornings.extend(window_ema['Anxious.morning'].tolist())
        anxious_afternoons.extend(window_ema['Anxious.afternoon'].tolist())
        anxious_nights.extend(window_ema['Anxious.night'].tolist())
        gad7_totals.extend([monthly_gads[ind + 1]] * PERSON_MONTH_DAYS)
        month_start_dates.extend([start_date] * PERSON_MONTH_DAYS)
        month_end_dates.extend([end_date] * PERSON_MONTH_DAYS)

data_storage['new_id'] = new_ids
data_storage['day'] = days
data_storage['date'] = dates
data_storage['Anxious.morning'] = anxious_mornings
data_storage['Anxious.afternoon'] = anxious_afternoons
data_storage['Anxious.night'] = anxious_nights
data_storage['gad7_total'] = gad7_totals
data_storage['month_start_date'] = month_start_dates
data_storage['month_end_date'] = month_end_dates

In [6]:
# Formatted data 
formatted_data_df = pd.DataFrame(data_storage)
formatted_data_df.head()

Unnamed: 0,new_id,day,date,Anxious.morning,Anxious.afternoon,Anxious.night,gad7_total,month_start_date,month_end_date
0,1001_0,0,2016-05-11,,,,5.0,2016-05-11,2016-06-07
1,1001_0,1,2016-05-12,,,,5.0,2016-05-11,2016-06-07
2,1001_0,2,2016-05-13,,,,5.0,2016-05-11,2016-06-07
3,1001_0,3,2016-05-14,,,,5.0,2016-05-11,2016-06-07
4,1001_0,4,2016-05-15,,,,5.0,2016-05-11,2016-06-07


In [7]:
# Write the formatted frame to CSV without the row index.
formatted_data_df.to_csv(path_or_buf='Formatted_Data.csv', index=False)