In [1]:
import json 
import pandas as pd
import os
import numpy as np
import datetime
from datetime import timedelta
import matplotlib
matplotlib.use('Agg')
# %matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import get_data

# Display all columns and rows, and never truncate cell contents.
# `None` is the supported "no limit" value for max_colwidth; the legacy
# sentinel -1 is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Prepare EMA stress data

In [2]:
DIRECTORY = 'dataset/EMA/response/Stress'

# Run the per-participant daily-stress step for every (non-hidden) EMA file.
# Only the first 10 characters of the file name are passed on
# (presumably the file stem expected by get_data -- TODO confirm).
for file in sorted(os.listdir(DIRECTORY)):
    if not file.startswith('.'):
        stress_file = file[0:10]
        get_data.get_daily_stress(stress_file)

# Concatenate all users' tables in one pass (hidden files are skipped).
stress_frames = [
    pd.read_csv('daily_stress/' + file)
    for file in sorted(os.listdir('daily_stress'))
    if not file.startswith('.')
]
if stress_frames:
    stress_all = pd.concat(stress_frames)
else:
    # No per-user table found: keep an empty frame that still has the
    # expected columns so the steps below do not fail on a missing 'date'.
    stress_all = pd.DataFrame(columns=['user', 'date', 'stress_level'])
stress_all['date'] = pd.to_datetime(stress_all['date'])
stress_all.head()

Unnamed: 0,user,date,stress_level
0,0,2013-03-25,2.0
1,0,2013-03-26,1.0
2,0,2013-03-27,1.0
3,0,2013-03-28,3.166667
4,0,2013-03-29,2.5


# prepare sensing data

In [4]:
# get activity and audio data
# The first argument selects the sensing table (presumably 0 -> activity,
# 1 -> audio, matching the CSVs read below -- TODO confirm in get_data);
# the meaning of the second argument (19) is not visible from this notebook.
get_data.get_sensing_table(0,19)
get_data.get_sensing_table(1,19)

In [5]:
# get all processed feature tables

def get_table(csv_file):
    """Convert the ``time`` column of a feature table to pandas datetimes.

    The frame is updated in place and the very same object is returned, so
    the call can be wrapped directly around ``pd.read_csv(...)``.
    """
    parsed_time = pd.to_datetime(csv_file['time'])
    csv_file['time'] = parsed_time
    return csv_file

# Load both processed feature tables, printing a timestamp before and after
# so the cost of reading the CSVs is visible in the cell output.
print(datetime.datetime.now())
act, audio = [get_table(pd.read_csv(csv_name)) for csv_name in ('activity.csv', 'audio.csv')]
print(datetime.datetime.now())

2020-08-12 22:19:08.496691
2020-08-12 22:19:22.228429


In [6]:
# Sanity check: show the last three rows of the activity feature table.
act.tail(3)

Unnamed: 0,stationary,walking,running,activity_unknown,stationary_cumu,stationary_cumu_normal,walking_cumu,walking_cumu_normal,running_cumu,running_cumu_normal,activity_unknown_cumu,activity_unknown_cumu_normal,time,user
3657597,15,4,0,41,63364,0.999606,6790,0.99721,3032,1.0,13094,0.994305,2013-05-31 23:57:00,59
3657598,0,19,0,41,63364,0.999606,6809,1.0,3032,1.0,13135,0.997418,2013-05-31 23:58:00,59
3657599,25,0,0,34,63389,1.0,6809,1.0,3032,1.0,13169,1.0,2013-05-31 23:59:00,59


In [7]:
# Sanity check: show the last three rows of the audio feature table.
audio.tail(3)

Unnamed: 0,silence,voice,noise,audio_unknown,silence_cumu,silence_cumu_normal,voice_cumu,voice_cumu_normal,noise_cumu,noise_cumu_normal,audio_unknown_cumu,audio_unknown_cumu_normal,time,user
3656157,0,0,60,0,45292,1.0,8585,0.999767,32403,0.996372,0,0,2013-05-31 23:57:00,59
3656158,0,2,58,0,45292,1.0,8587,1.0,32461,0.998155,0,0,2013-05-31 23:58:00,59
3656159,0,0,60,0,45292,1.0,8587,1.0,32521,1.0,0,0,2013-05-31 23:59:00,59


# compute Stability Index

In [8]:
# Both constants are passed to compute_stability_index (as NUM_DAYS_USED and
# NUM_DAYS_BLOCKED) and are also embedded in the output file/folder names.
TRAINING_WINDOW = 14 # number of days over which we compute the Stability Index
BLOCK_WINDOW = 0 # number of days prior to each ema response we block

def get_distance(ts1, ts2, mode):
    """Summarise the element-wise absolute difference of two series.

    Parameters
    ----------
    ts1, ts2 : pandas.Series
        Series to compare; they are subtracted with normal pandas index
        alignment, so positions present in only one of them yield NaN
        (which the pandas reductions below skip).
    mode : str or int
        Which summary to return. A string is matched by substring:
        ``'mean'``, ``'median'``/``'med'`` or ``'max'`` (checked in that
        order). The integer codes announced by the error message are also
        accepted: ``0`` = mean, ``1`` = median, ``2`` = max.

    Returns
    -------
    The requested summary of ``|ts1 - ts2|``, or ``None`` (after printing a
    message) when ``mode`` is not recognised.
    """
    # The invalid-mode message documents integer codes, but `'mean' in 0`
    # would raise TypeError; translate the codes so they actually work.
    if isinstance(mode, int) and not isinstance(mode, bool):
        mode = {0: 'mean', 1: 'median', 2: 'max'}.get(mode, '')

    diff = (ts1 - ts2).abs()
    if 'mean' in mode:
        return diff.mean()
    elif 'median' in mode or 'med' in mode:
        return diff.median()
    elif 'max' in mode:
        return diff.max()
    else:
        print('invalid mode: 0: mean; 1: median; 2: max')
        return None

In [9]:
# Colour cycle used for the per-day curves (re-used from the start when a
# window holds more days than there are colours).
_T10_COLORS = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
               'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']


def _day_profiles(behavior_by_time, day, columns):
    """Return one Series per requested column holding the rows of one calendar day.

    Rows are selected with ``.loc`` and a ``YYYY-MM-DD`` string (row selection
    through ``frame['YYYY-MM-DD']`` is no longer supported by pandas). The
    time index is dropped so that two days can be compared position by position.
    """
    day_rows = behavior_by_time.loc[day.strftime("%Y-%m-%d")]
    day_rows = day_rows[columns].reset_index(drop=True)
    return tuple(day_rows[column] for column in columns)


def _save_day_curves(labelled_curves, ylabel, out_dir, file_name, title=None):
    """Plot one line per day against a 0..1439 x axis and save the figure as a PDF."""
    graph = pd.DataFrame({'x': range(0, 1440)})
    for label, curve in labelled_curves:
        # Assignment aligns on the 0..1439 index: shorter days are padded
        # with NaN, anything beyond 1440 rows is ignored.
        graph[label] = curve

    plt.figure()
    for position, column in enumerate(graph.drop('x', axis=1)):
        plt.plot(graph['x'], graph[column], marker='',
                 color=_T10_COLORS[position % len(_T10_COLORS)],
                 linewidth=1, alpha=0.9, label=column)
    plt.legend(loc=2, ncol=2)
    if title is not None:
        plt.title(title)
    plt.xlabel("Time (hours)")
    plt.xticks(np.arange(0, 1441, step=180), ('0', '3', '6', '9', '12', '15', '18', '21', '24'))
    plt.ylabel(ylabel)

    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, file_name), bbox_inches='tight')
    plt.close()


def compute_stability_index(behavior, behavior_name, NUM_DAYS_USED, NUM_DAYS_BLOCKED, FILTERING):
    """Compute per-EMA-day stability features of one behaviour for every user.

    For each user in the module-level ``stress_all`` table and each of that
    user's EMA dates, the days of ``behavior`` data inside the look-back
    window are collected, every pair of days is compared with
    ``get_distance(..., 'mean')``, and the median of those pairwise distances
    is stored for the cumulative, normalised-cumulative and raw series.

    Parameters
    ----------
    behavior : pandas.DataFrame
        Feature table with a datetime ``time`` column, a ``user`` column and
        the columns ``<behavior_name>``, ``<behavior_name>_cumu`` and
        ``<behavior_name>_cumu_normal``.
    behavior_name : str
        Base name of the behaviour columns (e.g. ``'walking'``).
    NUM_DAYS_USED : int
        Number of days in the window used to compute the features.
    NUM_DAYS_BLOCKED : int
        Number of days ending at (and including) the EMA day that are
        excluded. With 0, the EMA day itself is part of the window.
    FILTERING
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``stress_all`` (grouped by user) with six added columns:
        ``<name>_med_mean``, ``<name>_med_mean_normal`` (1 - median distance),
        ``<name>_med_mean_raw``, ``<name>_avg``, ``<name>_std`` (mean / std of
        each day's maximum cumulative value) and ``<name>_num_good_days``.

    Side effects
    ------------
    Reads the global ``stress_all``; writes three PDFs per EMA day that has at
    least one usable day under ``graphs_ema_<used>_<blocked>/``; prints a
    progress line and a timestamp after each user.
    """
    cumu_col = behavior_name + '_cumu'
    normal_col = behavior_name + '_cumu_normal'
    raw_col = behavior_name
    columns = [cumu_col, normal_col, raw_col]
    graph_root = 'graphs_ema_' + str(NUM_DAYS_USED) + '_' + str(NUM_DAYS_BLOCKED)

    user_tables = []
    for user in stress_all['user'].unique():
        table_user = stress_all.loc[stress_all.user == user].copy()
        behavior_by_time = behavior.loc[behavior.user == user].set_index('time').sort_index()

        # Distinct calendar days (as midnight Timestamps) that have data for
        # this user, in chronological order.
        user_days = list(pd.to_datetime(behavior_by_time.index.strftime("%Y-%m-%d").unique()))

        # Consecutive EMA days share most of their window, so each day's
        # slices are extracted once per user and re-used.
        profiles_by_day = {}

        absolute_cumu_med_mean_diff = []
        normal_cumu_med_mean_diff = []
        raw_med_mean_diff = []
        behavior_mean = []
        behavior_std = []
        num_good_days = []

        for this_ema_day in table_user['date']:
            starting_day = this_ema_day - timedelta(days=NUM_DAYS_USED + NUM_DAYS_BLOCKED - 1)
            if NUM_DAYS_BLOCKED > 0:
                # Blocked days are the EMA day and the days just before it.
                blocked_days = set(this_ema_day - pd.to_timedelta(np.arange(NUM_DAYS_BLOCKED), 'd'))
                days_used = [day for day in user_days
                             if day < this_ema_day and day >= starting_day
                             and day not in blocked_days]
            else:
                days_used = [day for day in user_days
                             if day <= this_ema_day and day >= starting_day]
            num_good_days.append(len(days_used))

            profiles = []
            for day in days_used:
                if day not in profiles_by_day:
                    profiles_by_day[day] = _day_profiles(behavior_by_time, day, columns)
                profiles.append(profiles_by_day[day])

            # Distance between every unordered pair of days in the window.
            diff_mean = []
            diff_normal_mean = []
            diff_raw_mean = []
            for j in range(len(profiles)):
                for k in range(j + 1, len(profiles)):
                    diff_mean.append(get_distance(profiles[j][0], profiles[k][0], 'mean'))
                    diff_normal_mean.append(get_distance(profiles[j][1], profiles[k][1], 'mean'))
                    diff_raw_mean.append(get_distance(profiles[j][2], profiles[k][2], 'mean'))

            # Explicit float dtype: with fewer than two days the lists are
            # empty and the medians must come out as NaN.
            absolute_median = pd.Series(diff_mean, dtype=float).median()
            normal_stability = 1.0 - pd.Series(diff_normal_mean, dtype=float).median()
            raw_median = pd.Series(diff_raw_mean, dtype=float).median()

            absolute_cumu_med_mean_diff.append(absolute_median)
            normal_cumu_med_mean_diff.append(normal_stability)
            raw_med_mean_diff.append(raw_median)

            # Each day's maximum of the cumulative series is that day's amount.
            behavior_amount = pd.Series([cumu.max() for cumu, _, _ in profiles], dtype=float)
            behavior_mean.append(behavior_amount.mean())
            behavior_std.append(behavior_amount.std())

            if days_used:
                day_labels = [day.strftime("%Y-%m-%d") for day in days_used]
                file_stem = str(user) + '_' + this_ema_day.strftime("%Y-%m-%d")

                # absolute
                _save_day_curves(
                    zip(day_labels, [p[0] for p in profiles]),
                    behavior_name + ' duration',
                    graph_root + '/' + behavior_name,
                    file_stem + '_' + str(absolute_median) + '.pdf',
                    title='cumulative duration')

                # normal (label now names the plotted behaviour instead of a
                # hard-coded "call")
                _save_day_curves(
                    zip(day_labels, [p[1] for p in profiles]),
                    'Normalized cumulative ' + behavior_name + ' duration',
                    graph_root + '/' + behavior_name + '_normal',
                    file_stem + '_normal_' + str(normal_stability) + '.pdf')

                # raw
                _save_day_curves(
                    zip(day_labels, [p[2] for p in profiles]),
                    'Raw ' + behavior_name + ' duration',
                    graph_root + '/' + behavior_name + '_raw',
                    file_stem + '_raw_' + str(raw_median) + '.pdf')

        # put everything into the user's table
        table_user[behavior_name + '_med_mean'] = absolute_cumu_med_mean_diff
        table_user[behavior_name + '_med_mean_normal'] = normal_cumu_med_mean_diff
        table_user[behavior_name + '_med_mean_raw'] = raw_med_mean_diff
        table_user[behavior_name + '_avg'] = behavior_mean
        table_user[behavior_name + '_std'] = behavior_std
        table_user[behavior_name + '_num_good_days'] = num_good_days
        user_tables.append(table_user)
        print('finished user: ' + str(user))
        print(datetime.datetime.now())

    # Concatenate once at the end; an empty frame is returned when there are
    # no users at all.
    if not user_tables:
        return pd.DataFrame()
    return pd.concat(user_tables)

In [10]:
# Source table for each behaviour, position-aligned with `behavior_names`:
# the first four names are columns of `act`, the last four of `audio`.
behaviors = [act] * 4 + [audio] * 4
behavior_names = [
    'stationary', 'walking', 'running', 'activity_unknown',
    'silence', 'voice', 'noise', 'audio_unknown',
]

In [11]:
print('the training window is:', TRAINING_WINDOW)
print('blocking previous {} days'.format(BLOCK_WINDOW))

# One pass per behaviour: compute its feature table and store it as
# table_ema_<window>_<blocked>_<position>.csv (index column included).
for i, behavior in enumerate(behaviors):
    behav_table = compute_stability_index(behavior, behavior_names[i], TRAINING_WINDOW, BLOCK_WINDOW, 19)
    behav_table.to_csv('table_ema_{}_{}_{}.csv'.format(TRAINING_WINDOW, BLOCK_WINDOW, i))
    print('finished behavior {}'.format(i))
    print()
    print()

the training window is: 14
blocking previous 0 days


In [12]:
def get_full_table(num_days_used, num_days_blocked):
    """Merge the per-behaviour CSV tables into one wide table.

    The table of the first behaviour (file suffix ``_0``) provides the shared
    columns; from every further file only the columns whose name contains
    that behaviour's entry in the global ``behavior_names`` are appended,
    side by side. Rows are matched by position (the index produced by
    ``read_csv``), so all files must list the same rows in the same order.
    """
    file_prefix = 'table_ema_' + str(num_days_used) + '_' + str(num_days_blocked) + '_'
    merged = pd.read_csv(file_prefix + '0.csv').drop(['Unnamed: 0'], axis=1)
    for position, name in enumerate(behavior_names[1:], start=1):
        extra = pd.read_csv(file_prefix + str(position) + '.csv')
        wanted = [column for column in extra.columns if name in column]
        merged = pd.concat([merged, extra[wanted]], axis=1)
    return merged

# Build the merged table for the configured windows and store it without
# the index column.
table0 = get_full_table(TRAINING_WINDOW, BLOCK_WINDOW)
table0.to_csv('table_ema_{}_{}.csv'.format(TRAINING_WINDOW, BLOCK_WINDOW), index=False)

In [13]:
# Preview the merged table: the stress columns plus six feature columns per behaviour.
table0.head(5)

Unnamed: 0,user,date,stress_level,stationary_med_mean,stationary_med_mean_normal,stationary_med_mean_raw,stationary_avg,stationary_std,stationary_num_good_days,walking_med_mean,walking_med_mean_normal,walking_med_mean_raw,walking_avg,walking_std,walking_num_good_days,running_med_mean,running_med_mean_normal,running_med_mean_raw,running_avg,running_std,running_num_good_days,activity_unknown_med_mean,activity_unknown_med_mean_normal,activity_unknown_med_mean_raw,activity_unknown_avg,activity_unknown_std,activity_unknown_num_good_days,silence_med_mean,silence_med_mean_normal,silence_med_mean_raw,silence_avg,silence_std,silence_num_good_days,voice_med_mean,voice_med_mean_normal,voice_med_mean_raw,voice_avg,voice_std,voice_num_good_days,noise_med_mean,noise_med_mean_normal,noise_med_mean_raw,noise_avg,noise_std,noise_num_good_days,audio_unknown_med_mean,audio_unknown_med_mean_normal,audio_unknown_med_mean_raw,audio_unknown_avg,audio_unknown_std,audio_unknown_num_good_days
0,0,2013-03-26,2.0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0
1,0,2013-03-27,1.0,,,,58890.0,,1,,,,7906.0,,1,,,,3383.0,,1,,,,1820.0,,1,,,,42343.0,,1,,,,16015.0,,1,,,,13501.0,,1,,,,0.0,,1
2,0,2013-03-28,1.0,12680.482639,0.930577,21.286111,68025.0,12918.840892,2,814.039583,0.931265,7.677778,6376.0,2163.74675,2,905.755556,0.593291,3.665278,2834.0,776.403246,2,264.8375,0.894992,2.469444,1899.0,111.722871,2,13085.585417,0.939034,24.665278,52127.0,13836.665494,2,1405.73125,0.920822,13.280556,15553.0,653.366666,2,902.088889,0.862337,11.88125,11449.5,2901.259123,2,0.0,1.0,0.0,0.0,0.0,2
3,0,2013-03-29,2.857143,10305.543056,0.930577,18.730556,68879.333333,9254.07377,3,1676.808333,0.931265,9.232639,7893.666667,3041.518754,3,481.836806,0.641473,3.238194,2358.0,990.519561,3,471.355556,0.895932,2.6875,2359.333333,801.224896,3,10197.5625,0.950237,24.665278,49931.666667,10496.910085,3,3700.772917,0.869487,13.280556,16982.666667,2518.984782,3,6442.805556,0.862337,14.159028,14602.333333,5833.50035,3,0.0,1.0,0.0,0.0,0.0,3
4,0,2013-03-30,3.0,6964.770833,0.959737,15.157639,71750.5,9490.327339,4,1827.773611,0.85017,8.455208,6857.5,3234.468993,4,495.382292,0.670252,2.619444,1886.0,1243.069588,4,573.293056,0.84221,2.580556,2186.5,739.905174,4,9329.437847,0.94924,22.194444,51580.75,9183.389947,4,3273.903819,0.859816,14.136458,16161.0,2632.628724,4,3863.595139,0.866609,14.256944,14977.75,4821.84974,4,0.0,1.0,0.0,0.0,0.0,4


# Correlation

In [14]:
# Re-load the merged table written above so the correlation section can be run
# on its own. No date parsing is requested, so 'date' is read back as text.
table0 = pd.read_csv('table_ema_'+str(TRAINING_WINDOW)+'_'+str(BLOCK_WINDOW)+'.csv')

In [15]:
# Filter the EMA rows with the project helper and report how many rows and
# users remain. NOTE(review): the meaning of the threshold 7 is defined in
# get_data and is not visible here -- confirm before changing it.
# This cell overwrites `table0`; re-run the load cell above before re-running it.
print('length before filtering: ', len(table0))
table0 = get_data.get_filtered_ema_from_table(7, table0)
print('length after filtering: ', len(table0))
print('number of users: ', len(table0['user'].unique()))

length before filtering:  1256
length after filtering:  1036
number of users:  46


In [16]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps the text 'date' column out of corr(): older pandas dropped
# it silently, current pandas raises on it.
corr_table = table0.select_dtypes(include=['number', 'bool']).corr(method='pearson')

# Keep only the stability-index rows and the stress-level column.
row_index = [r for r in corr_table.index if "med_mean_normal" in r]
col_index = [c for c in corr_table.columns if 'level' in c]
corr_table_small = corr_table.loc[row_index, col_index].copy()

# Attach the matching p-values computed by the project helper.
p_table = get_data.get_p_values(table0)
p_table_small = p_table.loc[row_index, col_index]
corr_table_small['p_value'] = p_table_small['stress_level']
corr_table_small = corr_table_small.sort_values(['stress_level'], ascending=False)

# audio_unknown is excluded from the report (its stability index looks
# constant in the table preview, so a correlation would be meaningless --
# TODO confirm that this is the reason).
corr_table_small = corr_table_small.drop('audio_unknown_med_mean_normal')
names = corr_table_small.index

# Present '<behaviour>_med_mean_normal' rows as '<behaviour>_stability_index'.
df_new = corr_table_small.rename(
    index=lambda name: name[:-len('med_mean_normal')] + 'stability_index')
df_new



Unnamed: 0,stress_level,p_value
activity_unknown_stability_index,0.069692,0.024884
walking_stability_index,0.005937,0.848624
voice_stability_index,0.005786,0.852425
noise_stability_index,0.002229,0.942871
running_stability_index,-0.070093,0.024063
stationary_stability_index,-0.114894,0.000211
silence_stability_index,-0.134333,1.4e-05
