In [45]:
import json 
import pandas as pd
import os
import numpy as np
import datetime
from datetime import timedelta
import matplotlib
matplotlib.use('Agg')
# %matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Display all columns and rows, and never truncate cell contents.
# None is the documented "no limit" value for max_colwidth; the old -1
# sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# prepare ema data

## stress

In [3]:
def get_daily_stress(file):
    '''Compute one user's mean stress level per day and save it as a CSV.

    Reads ``DIRECTORY/<file>.json`` (``DIRECTORY`` is a module-level name),
    keeps every response that has a 'level' key and no 'null' key, averages
    the levels per calendar day and writes
    ``daily_stress/daily_stress_u<user>.csv`` with the columns
    user, date, stress_level. Nothing is written when no response qualifies.

    Parameters
    ----------
    file : str
        File name without the ``.json`` extension; its last two characters
        are used as the user ID.
    '''
    # get user ID
    user = file[-2:]

    with open(os.path.join(DIRECTORY, str(file) + '.json')) as f:
        data = json.load(f)

    # Only the level and the response time are needed; the 'location' field
    # is deliberately not read so a response without it no longer fails.
    # fromtimestamp() converts to the local time zone of the machine.
    records = [
        {
            'stress_level': int(d['level']),
            'time': pd.Timestamp(datetime.datetime.fromtimestamp(d['resp_time'])),
        }
        for d in data
        if 'null' not in d and 'level' in d
    ]
    if not records:
        return

    stress_df = pd.DataFrame(records).sort_values(by=['time'])
    stress_df['date'] = stress_df.apply(calculate_date, axis=1)
    stress_df = stress_df.groupby('date', as_index=False)['stress_level'].mean()
    stress_df['user'] = user
    stress_df = stress_df.reindex(columns=['user', 'date', 'stress_level'])

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs('daily_stress', exist_ok=True)
    stress_df.to_csv(os.path.join('daily_stress', 'daily_stress_u' + user + '.csv'), index=False)

def calculate_date(row):
    '''Return the first ten characters of the string form of row['time'].

    For ISO-formatted timestamps this is the YYYY-MM-DD date part.
    '''
    time_text = str(row['time'])
    return time_text[:10]

DIRECTORY = 'dataset/EMA/response/Stress'
for file in sorted(os.listdir(DIRECTORY)):
    # skip hidden files
    if file.startswith('.'):
        continue
    print(file)

    # strip the extension rather than assuming a ten-character file stem
    stress_file = os.path.splitext(file)[0]
    get_daily_stress(stress_file)

In [16]:
# concat all users' tables (read every file first, then concatenate once)
stress_frames = [
    pd.read_csv(os.path.join('daily_stress', file))
    for file in sorted(os.listdir('daily_stress'))
    if not file.startswith('.')
]
if stress_frames:
    stress_all = pd.concat(stress_frames)
else:
    # no per-user table available: keep an empty frame with the expected columns
    stress_all = pd.DataFrame(columns=['user', 'date', 'stress_level'])
stress_all['date'] = pd.to_datetime(stress_all['date'])
stress_all.head()

Unnamed: 0,user,date,stress_level
0,0,2013-03-26,2.0
1,0,2013-03-27,1.0
2,0,2013-03-28,1.0
3,0,2013-03-29,2.857143
4,0,2013-03-30,3.0


In [10]:
# user IDs present in the combined daily stress table
stress_all['user'].unique()

array([ 0,  1,  2,  3,  4,  5,  7,  8,  9, 10, 12, 14, 15, 16, 17, 18, 19,
       20, 22, 23, 24, 25, 27, 30, 31, 32, 33, 34, 35, 36, 39, 41, 42, 43,
       44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59])

# prepare sensing data

## get activity and audio

In [84]:
def get_daily_cumu_df(df, isNormalized):
    '''Return the running total of a series, optionally scaled so it ends at 1.

    Parameters
    ----------
    df : pandas.Series
        Values to accumulate, in the order they should be summed (callers in
        this notebook pass a single DataFrame column). It is not modified.
    isNormalized : bool
        When true and the final total is non-zero, the cumulative values are
        converted to float64 and divided by that total. A zero total is
        returned unscaled to avoid dividing by zero.

    Returns
    -------
    pandas.Series
        The cumulative series, sorted by index. Empty input gives an empty result.
    '''
    # skipna=False makes a missing value propagate, like a plain running sum
    daily_df = df.cumsum(skipna=False)
    if isNormalized and len(daily_df) > 0:
        max_val = daily_df.iloc[-1]
        if max_val != 0:
            daily_df = daily_df.astype('float64') / np.float64(max_val)
    return daily_df.sort_index()

def get_sensing_table(behavior_index, good_day_threshold):
    '''Build the per-minute sensing table of one sensor for all EMA users and save it.

    For every user with a file in ``daily_stress`` the raw inference log
    ``sensing_path[behavior_index] + <user_id> + '.csv'`` is read, forward-filled
    to one sample per second, and counted per minute and per inference class.
    Each day is expanded to 1440 minute rows (missing minutes are 0) and gets a
    cumulative and a normalized cumulative column per class. Days whose total
    number of per-second samples is below ``good_day_threshold`` hours are
    dropped. The result is written to ``behavior_names[behavior_index] + '.csv'``.

    Relies on the module-level names ``sensing_path``, ``sensing_types``,
    ``behavior_names`` and ``get_daily_cumu_df``.
    NOTE: ``behavior_names`` is rebound to the eight per-class names by a later
    cell of this notebook; this function must run while it still holds the
    sensor names.

    Parameters
    ----------
    behavior_index : int
        Position of the sensor in ``sensing_path`` / ``sensing_types`` / ``behavior_names``.
    good_day_threshold : int or float
        Minimum number of hours of sensing data a day needs to be kept.
    '''
    type_names = sensing_types[behavior_index]
    one_ns = pd.Timedelta(1, unit='ns')
    good_day_frames = []
    filtered_count = 0
    all_data_points = 0
    # only look at users with EMA data
    for file in sorted(os.listdir('daily_stress')):
        if file.startswith('.'):
            continue
        # 'daily_stress_uNN.csv' -> 'NN'
        user_id = file[-6:-4]
        print()
        print(user_id)
        behavior_user = pd.read_csv(sensing_path[behavior_index] + user_id + '.csv')
        behavior_user.columns = ['timestamp', 'inference']
        behavior_user['time'] = pd.to_datetime(behavior_user.timestamp, unit='s')
        behavior_user = behavior_user.set_index('time')
        behavior_user = behavior_user.loc[~behavior_user.index.duplicated(keep='first')]
        behavior_user = behavior_user.sort_index()

        # get all days for this user
        days = behavior_user.resample('D').count().index

        for day in days:
            print('day', day)
            # Explicit label slices replace DataFrame[<date string>], which
            # current pandas no longer treats as a row selection.
            day_end = day + pd.Timedelta(days=1) - one_ns
            behavior_user_day = behavior_user.loc[day:day_end].resample('s').ffill()
            day_minutes = day + pd.to_timedelta(np.arange(1440), 'm')
            available_minutes = behavior_user_day.resample('min').count().index

            # one list of per-minute counts for each inference class
            available_counts = [[] for _ in type_names]
            for minute in available_minutes:
                minute_end = minute + pd.Timedelta(minutes=1) - one_ns
                behavior_user_minute = behavior_user_day.loc[minute:minute_end]
                counts_by_class = behavior_user_minute.groupby('inference').count()
                for i in range(len(type_names)):
                    if i in counts_by_class.index:
                        available_counts[i].append(counts_by_class.loc[i, 'timestamp'])
                    else:
                        available_counts[i].append(0)

            df_user_day = pd.DataFrame(
                dict(zip(type_names, available_counts)),
                index=available_minutes,
                dtype='int64',
            )
            # expand to every minute of the day; minutes without data count as 0
            df_user_day = df_user_day.reindex(day_minutes, fill_value=0)
            for name in type_names:
                df_user_day[name + '_cumu'] = get_daily_cumu_df(df_user_day[name].copy(), False)
                df_user_day[name + '_cumu_normal'] = get_daily_cumu_df(df_user_day[name].copy(), True)

            # keep the day only if it has at least good_day_threshold hours of
            # sensing data (the last cumulative value is the day's total in seconds)
            total_sensing = sum(df_user_day[name + '_cumu'].iloc[-1] for name in type_names)
            if total_sensing >= good_day_threshold*60*60:
                df_user_day['time'] = df_user_day.index
                df_user_day = df_user_day.reset_index(drop=True)
                df_user_day['user'] = user_id
                good_day_frames.append(df_user_day)
            else:
                filtered_count += 1
            all_data_points += 1
        print('filter', filtered_count)
        print(all_data_points)
        print(datetime.datetime.now())

    # concatenate once instead of growing a DataFrame inside the loop
    behavior_table_all = pd.concat(good_day_frames) if good_day_frames else pd.DataFrame()
    behavior_table_all.to_csv(behavior_names[behavior_index] + '.csv', index = False)

In [6]:
# path prefix of the per-user raw inference logs, one entry per sensor
sensing_path = [
    'dataset/sensing/activity/activity_u',
    'dataset/sensing/audio/audio_u',
]
# class names of each sensor, listed in inference-code order; for activity:
# 0 stationary, 1 walking, 2 running, 3 unknown
# (the audio codes are presumably ordered the same way - TODO confirm)
sensing_types = [
    ['stationary', 'walking', 'running', 'activity_unknown'],
    ['silence', 'voice', 'noise', 'audio_unknown'],
]
# output file stems; note that a later cell rebinds behavior_names
behavior_names = ['activity', 'audio']

# build both tables, keeping days with at least 19 hours of data
for behavior_index in range(len(behavior_names)):
    get_sensing_table(behavior_index, 19)

# compute Stability Index

In [130]:
# number of days, counted back from each EMA day, whose sensing data is compared
TRAINING_WINDOW = 14
# number of most recent days excluded from that comparison (0 keeps the EMA day itself)
BLOCK_WINDOW = 0

def get_table(csv_file):
    '''Parse the 'time' column of a loaded table into datetimes.

    The frame is modified in place and the same object is returned.
    '''
    parsed_time = pd.to_datetime(csv_file['time'])
    csv_file['time'] = parsed_time
    return csv_file

def get_distance(ts1, ts2, mode):
    '''Summarize the absolute element-wise difference of two series.

    `mode` is matched by substring: 'mean' gives the mean, 'med' (which also
    covers 'median') the median and 'max' the maximum of |ts1 - ts2|.
    Any other mode prints a message and returns None.
    '''
    abs_gap = (ts1 - ts2).abs()
    if 'mean' in mode:
        return abs_gap.mean()
    if 'med' in mode:
        return abs_gap.median()
    if 'max' in mode:
        return abs_gap.max()
    print('invalid mode: 0: mean; 1: median; 2: max')
    return None
    
# get all processed feature tables
# (the timestamps printed before and after bracket the loading time)
print(datetime.datetime.now())

# per-minute tables written by get_sensing_table, with 'time' parsed to datetimes
act = get_table(pd.read_csv('activity.csv'))
audio = get_table(pd.read_csv('audio.csv'))
print(datetime.datetime.now())

2020-08-09 15:31:50.654267
2020-08-09 15:32:08.793721


In [133]:
def _save_day_curves(curves, y_label, out_dir, file_name, title=None):
    '''Plot one line per day column of `curves` against its 'x' column and save it as out_dir/file_name.'''
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
    plt.figure()
    for num, column in enumerate(curves.drop('x', axis=1)):
        # reuse the ten colors cyclically when more than ten days are drawn
        plt.plot(curves['x'], curves[column], marker='', color=colors[num % len(colors)],
                 linewidth=1, alpha=0.9, label=column)
    plt.legend(loc=2, ncol=2)
    if title is not None:
        plt.title(title)
    plt.xlabel("Time (hours)")
    # x is the row position within the day; one tick every 180 rows (3 hours of minutes)
    plt.xticks(np.arange(0, 1441, step=180), ('0', '3', '6', '9', '12', '15', '18', '21', '24'))
    plt.ylabel(y_label)
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, file_name), bbox_inches='tight')
    plt.close()


def compute_stability_index(behavior, behavior_name, NUM_DAYS_USED, NUM_DAYS_BLOCKED, FILTERING):
    '''Compute per-EMA-day stability features of one behavior for every user.

    For each user in the module-level ``stress_all`` table and each of the
    user's EMA days, the days of sensing data inside the look-back window are
    compared pairwise with ``get_distance(..., 'mean')`` on three curves: the
    cumulative, the normalized cumulative and the raw per-minute series. The
    median of those pairwise distances is stored per EMA day (for the
    normalized curve as ``1.0 - median``), together with the mean and standard
    deviation of the daily totals and the number of days used. Three PDF plots
    of the compared days are saved per EMA day under
    ``graphs_ema_<NUM_DAYS_USED>_<NUM_DAYS_BLOCKED>/``.

    Parameters
    ----------
    behavior : pandas.DataFrame
        Per-minute table with the columns 'user', 'time' (datetime),
        ``behavior_name``, ``behavior_name + '_cumu'`` and
        ``behavior_name + '_cumu_normal'``.
    behavior_name : str
        Name of the behavior column to analyse.
    NUM_DAYS_USED : int
        Length of the look-back window in days.
    NUM_DAYS_BLOCKED : int
        Number of most recent days (starting with the EMA day) excluded from
        the window; with 0 the EMA day itself is included.
    FILTERING
        Not used; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``stress_all`` with the feature columns
        ``<behavior_name>_med_mean``, ``_med_mean_normal``, ``_med_mean_raw``,
        ``_avg``, ``_std`` and ``_num_good_days`` added.
    '''
    cumu_col = behavior_name + '_cumu'
    normal_col = behavior_name + '_cumu_normal'
    graph_root = 'graphs_ema_' + str(NUM_DAYS_USED) + '_' + str(NUM_DAYS_BLOCKED)

    user_tables = []
    for user in stress_all['user'].unique():
        table_user = stress_all.loc[stress_all.user == user].copy()
        behavior_user = behavior.loc[behavior.user == user].set_index('time').sort_index()
        curves_user = behavior_user[[behavior_name, cumu_col, normal_col]]

        # Split the user's data into one frame per calendar day, once per user.
        # Each frame gets a fresh 0..n-1 index so that two days align row by row.
        daily = {
            day: day_frame.reset_index(drop=True)
            for day, day_frame in curves_user.groupby(curves_user.index.normalize())
        }
        # available days for this user, in ascending order
        user_days = list(daily)

        # set of features, one entry per EMA day
        normal_cumu_med_mean_diff = []
        absolute_cumu_med_mean_diff = []
        raw_med_mean_diff = []
        behavior_mean = []
        behavior_std = []
        num_good_days = []

        # compute feature for each ema day
        for this_ema_day in table_user['date']:
            starting_day = this_ema_day - timedelta(days=NUM_DAYS_USED + NUM_DAYS_BLOCKED - 1)
            if NUM_DAYS_BLOCKED > 0:
                # the EMA day and the days right before it are blocked
                blocked_days = set(this_ema_day - pd.to_timedelta(np.arange(NUM_DAYS_BLOCKED), 'd'))
                days_used = [d for d in user_days
                             if starting_day <= d < this_ema_day and d not in blocked_days]
            else:
                days_used = [d for d in user_days if starting_day <= d <= this_ema_day]
            num_good_days.append(len(days_used))

            # distance between every pair of days in the window
            day_frames = [daily[day] for day in days_used]
            diff_mean = []
            diff_normal_mean = []
            diff_raw_mean = []
            for j, first in enumerate(day_frames):
                for second in day_frames[j + 1:]:
                    diff_mean.append(get_distance(first[cumu_col], second[cumu_col], 'mean'))
                    diff_normal_mean.append(get_distance(first[normal_col], second[normal_col], 'mean'))
                    diff_raw_mean.append(get_distance(first[behavior_name], second[behavior_name], 'mean'))

            # an explicit dtype keeps the median NaN (not an error) when there is no pair
            absolute_median = pd.Series(diff_mean, dtype='float64').median()
            normal_stability = 1.0 - pd.Series(diff_normal_mean, dtype='float64').median()
            raw_median = pd.Series(diff_raw_mean, dtype='float64').median()

            # accumulate values of each ema day for this user
            absolute_cumu_med_mean_diff.append(absolute_median)
            normal_cumu_med_mean_diff.append(normal_stability)
            raw_med_mean_diff.append(raw_median)

            # graph data (one column per day) and the total amount of each day
            df_graphs = pd.DataFrame({'x': range(0, 1440)})
            df_graphs_normal = pd.DataFrame({'x': range(0, 1440)})
            df_graphs_raw = pd.DataFrame({'x': range(0, 1440)})
            behavior_amount = []
            for day, day_frame in zip(days_used, day_frames):
                day_label = day.strftime("%Y-%m-%d")
                df_graphs[day_label] = day_frame[cumu_col]
                df_graphs_normal[day_label] = day_frame[normal_col]
                df_graphs_raw[day_label] = day_frame[behavior_name]
                behavior_amount.append(day_frame[cumu_col].max())
            behavior_amount = pd.Series(behavior_amount, dtype='float64')
            behavior_mean.append(behavior_amount.mean())
            behavior_std.append(behavior_amount.std())

            # plot only when the window contains at least one day
            if len(df_graphs.columns) > 1:
                file_stem = str(user) + '_' + this_ema_day.strftime("%Y-%m-%d")
                # absolute
                _save_day_curves(df_graphs, behavior_name + ' duration',
                                 os.path.join(graph_root, behavior_name),
                                 file_stem + '_' + str(absolute_median) + '.pdf',
                                 title='cumulative duration')
                # normal (the label names the plotted behavior, not "call")
                _save_day_curves(df_graphs_normal,
                                 'Normalized cumulative ' + behavior_name + ' duration',
                                 os.path.join(graph_root, behavior_name + '_normal'),
                                 file_stem + '_normal_' + str(normal_stability) + '.pdf')
                # raw
                _save_day_curves(df_graphs_raw, 'Raw ' + behavior_name + ' duration',
                                 os.path.join(graph_root, behavior_name + '_raw'),
                                 file_stem + '_raw_' + str(raw_median) + '.pdf')

        # put everything into the user's table
        table_user[behavior_name + '_med_mean'] = absolute_cumu_med_mean_diff
        table_user[behavior_name + '_med_mean_normal'] = normal_cumu_med_mean_diff
        table_user[behavior_name + '_med_mean_raw'] = raw_med_mean_diff
        table_user[behavior_name + '_avg'] = behavior_mean
        table_user[behavior_name + '_std'] = behavior_std
        table_user[behavior_name + '_num_good_days'] = num_good_days
        user_tables.append(table_user)
        print('finished user: ' + str(user))
        print(datetime.datetime.now())

    # concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(user_tables) if user_tables else pd.DataFrame()

In [134]:
# one entry per inferred class: the four activity classes use the activity
# table, the four audio classes use the audio table
behaviors = [act] * 4 + [audio] * 4
behavior_names = ['stationary', 'walking', 'running', 'activity_unknown',
                  'silence', 'voice', 'noise', 'audio_unknown']
print('number of behaviors:', len(behaviors))
print('number of behaviors:', len(behavior_names))
print()

print('the training window is:', TRAINING_WINDOW)
print('blocking previous ' + str(BLOCK_WINDOW) + ' days')

# one output table per class, numbered in list order
for i, (behavior_table, behavior_label) in enumerate(zip(behaviors, behavior_names)):
    behav_table = compute_stability_index(behavior_table, behavior_label, TRAINING_WINDOW, BLOCK_WINDOW, 19)
    behav_table.to_csv(r'table_ema_' + str(TRAINING_WINDOW) + '_' + str(BLOCK_WINDOW) + '_' + str(i) + '.csv')
    print('finished behavior ' + str(i))
    print()
    print()

number of behaviors: 8
number of behaviors: 8

the training window is: 14
blocking previous 0 days
finished user: 0
2020-08-09 15:34:50.941314
finished user: 1
2020-08-09 15:36:04.750210
finished user: 2
2020-08-09 15:37:20.609007
finished user: 3
2020-08-09 15:38:07.210669
finished user: 4
2020-08-09 15:39:24.873366
finished user: 5
2020-08-09 15:39:30.817442
finished user: 7
2020-08-09 15:40:23.717298
finished user: 8
2020-08-09 15:42:28.178966
finished user: 9
2020-08-09 15:42:33.358788
finished user: 10
2020-08-09 15:44:46.441659
finished user: 12
2020-08-09 15:45:48.472738
finished user: 14
2020-08-09 15:47:03.395001
finished user: 15
2020-08-09 15:47:19.722814
finished user: 16
2020-08-09 15:50:06.724297
finished user: 17
2020-08-09 15:51:59.789838
finished user: 18
2020-08-09 15:52:39.448993
finished user: 19
2020-08-09 15:55:37.742632
finished user: 20
2020-08-09 15:55:56.629469
finished user: 22
2020-08-09 15:57:24.278991
finished user: 23
2020-08-09 15:58:27.928179
finished u

In [135]:
def get_full_table(num_days_used, num_days_blocked):
    '''Combine the per-behavior tables into one wide table.

    Reads ``table_ema_<num_days_used>_<num_days_blocked>_<i>.csv`` for every
    position i in the module-level ``behavior_names``. Table 0 is kept whole
    (minus its 'Unnamed: 0' column); from each further table only the columns
    whose name contains that behavior's name are appended side by side.
    '''
    stem = 'table_ema_' + str(num_days_used) + '_' + str(num_days_blocked) + '_'
    merged = pd.read_csv(stem + '0.csv').drop(['Unnamed: 0'], axis=1)
    for position in range(1, len(behavior_names)):
        extra = pd.read_csv(stem + str(position) + '.csv')
        wanted = [c for c in extra.columns if behavior_names[position] in c]
        merged = pd.concat([merged, extra[wanted]], axis=1)
    return merged

# combine the per-behavior tables for the configured windows
table0 = get_full_table(TRAINING_WINDOW,BLOCK_WINDOW)

# save without the index column; the correlation section reloads this file
table0.to_csv(r'table_ema_'+str(TRAINING_WINDOW)+'_'+str(BLOCK_WINDOW)+'.csv', index=False)

In [136]:
# preview the first ten rows of the combined table
table0.head(10)

Unnamed: 0,user,date,stress_level,stationary_med_mean,stationary_med_mean_normal,stationary_med_mean_raw,stationary_avg,stationary_std,stationary_num_good_days,walking_med_mean,walking_med_mean_normal,walking_med_mean_raw,walking_avg,walking_std,walking_num_good_days,running_med_mean,running_med_mean_normal,running_med_mean_raw,running_avg,running_std,running_num_good_days,activity_unknown_med_mean,activity_unknown_med_mean_normal,activity_unknown_med_mean_raw,activity_unknown_avg,activity_unknown_std,activity_unknown_num_good_days,silence_med_mean,silence_med_mean_normal,silence_med_mean_raw,silence_avg,silence_std,silence_num_good_days,voice_med_mean,voice_med_mean_normal,voice_med_mean_raw,voice_avg,voice_std,voice_num_good_days,noise_med_mean,noise_med_mean_normal,noise_med_mean_raw,noise_avg,noise_std,noise_num_good_days,audio_unknown_med_mean,audio_unknown_med_mean_normal,audio_unknown_med_mean_raw,audio_unknown_avg,audio_unknown_std,audio_unknown_num_good_days
0,0,2013-03-26,2.0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0,,,,,,0
1,0,2013-03-27,1.0,,,,58890.0,,1,,,,7906.0,,1,,,,3383.0,,1,,,,1820.0,,1,,,,42343.0,,1,,,,16015.0,,1,,,,13501.0,,1,,,,0.0,,1
2,0,2013-03-28,1.0,12680.482639,0.930577,21.286111,68025.0,12918.840892,2,814.039583,0.931265,7.677778,6376.0,2163.74675,2,905.755556,0.593291,3.665278,2834.0,776.403246,2,264.8375,0.894992,2.469444,1899.0,111.722871,2,13085.585417,0.939034,24.665278,52127.0,13836.665494,2,1405.73125,0.920822,13.280556,15553.0,653.366666,2,902.088889,0.862337,11.88125,11449.5,2901.259123,2,0.0,1.0,0.0,0.0,0.0,2
3,0,2013-03-29,2.857143,10305.543056,0.930577,18.730556,68879.333333,9254.07377,3,1676.808333,0.931265,9.232639,7893.666667,3041.518754,3,481.836806,0.641473,3.238194,2358.0,990.519561,3,471.355556,0.895932,2.6875,2359.333333,801.224896,3,10197.5625,0.950237,24.665278,49931.666667,10496.910085,3,3700.772917,0.869487,13.280556,16982.666667,2518.984782,3,6442.805556,0.862337,14.159028,14602.333333,5833.50035,3,0.0,1.0,0.0,0.0,0.0,3
4,0,2013-03-30,3.0,6964.770833,0.959737,15.157639,71750.5,9490.327339,4,1827.773611,0.85017,8.455208,6857.5,3234.468993,4,495.382292,0.670252,2.619444,1886.0,1243.069588,4,573.293056,0.84221,2.580556,2186.5,739.905174,4,9329.437847,0.94924,22.194444,51580.75,9183.389947,4,3273.903819,0.859816,14.136458,16161.0,2632.628724,4,3863.595139,0.866609,14.256944,14977.75,4821.84974,4,0.0,1.0,0.0,0.0,0.0,4
5,0,2013-04-01,4.5,3081.745139,0.973316,12.997917,72090.5,7583.873041,6,1330.917361,0.878177,8.728472,7146.833333,2552.180982,6,580.851389,0.733779,3.176389,2540.5,2166.483487,6,438.880556,0.84142,2.620139,2086.5,595.716627,6,9799.521528,0.940811,19.723611,50324.0,9428.385991,6,3452.002778,0.8598,14.992361,18689.333333,5256.00482,6,3611.841667,0.878705,14.884028,14902.833333,4771.745652,6,0.0,1.0,0.0,0.0,0.0,6
6,0,2013-04-03,1.5,2573.923958,0.980772,12.169097,73053.875,6666.492833,8,1308.339583,0.877894,7.555556,6519.875,2607.676549,8,573.408681,0.706024,2.747569,2489.625,1840.95526,8,442.921528,0.846214,3.015278,2383.125,795.428624,8,6316.473264,0.93633,19.565625,50229.25,7970.374713,8,3017.242708,0.869634,14.955903,18337.5,4757.269805,8,2848.384375,0.89376,14.679514,15947.875,4734.223045,8,0.0,1.0,0.0,0.0,0.0,8
7,0,2013-04-04,2.0,2509.807986,0.98116,12.056944,73331.111111,6291.152487,9,1224.772222,0.889451,7.564931,6434.777778,2452.581027,9,537.446875,0.706024,2.842361,2537.555556,1728.048835,9,436.088542,0.843517,2.838889,2341.666667,754.378884,9,6100.263194,0.939697,19.534028,51693.444444,8653.369418,9,2798.171528,0.885602,14.215972,17456.666667,5175.468119,9,2847.682292,0.89376,14.256944,15574.0,4568.293117,9,0.0,1.0,0.0,0.0,0.0,9
8,0,2013-04-05,2.666667,2215.375694,0.984323,11.861111,73339.7,5931.417632,10,1213.127083,0.892829,7.458333,6478.3,2316.407806,10,785.331944,0.699031,2.815278,2494.5,1634.899334,10,407.874306,0.833401,3.124306,2493.1,857.424107,10,5854.488194,0.945015,19.327083,52114.6,8266.464368,10,2459.353472,0.897375,13.746528,17569.8,4892.575845,10,2551.590972,0.895264,13.511806,15207.3,4460.401279,10,0.0,1.0,0.0,0.0,0.0,10
9,0,2013-04-06,1.0,2164.720833,0.980519,12.171528,73289.0,5629.548739,11,1219.902083,0.879645,7.433333,6361.727273,2231.289273,11,620.993056,0.724871,3.008333,2554.727273,1563.811631,11,442.426389,0.829346,3.451389,2729.818182,1130.508277,11,5854.488194,0.939148,19.557639,52351.0,7881.352955,11,2938.680556,0.86978,14.836111,17484.909091,4650.036526,11,2569.758333,0.88319,13.877083,15179.545455,4232.509312,11,0.0,1.0,0.0,0.0,0.0,11


# Correlation

In [137]:
def get_p_values(df):
    '''Return the p-values of the pairwise Pearson correlations of df's numeric columns.

    Rows with a missing value in any column are dropped first. The result is a
    square DataFrame indexed and labelled by the numeric column names.
    '''
    numeric = df.dropna()._get_numeric_data()
    names = numeric.columns
    size = numeric.shape[1]
    pvalmat = np.zeros((size, size))
    for row_pos in range(size):
        for col_pos in range(size):
            # pearsonr returns (coefficient, p-value); only the p-value is kept
            pvalmat[row_pos, col_pos] = pearsonr(numeric[names[row_pos]], numeric[names[col_pos]])[1]
    return pd.DataFrame(pvalmat, columns=names, index=names)

In [138]:
def get_filtered_ema_from_table(NUM_GOOD_DAYS_REQUIRED, current_ema):
    '''
    Keep only the EMA rows that have at least "NUM_GOOD_DAYS_REQUIRED" days of
    behavioral data (the bound is inclusive) for every activity and audio class.

    current_ema must contain a "<class>_num_good_days" column for each of the
    eight classes listed below. The input is not modified; the matching rows
    are returned with their original index.
    '''
    behavior_classes = ('stationary', 'walking', 'running', 'activity_unknown',
                        'silence', 'voice', 'noise', 'audio_unknown')

    # a row is kept only if every class reaches the required number of days
    enough_days = np.ones(len(current_ema), dtype=bool)
    for behavior_class in behavior_classes:
        good_days = current_ema[behavior_class + '_num_good_days']
        enough_days &= (good_days >= NUM_GOOD_DAYS_REQUIRED).to_numpy()
    return current_ema.loc[enough_days]

In [163]:
# reload the combined table from disk for the configured windows
table0 = pd.read_csv('table_ema_'+str(TRAINING_WINDOW)+'_'+str(BLOCK_WINDOW)+'.csv')

In [164]:
# keep EMA days backed by at least 7 days of sensing data for every behavior
# (table0 is overwritten with the filtered rows)
print('length before filtering: ', len(table0))
table0 = get_filtered_ema_from_table(7, table0)
print('length after filtering: ', len(table0))

length before filtering:  1256
length after filtering:  1036


In [167]:
# Pearson correlation between the numeric columns. table0 was read from CSV,
# so its 'date' column holds strings; numeric columns are selected explicitly
# because DataFrame.corr in current pandas no longer skips non-numeric columns.
corr_table = table0.select_dtypes(include='number').corr(method='pearson')
# rows: the normalized stability features; column: the stress level
row_index = [r for r in corr_table.index if "med_mean_normal" in r]
col_index = [c for c in corr_table.columns if 'level' in c]
corr_table_small = corr_table.loc[row_index, col_index]
corr_table_small

Unnamed: 0,stress_level
stationary_med_mean_normal,-0.114894
walking_med_mean_normal,0.005937
running_med_mean_normal,-0.070093
activity_unknown_med_mean_normal,0.069692
silence_med_mean_normal,-0.134333
voice_med_mean_normal,0.005786
noise_med_mean_normal,0.002229
audio_unknown_med_mean_normal,


In [168]:
# p-values for the same feature/stress pairs; row_index and col_index come
# from the correlation cell
p_table = get_p_values(table0)
p_table_small = p_table.loc[row_index, col_index]
p_table_small

Unnamed: 0,stress_level
stationary_med_mean_normal,0.000211
walking_med_mean_normal,0.848624
running_med_mean_normal,0.024063
activity_unknown_med_mean_normal,0.024884
silence_med_mean_normal,1.4e-05
voice_med_mean_normal,0.852425
noise_med_mean_normal,0.942871
audio_unknown_med_mean_normal,


In [82]:
# 7_0
# NOTE(review): the label presumably means TRAINING_WINDOW=7, BLOCK_WINDOW=0 -
# confirm. This cell has an earlier execution count than the cells above it,
# so its saved output comes from an earlier run.
# Numeric columns are selected explicitly because DataFrame.corr in current
# pandas no longer skips non-numeric columns such as the 'date' strings.
corr_table = table0.select_dtypes(include='number').corr(method='pearson')
row_index = [r for r in corr_table.index if "med_mean_normal" in r]
col_index = [c for c in corr_table.columns if 'level' in c]
corr_table_small = corr_table.loc[row_index, col_index]
corr_table_small

Unnamed: 0,stress_level
stationary_med_mean_normal,-0.113212
walking_med_mean_normal,-0.007719
running_med_mean_normal,-0.068977
activity_unknown_med_mean_normal,-0.029769
silence_med_mean_normal,-0.118845
voice_med_mean_normal,-0.05366
noise_med_mean_normal,-0.038617
audio_unknown_med_mean_normal,


In [83]:
# 7_0
# NOTE(review): presumably the p-values for TRAINING_WINDOW=7, BLOCK_WINDOW=0 -
# confirm; the saved output comes from an earlier run (lower execution count).
p_table = get_p_values(table0)
p_table_small = p_table.loc[row_index, col_index]
p_table_small

Unnamed: 0,stress_level
stationary_med_mean_normal,0.000857
walking_med_mean_normal,0.820751
running_med_mean_normal,0.042665
activity_unknown_med_mean_normal,0.382139
silence_med_mean_normal,0.000464
voice_med_mean_normal,0.114997
noise_med_mean_normal,0.25684
audio_unknown_med_mean_normal,
