In [1]:
import pandas as pd
import numpy as np

In [2]:
def show_full_data(data, row_size=None, column_size=None, col_width=-1):
    """Display ``data`` with pandas' row/column/width truncation temporarily overridden.

    The display options are changed only for the duration of the call
    (via ``pd.option_context``) and are restored afterwards.

    Parameters
    ----------
    data : object
        Whatever should be rendered; it is handed to ``display`` unchanged.
    row_size : int or None
        Value for ``display.max_rows``; ``None`` means no limit.
    column_size : int or None
        Value for ``display.max_columns``; ``None`` means no limit.
    col_width : int or None
        Value for ``display.max_colwidth``. ``None`` or any negative integer
        (the default ``-1`` is kept for backward compatibility) means no limit.
    """
    # A negative integer was the old spelling of "unlimited" for
    # display.max_colwidth. pandas deprecated it in 1.0 in favour of None and
    # newer releases reject it, so translate it before setting the option.
    if isinstance(col_width, int) and col_width < 0:
        col_width = None
    with pd.option_context(
        'display.max_rows', row_size,
        'display.max_columns', column_size,
        'display.max_colwidth', col_width,
    ):
        # `display` is the name IPython/Jupyter makes available in notebook cells.
        display(data)

In [3]:
def resample_aggregations(columns):
    """Build the column -> aggregation mapping used when resampling.

    The rule is decided by substring, checked in this order:
    names containing 'level' are averaged, names containing 'total' take
    the maximum, and everything else is summed.

    Parameters
    ----------
    columns : iterable of str
        Column names to assign an aggregation to.

    Returns
    -------
    dict
        Maps each column name to ``np.mean``, ``np.max`` or ``np.sum``.
    """
    def _pick(name):
        # 'level' is tested first, so it wins for a name containing both substrings.
        if 'level' in name:
            return np.mean
        if 'total' in name:
            return np.max
        return np.sum

    return {name: _pick(name) for name in columns}

In [4]:
# Read the prepared u00 CSV into `df`. This raw frame feeds the label
# extraction and column selection cells that follow.
df = pd.read_csv('prepared_user_data_seconds/u00_data.csv')

In [5]:
# Preview the first rows of the raw frame without pandas hiding any columns.
show_full_data(df.head())

Unnamed: 0,timestamp,hour_of_day,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,sms,call_log,call_duration,deadlines,running_apps,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,2013-03-27 04:00:01,4.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1,
1,2013-03-27 04:00:02,4.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1,
2,2013-03-27 04:00:03,4.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1,
3,2013-03-27 04:00:04,4.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1,
4,2013-03-27 04:00:05,4.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1,


In [6]:
# Keep only the rows that carry a STRESSED value, reduced to the label column
# and indexed by the parsed timestamp.
labels = (
    df.loc[df['STRESSED'].notnull(), ['timestamp', 'STRESSED']]
    .set_index('timestamp')
)
labels.index = pd.to_datetime(labels.index)

In [7]:
# Feature frame for resampling: 'hour_of_day' and 'STRESSED' are removed here;
# the STRESSED labels are merged back in by resample_data.
df_chosen = df.drop(columns=['hour_of_day', 'STRESSED'])

In [8]:
# Preview the feature frame to confirm the two columns were dropped.
show_full_data(df_chosen.head())

Unnamed: 0,timestamp,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,sms,call_log,call_duration,deadlines,running_apps,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3
0,2013-03-27 04:00:01,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1
1,2013-03-27 04:00:02,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1
2,2013-03-27 04:00:03,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1
3,2013-03-27 04:00:04,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1
4,2013-03-27 04:00:05,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,0,,1,0,0,0,0,0,0,1


In [9]:
def resample_data(df, labels, res_range='10min'):
    """Resample a timestamped feature frame and attach labels to the windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'timestamp' column that ``pd.to_datetime`` can parse.
        The caller's frame is not modified.
    labels : pandas.DataFrame
        Label frame merged on its index; expected to be datetime-indexed
        (as produced by the label-extraction cell) -- TODO confirm for other callers.
    res_range : str
        Window size, used both as the resampling rule and, converted with
        ``pd.Timedelta``, as the merge tolerance.

    Returns
    -------
    pandas.DataFrame
        One row per resampled window, with the columns of ``labels`` appended.
    """
    indexed = df.set_index('timestamp')
    indexed.index = pd.to_datetime(indexed.index)

    # Per-column aggregation rules are chosen from the column names.
    how = resample_aggregations(list(indexed.columns))
    windows = indexed.resample(res_range).agg(how)

    # No `direction` is passed, so pandas' default merge_asof matching applies;
    # a window only receives a label that lies within one window length of it.
    return pd.merge_asof(
        windows,
        labels,
        left_index=True,
        right_index=True,
        tolerance=pd.Timedelta(res_range),
    )

In [20]:
# Aggregate the features into 10-minute windows and attach the STRESSED labels.
a = resample_data(df_chosen, labels, res_range='10min')

In [21]:
# Preview the first resampled windows with every column visible.
show_full_data(a.head())

Unnamed: 0_level_0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,sms,call_log,call_duration,deadlines,running_apps,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2013-03-27 04:00:00,0.0,,,,,,,,-77.0,18.384776,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,12.0,599,0,0,0,274,0,185,140,
2013-03-27 04:10:00,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,600,0,0,0,557,1,42,0,
2013-03-27 04:20:00,0.0,,,,,,,,-71.5,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,14.0,600,0,0,0,457,0,143,0,
2013-03-27 04:30:00,0.0,,,,,,,,,,,,,,0.0,315.0,208.0,0.0,0.0,0.0,0,0.0,600,0,0,0,564,1,35,0,
2013-03-27 04:40:00,200.0,,,,,,,,-62.0,,1.0,0.0,1.0,0.0,0.0,600.0,600.0,1.0,0.0,0.0,0,15.0,600,0,0,0,127,94,379,0,


In [22]:
# Shape of the subset of resampled windows that received a STRESSED label.
a[a.STRESSED.notnull()].shape

(76, 31)

In [23]:
# NOTE(review): this rebinds `df` from the raw frame to the resampled frame.
# After it runs, the earlier cells that read `df` (label extraction, df_chosen)
# cannot be re-run: the resampled frame has 'timestamp' as its index and no
# 'hour_of_day' column. A distinct name would keep the notebook re-runnable.
df = a

In [53]:
# Load the combined samples file.
# NOTE(review): the path says "30min", but the preview rows shown further down
# are 10 minutes apart -- confirm which resampling this file actually holds.
uza = pd.read_csv('combined_samples/combined_data_all_30min.csv')

In [54]:
# Row and column counts of the combined data.
uza.shape

(840181, 34)

In [55]:
# Preview the first two rows of the combined data with every column visible.
show_full_data(uza.head(2))

Unnamed: 0.1,Unnamed: 0,timestamp,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,sms,call_log,call_duration,deadlines,running_apps,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED,hour_of_day
0,0,2013-03-27 04:00:00,0.0,,,,,,,,-77.0,18.384776,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,599,0,0,0,274,0,185,140.0,,4
1,1,2013-03-27 04:10:00,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600,0,0,0,557,1,42,0.0,,4


In [56]:
# Shape of the subset of combined rows that have a STRESSED value.
uza[uza.STRESSED.notnull()].shape

(9362, 34)

In [57]:
# Shape of the subset of combined rows where STRESSED equals 1.
uza[uza.STRESSED == 1].shape

(6437, 34)

In [58]:
# Shape of the subset of combined rows where STRESSED equals 0.
uza[uza.STRESSED == 0].shape

(2925, 34)

In [24]:
# Collect the index values of the rows whose STRESSED value is present.
label_indexes = list(df['STRESSED'].dropna().index)

In [26]:
# Scratch check: how pandas parses the string '2h' as a Timedelta.
pd.Timedelta('2h')

Timedelta('0 days 02:00:00')

In [30]:
# Is the gap between the first two labelled index values longer than two hours?
label_indexes[1] - label_indexes[0] > pd.Timedelta('2h')

True