In [1]:
import json
import os

import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

In [2]:
def show_full_data(data, row_size=None, column_size=None, col_width=-1):
    """Display `data` without pandas truncating rows, columns or cell contents.

    Parameters
    ----------
    data : object
        Anything `display` can render (typically a DataFrame or Series).
    row_size : int or None
        Value for `display.max_rows`; None means no limit.
    column_size : int or None
        Value for `display.max_columns`; None means no limit.
    col_width : int or None
        Value for `display.max_colwidth`. None or any negative number
        (including the legacy default of -1) means no limit.
    """
    # Recent pandas versions only accept a non-negative int or None for
    # display.max_colwidth, so the legacy "-1 = unlimited" value is translated.
    if col_width is not None and col_width < 0:
        col_width = None
    with pd.option_context('display.max_rows', row_size,
                           'display.max_columns', column_size,
                           'display.max_colwidth', col_width):
        display(data)

In [3]:
# Dataset directory (relative to this notebook).
# Keep the trailing '/': file paths below are built by plain string concatenation.
dir_loc = '../../../student-life-dataset/dataset/'

In [4]:
# Collecting all student codes from activity folder (which represents all students).
# Only entries named "activity_<code>.csv" are used, so stray files in the folder
# (hidden files, checkpoints, ...) no longer raise an IndexError while parsing.
user_codes = [
    # Chooses the string before "." and after "_"
    file_name.split('.')[0].split('_')[1]
    for file_name in sorted(os.listdir(dir_loc + 'sensing/activity/'))
    if file_name.startswith('activity_') and file_name.endswith('.csv')
]

# Sensing Data

## Activity

In [5]:
def get_activity(user):
    """Load the activity inferences of `user` as a one-row-per-second frame.

    The CSV's two columns are renamed to `timestamp` and `activity_inference`.
    Duplicate timestamps are collapsed to the most frequent inference, the
    epoch seconds are converted to datetimes, and the series is re-sampled to
    a 1-second grid where missing seconds take the next observed value.
    """
    csv_path = dir_loc + 'sensing/activity/activity_' + user + '.csv'
    raw = pd.read_csv(csv_path)
    raw.columns = ['timestamp', 'activity_inference']

    # One value per timestamp: the mode of all inferences sharing it.
    per_stamp = (
        raw.groupby("timestamp")['activity_inference']
        .apply(lambda values: values.mode()[0])
        .reset_index()
    )
    per_stamp['timestamp'] = pd.to_datetime(per_stamp['timestamp'], unit='s')

    # Regular 1-second grid; gaps are back-filled from the following sample.
    return per_stamp.set_index('timestamp').asfreq('s', method='bfill')

## Audio

In [6]:
def get_audio(user):
    """Load the audio inferences of `user` as a one-row-per-second frame.

    The CSV's two columns are renamed to `timestamp` and `audio_inference`.
    Duplicate timestamps are collapsed to the most frequent inference, the
    epoch seconds are converted to datetimes, and the series is re-sampled to
    a 1-second grid where missing seconds take the next observed value.
    """
    csv_path = dir_loc + 'sensing/audio/audio_' + user + '.csv'
    raw = pd.read_csv(csv_path)
    raw.columns = ['timestamp', 'audio_inference']

    # One value per timestamp: the mode of all inferences sharing it.
    per_stamp = (
        raw.groupby("timestamp")['audio_inference']
        .apply(lambda values: values.mode()[0])
        .reset_index()
    )
    per_stamp['timestamp'] = pd.to_datetime(per_stamp['timestamp'], unit='s')

    # Regular 1-second grid; gaps are back-filled from the following sample.
    return per_stamp.set_index('timestamp').asfreq('s', method='bfill')

## Conversation

In [7]:
def get_conversation(user):
    """Load the conversation intervals of `user`.

    Returns a frame with the columns `start_timestamp` and `end_timestamp`,
    both converted from epoch seconds to datetimes.
    """
    csv_path = dir_loc + 'sensing/conversation/conversation_' + user + '.csv'
    intervals = pd.read_csv(csv_path)
    intervals.columns = ['start_timestamp', 'end_timestamp']
    for bound in ('start_timestamp', 'end_timestamp'):
        intervals[bound] = pd.to_datetime(intervals[bound], unit='s')
    return intervals

## GPS

In [8]:
def get_gps(user):
    """Load the GPS samples of `user`, with `time` converted from epoch seconds."""
    csv_path = dir_loc + 'sensing/gps/gps_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    samples = pd.read_csv(csv_path, index_col=False)
    samples['time'] = pd.to_datetime(samples['time'], unit='s')
    return samples

## Bluetooth

In [9]:
def get_bluetooth(user):
    """Load the Bluetooth scans of `user`, with `time` converted from epoch seconds."""
    csv_path = dir_loc + 'sensing/bluetooth/bt_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    scans = pd.read_csv(csv_path, index_col=False)
    scans['time'] = pd.to_datetime(scans['time'], unit='s')
    return scans
    

## Wi-Fi

In [10]:
def get_wifi(user):
    """Load the Wi-Fi scans of `user`, with `time` converted from epoch seconds."""
    csv_path = dir_loc + 'sensing/wifi/wifi_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    scans = pd.read_csv(csv_path, index_col=False)
    scans['time'] = pd.to_datetime(scans['time'], unit='s')
    return scans

## Wi-Fi Location

In [11]:
def get_wifi_loc(user):
    """Load the Wi-Fi location records of `user`, with `time` converted from epoch seconds."""
    csv_path = dir_loc + 'sensing/wifi_location/wifi_location_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    records = pd.read_csv(csv_path, index_col=False)
    records['time'] = pd.to_datetime(records['time'], unit='s')
    return records

## Dark

In [12]:
def get_dark(user):
    """Load the dark intervals of `user`; `start` and `end` become datetimes."""
    csv_path = dir_loc + 'sensing/dark/dark_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    intervals = pd.read_csv(csv_path, index_col=False)
    for bound in ('start', 'end'):
        intervals[bound] = pd.to_datetime(intervals[bound], unit='s')
    return intervals

## Phone Charge

In [13]:
def get_phone_charge(user):
    """Load the phone-charge intervals of `user`; `start` and `end` become datetimes."""
    csv_path = dir_loc + 'sensing/phonecharge/phonecharge_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    intervals = pd.read_csv(csv_path, index_col=False)
    for bound in ('start', 'end'):
        intervals[bound] = pd.to_datetime(intervals[bound], unit='s')
    return intervals

## Phone Lock

In [14]:
def get_phone_lock(user):
    """Load the phone-lock intervals of `user`; `start` and `end` become datetimes."""
    csv_path = dir_loc + 'sensing/phonelock/phonelock_' + user + '.csv'
    # index_col=False: never use the first column as the index.
    intervals = pd.read_csv(csv_path, index_col=False)
    for bound in ('start', 'end'):
        intervals[bound] = pd.to_datetime(intervals[bound], unit='s')
    return intervals

## Merge Sensor Data

In [15]:
# User code whose sensing and EMA data are loaded in the cells below.
u = 'u00'

In [16]:
# Load every sensing stream of the selected user `u` into its own frame.
# Per-second series (indexed by timestamp):
activity = get_activity(u)
audio = get_audio(u)
# Interval table with start_timestamp / end_timestamp columns:
conversation = get_conversation(u)
# Sample tables with a `time` column:
gps = get_gps(u)
bluetooth = get_bluetooth(u)
wifi = get_wifi(u)
wifi_loc = get_wifi_loc(u)
# Interval tables with `start` / `end` columns:
dark = get_dark(u)
phone_charge = get_phone_charge(u)
phone_lock = get_phone_lock(u)

In [17]:
def _flag_intervals(timestamps, intervals):
    """Return 1.0 where a timestamp lies inside any interval, NaN elsewhere.

    `intervals` is read by position: column 0 holds the start and column 1
    the end of each interval; both bounds are inclusive.
    """
    inside = np.zeros(len(timestamps), dtype=bool)
    for start, end in zip(intervals.iloc[:, 0], intervals.iloc[:, 1]):
        inside |= ((timestamps >= start) & (timestamps <= end)).to_numpy()
    return np.where(inside, 1.0, np.nan)


def _summarise_scans(scans, bands):
    """Aggregate signal scans (`time`, `level` columns) to one row per scan time.

    `bands` maps an output column name to a `(low, high)` pair; a device is
    counted in that column when `low <= level < high` (`high=None` means no
    upper bound). The result also holds the number of devices per scan, the
    rounded mean level and the level standard deviation.
    """
    level = scans['level']
    per_device = pd.DataFrame({'timestamp': scans['time'], 'level': level})
    for name, (low, high) in bands.items():
        in_band = level >= low
        if high is not None:
            in_band = in_band & (level < high)
        per_device[name] = in_band.astype(int)

    # sort=False keeps scan times in order of first appearance.
    grouped = per_device.groupby('timestamp', sort=False)
    summary = grouped[list(bands)].sum()
    summary['total_devices_around'] = grouped.size()
    summary['level_avg'] = grouped['level'].mean().round()
    summary['level_std'] = grouped['level'].std()
    summary = summary.reset_index()
    # Alphabetical column order, matching the column order of the merged
    # frame shown in this notebook's recorded output.
    return summary[sorted(summary.columns)]


# Signal-strength bands (inclusive lower bound, exclusive upper bound).
BT_BANDS = {
    'total_nearer': (-70, None),
    'total_near': (-80, -70),
    'total_far': (-90, -80),
    'total_farther': (-100, -90),
}
WIFI_BANDS = {
    'total_nearer': (-60, None),
    'total_near': (-80, -60),
    'total_far': (-100, -80),
}

# activity - audio merge
df = pd.merge(activity, audio, left_index=True, right_index=True, how='outer')
df = df.reset_index()

# add conversation
df['conversation'] = _flag_intervals(df['timestamp'], conversation)

# add gps
# A prefixed copy is merged so `gps` itself is not modified and the cell can be
# re-run. Merging on a shared 'timestamp' key keeps the timestamp of rows that
# exist only in the sensor table (with differently named keys they became NaT).
gps_features = gps.add_prefix('gps_').rename(columns={'gps_time': 'timestamp'})
df = pd.merge(df, gps_features, on='timestamp', how='outer')

# add bluetooth
# Built with one groupby instead of DataFrame.append in a loop (removed in
# pandas 2.0, and quadratic in the number of scans).
bluetooth_new = _summarise_scans(bluetooth, BT_BANDS).add_prefix('bt_')
df = pd.merge(df, bluetooth_new.rename(columns={'bt_timestamp': 'timestamp'}),
              on='timestamp', how='outer')

# add wifi
wifi_new = _summarise_scans(wifi, WIFI_BANDS).add_prefix('wifi_')
df = pd.merge(df, wifi_new.rename(columns={'wifi_timestamp': 'timestamp'}),
              on='timestamp', how='outer')

# add dark
df['phone_in_dark'] = _flag_intervals(df['timestamp'], dark)

# phone charge
df['phone_charging'] = _flag_intervals(df['timestamp'], phone_charge)

# phone locked
df['phone_locked'] = _flag_intervals(df['timestamp'], phone_lock)

In [43]:
# Preview the first five merged rows with every column visible.
show_full_data(df.head(5))

Unnamed: 0,timestamp,activity_inference,audio_inference,conversation,gps_provider,gps_network_type,gps_accuracy,gps_latitude,gps_longitude,gps_altitude,gps_bearing,gps_speed,gps_travelstate,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked
0,2013-03-27 04:00:01,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2013-03-27 04:00:02,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2013-03-27 04:00:03,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2013-03-27 04:00:04,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2013-03-27 04:00:05,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [19]:
# Keep an independent copy of the merged sensing frame under a descriptive name.
sensing_data = df.copy()

# EMA

In [20]:
# Load the EMA questionnaire definitions.
# The encoding is stated explicitly so decoding does not depend on the platform
# default; `json` is imported in the notebook's import cell.
with open(dir_loc + 'EMA/EMA_definition.json', encoding='utf-8') as f:
    definitions = json.load(f)

In [21]:
def ema(user, typ, cols):
    """Load the EMA responses of type `typ` for `user`.

    The `null` and `location` columns are removed when present, and rows with
    a missing value in any of the columns listed in `cols` are dropped.
    """
    json_path = dir_loc + 'EMA/response/' + typ + '/' + typ + '_' + user + '.json'
    responses = pd.read_json(json_path)
    for unwanted in ('null', 'location'):
        if unwanted in responses.columns:
            responses = responses.drop(columns=unwanted)
    return responses.dropna(subset=cols)

In [22]:
# Load each EMA questionnaire for user `u`. The list passed last names the
# columns that must be present: rows missing any of them are dropped
# (an empty list keeps every row).
social = ema(u, 'Social', ['number'])
class1 = ema(u, 'Class', ['due', 'experience', 'hours'])
class2 = ema(u, 'Class 2', ['resp_time'])
stress = ema(u, 'Stress', ['level'])
sleep = ema(u, 'Sleep', ['hour'])
activity_ema = ema(u, 'Activity', ['Social2']) # will not be used, because a lot of data is missing.
behavior = ema(u, 'Behavior', [])
exercise = ema(u, 'Exercise', [])
study_spaces = ema(u, 'Study Spaces', ['place'])
event = ema(u, 'Events', ['nevent', 'pevent'])
mood = ema(u, 'Mood', [])
mood1 = ema(u, 'Mood 1', ['tomorrow'])
mood2 = ema(u, 'Mood 2', ['how'])

In [51]:
# Preview the recoded Exercise responses.
# NOTE(review): org_exercise is defined in a later cell, so this cell fails on a
# fresh top-to-bottom run unless that definition has been executed first.
org_exercise(ema(u, 'Exercise', []))

Unnamed: 0_level_0,exercise_exercise,exercise_have,exercise_schedule,exercise_walk
resp_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-04-03 03:43:26,4,1.0,,2
2013-04-01 14:55:38,3,2.0,1.0,2
2013-04-01 17:12:56,1,2.0,1.0,1
2013-04-02 04:03:53,1,1.0,,1
2013-04-04 00:28:03,1,2.0,1.0,2
2013-04-04 02:03:44,1,2.0,1.0,3
2013-04-06 02:19:21,1,2.0,1.0,1
2013-04-05 02:39:51,4,1.0,2.0,3
2013-04-05 20:37:51,4,1.0,,3
2013-04-07 18:35:08,3,2.0,2.0,3


In [23]:
def org_social(df):
    """Sum the Social responses per calendar day and prefix columns with `social_`."""
    daily_totals = df.set_index('resp_time').resample('D').sum()
    daily_totals.columns = ['social_' + label for label in daily_totals.columns]
    return daily_totals

In [24]:
# .apply(lambda x: 0 if stats.mode(x)[0].shape[0] == 0 else np.asscalar(stats.mode(x)[0]))

In [25]:
def org_class1(df):
    """Recode the Class responses and average them per calendar day.

    `course_id` is dropped, `experience` codes are re-ordered
    (1->3, 2->5, 3->4, 4->2, 5->1), `due` is mapped to 1/0 (2 becomes 0),
    days without any response are removed and columns get the `class1_` prefix.
    """
    answers = df.drop(columns=['course_id']).set_index('resp_time')
    # replace values to make the results in order
    answers['experience'] = answers['experience'].replace(
        to_replace=[1, 2, 3, 4, 5], value=[3, 5, 4, 2, 1])
    answers['due'] = answers['due'].replace(to_replace=[1, 2], value=[1, 0])

    daily_mean = answers.resample('D').mean().dropna(how='all')
    daily_mean.columns = ['class1_' + label for label in daily_mean.columns]
    return daily_mean

In [26]:
def org_class2(df):
    """Reverse the Class 2 scales and average the responses per week.

    `challenge` and `effort` (codes 1-6) and `grade` (codes 1-8) are reversed,
    weeks without any response are removed and columns get the `class2_` prefix.
    """
    answers = df.set_index('resp_time')
    # (column, highest code): code k becomes highest + 1 - k for k in 1..highest.
    for column, top in (('challenge', 6), ('effort', 6), ('grade', 8)):
        codes = list(range(1, top + 1))
        answers[column] = answers[column].replace(codes, codes[::-1])

    weekly_mean = answers.resample('W').mean().dropna(how='all')
    weekly_mean.columns = ['class2_' + label for label in weekly_mean.columns]
    return weekly_mean

In [27]:
def org_stress(df):
    """Binarise the Stress responses into a `STRESSED` column.

    Levels 1-3 become 1 and levels 4-5 become 0; the frame is indexed by
    `resp_time`.
    """
    labelled = df.set_index('resp_time')
    labelled['level'] = labelled['level'].replace([1, 2, 3], 1).replace([4, 5], 0)
    return labelled.rename(columns={'level': 'STRESSED'})

In [28]:
def org_sleep(df):
    """Reverse the sleep `rate` scale (1-4) and average the responses per day.

    Days without a response are kept as NaN rows; columns get the `sleep_`
    prefix.
    """
    answers = df.set_index('resp_time')
    answers['rate'] = answers['rate'].replace(to_replace=[1, 2, 3, 4], value=[4, 3, 2, 1])
    daily_mean = answers.resample('D').mean()
    daily_mean.columns = ['sleep_' + label for label in daily_mean.columns]
    return daily_mean

In [29]:
def org_behavior(df):
    """Index the Behavior responses by `resp_time`, drop all-empty rows and add the `behavior_` prefix."""
    answers = df.set_index('resp_time').dropna(how='all')
    answers.columns = ['behavior_' + label for label in answers.columns]
    return answers

In [30]:
def org_exercise(df):
    """Recode the Exercise responses.

    `have` and `schedule` are mapped to 1/0 (2 becomes 0), all-empty rows are
    dropped and columns get the `exercise_` prefix; the frame is indexed by
    `resp_time`.
    """
    answers = df.set_index('resp_time')
    for column in ('have', 'schedule'):
        answers[column] = answers[column].replace(to_replace=[1, 2], value=[1, 0])
    answers = answers.dropna(how='all')
    answers.columns = ['exercise_' + label for label in answers.columns]
    return answers

In [31]:
def org_study_spaces(df):
    """Drop `place`, index by `resp_time`, remove all-empty rows and add the `studyspace_` prefix."""
    answers = (
        df.drop(columns=['place'])
        .set_index('resp_time')
        .dropna(how='all')
    )
    answers.columns = ['studyspace_' + label for label in answers.columns]
    return answers

In [32]:
def org_event(df):
    """Drop `nevent`/`pevent`, index by `resp_time`, remove all-empty rows and add the `event_` prefix."""
    answers = (
        df.drop(columns=['nevent', 'pevent'])
        .set_index('resp_time')
        .dropna(how='all')
    )
    answers.columns = ['event_' + label for label in answers.columns]
    return answers

In [33]:
def org_mood(df):
    """Recode the Mood responses.

    `happyornot` and `sadornot` are mapped to 1/0 (2 becomes 0); where they
    are missing they are filled from `happy` and `sad` respectively. The frame
    is indexed by `resp_time` and columns get the `mood_` prefix.
    """
    moods = df.set_index('resp_time')
    # An earlier variant also binarised `happy`/`sad` (1 -> 0, 2-4 -> 1) before
    # the fill; that step is disabled, so the raw scale values are used.
    for flag, scale in (('happyornot', 'happy'), ('sadornot', 'sad')):
        recoded = moods[flag].replace(to_replace=[1, 2], value=[1, 0])
        moods[flag] = recoded.fillna(moods[scale])
    moods.columns = ['mood_' + label for label in moods.columns]
    return moods

In [34]:
# Index Mood 1 / Mood 2 by response time and prefix their columns.
# Each step is guarded by the presence of the `resp_time` column so that
# re-running this cell neither raises a KeyError nor prefixes the columns twice.
if 'resp_time' in mood1.columns:
    mood1 = mood1.set_index('resp_time')
    mood1.columns = ['mood1_' + i for i in mood1.columns]
if 'resp_time' in mood2.columns:
    mood2 = mood2.set_index('resp_time')
    mood2.columns = ['mood2_' + i for i in mood2.columns]

In [49]:
# Inspect the raw Mood responses (before org_mood is applied).
mood

Unnamed: 0,happy,happyornot,resp_time,sad,sadornot
0,1,2.0,2013-04-25 06:09:55,3,1.0
1,2,1.0,2013-04-25 02:40:33,4,1.0
2,2,1.0,2013-05-16 15:43:07,1,1.0
3,3,1.0,2013-05-15 15:24:22,1,2.0
4,2,1.0,2013-05-17 14:59:10,1,
5,1,,2013-05-21 02:41:36,1,
6,1,2.0,2013-05-22 03:57:58,1,2.0
7,1,1.0,2013-05-23 02:40:11,1,1.0
8,1,,2013-08-07 01:47:19,1,
9,1,,2013-08-10 03:44:09,1,


# Education

In [36]:
# Load the deadlines table and transpose it, so the CSV's columns become rows.
deadlines = pd.read_csv(dir_loc + 'education/deadlines.csv').T

In [37]:
# Preview the transposed table; its first row holds the user ids.
deadlines.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
uid,u01,u02,u03,u04,u05,u07,u08,u09,u10,u12,...,u47,u49,u50,u51,u52,u53,u54,u57,u58,u59
2013-03-27,0,0,0,0,0,0,0,0,0,1,...,0,2,0,0,0,0,0,0,0,0
2013-03-28,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2013-03-29,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,1,0,0,1
2013-03-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Promote the 'uid' row (the first row after the transpose) to column labels.
# Guarded by the presence of that row so that re-running this cell does not
# strip a further data row and overwrite the labels with deadline counts.
if 'uid' in deadlines.index:
    deadlines_cols = deadlines.loc['uid']
    deadlines = deadlines.drop(index='uid')
    deadlines.columns = deadlines_cols
# Keep only the dates before 2013-06-06; the index labels are ISO date strings,
# so plain string comparison orders them chronologically.
deadlines = deadlines[deadlines.index < '2013-06-06']

In [39]:
# Spot-check the binarised stress labels for user 'u10'.
org_stress(ema('u10', 'Stress', ['level']))

Unnamed: 0_level_0,STRESSED
resp_time,Unnamed: 1_level_1
2013-03-30 02:07:48,1.0
2013-04-01 07:19:02,1.0
2013-03-31 22:57:17,1.0
2013-04-02 02:32:04,0.0
2013-04-02 18:56:33,1.0
2013-03-28 00:01:02,1.0
2013-03-30 02:07:51,1.0
2013-03-30 02:07:45,1.0
2013-03-30 02:07:49,0.0
2013-03-30 04:28:19,1.0


In [40]:
# Inspect the merged sensing frame.
df

Unnamed: 0,timestamp,activity_inference,audio_inference,conversation,gps_provider,gps_network_type,gps_accuracy,gps_latitude,gps_longitude,gps_altitude,...,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked
0,2013-03-27 04:00:01,0.0,,,,,,,,,...,,,,,,,,,,
1,2013-03-27 04:00:02,0.0,,,,,,,,,...,,,,,,,,,,
2,2013-03-27 04:00:03,0.0,,,,,,,,,...,,,,,,,,,,
3,2013-03-27 04:00:04,0.0,,,,,,,,,...,,,,,,,,,,
4,2013-03-27 04:00:05,0.0,,,,,,,,,...,,,,,,,,,,
5,2013-03-27 04:00:06,0.0,,,,,,,,,...,,,,,,,,,,
6,2013-03-27 04:00:07,0.0,,,,,,,,,...,,,,,,,,,,
7,2013-03-27 04:00:08,0.0,,,,,,,,,...,,,,,,,,,,
8,2013-03-27 04:00:09,0.0,,,,,,,,,...,,,,,,,,,,
9,2013-03-27 04:00:10,0.0,,,,,,,,,...,,,,,,,,,,
