# Summary of Dataframes
- `df_raw`: original dataframe, exactly the same as the original `.csv` file
- `df_processed`: dataframe without the columns that are not used in later experiments
- `df_processed_indoor`: `df_processed` filtered to votes within SDE only, i.e. rows that have both a room and sensing readings
- `df_processed_indoor_time`: `time` converted to Singapore time and replaced by cyclical (sine/cosine) features for the time of day and the day of the week


In [1]:
import time
import numpy as np
import pandas as pd
from datetime import datetime
from pytz import all_timezones

# helper functions
from cozie_functions import *

# Load raw file and preprocess features

In [2]:
# Date stamp used as the prefix of the raw input file name
raw_data_date = "2019-11-15"
df_raw = pd.read_csv("{}_cozie_full_masked.csv".format(raw_data_date))
raw_features = df_raw.columns.values

# Overview of what was loaded: dimensions, column names and the first rows
print("Raw dataset dimension: {}".format(df_raw.shape))
print(raw_features)
df_raw.head(10)


Raw dataset dimension: (4378, 25)
['Unnamed: 0' 'index' 'time' 'clothing' 'comfort_cozie' 'heartRate_cozie'
 'lat_cozie' 'light_cozie' 'lon_cozie' 'noise_cozie' 'responseSpeed_cozie'
 'thermal_cozie' 'user_id' 'Floor' 'Latitude' 'Longitude' 'Space_id'
 'room' 'co2_sensing' 'humidity_sensing' 'light_sensing' 'noise_sensing'
 'temperature_sensing' 'voc_sensing' 'temperature_mbient']


Unnamed: 0.1,Unnamed: 0,index,time,clothing,comfort_cozie,heartRate_cozie,lat_cozie,light_cozie,lon_cozie,noise_cozie,...,Longitude,Space_id,room,co2_sensing,humidity_sensing,light_sensing,noise_sensing,temperature_sensing,voc_sensing,temperature_mbient
0,0,0,2019-09-28 09:07:28.561000+00:00,,,79.0,,,,,...,,,,,,,,,,
1,1,1,2019-09-29 09:07:30.131000+00:00,,,79.0,,,,,...,,,,,,,,,,
2,2,2,2019-09-30 00:55:46.387000+00:00,10.0,9.0,102.0,,10.0,,10.0,...,,,,,,,,,,
3,3,3,2019-09-30 01:04:23.821000+00:00,10.0,9.0,91.0,,10.0,,10.0,...,,,,,,,,,,
4,4,4,2019-09-30 03:07:36.976000+00:00,10.0,,93.0,,,,10.0,...,,,,,,,,,,
5,5,5,2019-09-30 03:07:42.184000+00:00,10.0,10.0,99.0,,10.0,,10.0,...,,,,,,,,,,
6,6,6,2019-09-30 03:30:56.306000+00:00,10.0,10.0,70.0,,10.0,,10.0,...,,,,,,,,,,
7,7,7,2019-09-30 04:00:57.716000+00:00,10.0,9.0,66.0,,10.0,,10.0,...,,,,,,,,,,
8,8,8,2019-09-30 04:08:23.750000+00:00,9.0,10.0,68.0,,10.0,,10.0,...,103.771051,45.0,0.0,732.0,50.8191,158.0,47.0,26.1212,66.0,
9,9,9,2019-09-30 04:37:53.719000+00:00,9.0,10.0,76.0,,10.0,,10.0,...,,,,,,,,,,


## Drop features not needed for experiments

In [3]:
# Work on an independent copy so that `df_raw` keeps mirroring the original file
df_processed = df_raw.copy(deep=True)

In [4]:
# Columns that are not used in this experiment.
#   - co2_sensing: some sensing sensors don't have that feature in some rooms
#   - clothing: was only used for PMV
list_drop_columns = [
    'Unnamed: 0',
    'index',
    'clothing',
    'comfort_cozie',
    'responseSpeed_cozie',
    'co2_sensing',
    'voc_sensing',
    'lat_cozie',
    'lon_cozie',
    'Floor',
    'Latitude',
    'Longitude',
    'Space_id',
]


In [5]:
# Remove every column listed in `list_drop_columns` (the two index columns plus the
# features that are not used in this experiment).
# The frame is derived from `df_raw` without in-place mutation, so re-running this cell
# gives the same result instead of raising a KeyError for columns that are already gone.
df_processed = df_raw.drop(columns=list_drop_columns)
df_processed.head(10)


Unnamed: 0,time,heartRate_cozie,light_cozie,noise_cozie,thermal_cozie,user_id,room,humidity_sensing,light_sensing,noise_sensing,temperature_sensing,temperature_mbient
0,2019-09-28 09:07:28.561000+00:00,79.0,,,,cresh39,,,,,,
1,2019-09-29 09:07:30.131000+00:00,79.0,,,,cresh39,,,,,,
2,2019-09-30 00:55:46.387000+00:00,102.0,10.0,10.0,9.0,cresh39,,,,,,
3,2019-09-30 01:04:23.821000+00:00,91.0,10.0,10.0,11.0,cresh39,,,,,,
4,2019-09-30 03:07:36.976000+00:00,93.0,,10.0,,cresh39,,,,,,
5,2019-09-30 03:07:42.184000+00:00,99.0,10.0,10.0,10.0,cresh39,,,,,,
6,2019-09-30 03:30:56.306000+00:00,70.0,10.0,10.0,10.0,cresh39,,,,,,
7,2019-09-30 04:00:57.716000+00:00,66.0,10.0,10.0,9.0,cresh39,,,,,,
8,2019-09-30 04:08:23.750000+00:00,68.0,10.0,10.0,10.0,cresh35,0.0,50.8191,158.0,47.0,26.1212,
9,2019-09-30 04:37:53.719000+00:00,76.0,10.0,10.0,10.0,cresh35,,,,,,


## Drop rows with collection errors

In [6]:
# Thermal comfort votes have priority: instances without one are discarded
df_processed = df_processed.dropna(subset=['thermal_cozie'])
df_processed.head(10)


Unnamed: 0,time,heartRate_cozie,light_cozie,noise_cozie,thermal_cozie,user_id,room,humidity_sensing,light_sensing,noise_sensing,temperature_sensing,temperature_mbient
2,2019-09-30 00:55:46.387000+00:00,102.0,10.0,10.0,9.0,cresh39,,,,,,
3,2019-09-30 01:04:23.821000+00:00,91.0,10.0,10.0,11.0,cresh39,,,,,,
5,2019-09-30 03:07:42.184000+00:00,99.0,10.0,10.0,10.0,cresh39,,,,,,
6,2019-09-30 03:30:56.306000+00:00,70.0,10.0,10.0,10.0,cresh39,,,,,,
7,2019-09-30 04:00:57.716000+00:00,66.0,10.0,10.0,9.0,cresh39,,,,,,
8,2019-09-30 04:08:23.750000+00:00,68.0,10.0,10.0,10.0,cresh35,0.0,50.8191,158.0,47.0,26.1212,
9,2019-09-30 04:37:53.719000+00:00,76.0,10.0,10.0,10.0,cresh35,,,,,,
10,2019-09-30 04:48:46.900000+00:00,78.0,10.0,10.0,9.0,cresh35,1.0,96.7441,27.0,69.0,24.1869,
11,2019-09-30 04:53:00.125000+00:00,78.0,10.0,10.0,9.0,cresh35,2.0,,,,,
12,2019-09-30 05:02:37.388000+00:00,73.0,10.0,10.0,10.0,cresh35,2.0,,,,,


# Remove rows with missing data on mbient

In [7]:
# Keep only the instances that come with an mbient temperature reading
has_mbient_reading = df_processed['temperature_mbient'].notna()
df_processed = df_processed.loc[has_mbient_reading]

print("First Batch within building (COMPLETE AMBIENT) dataset dimension: {}".format(df_processed.shape))
df_processed.head(10)


First Batch within building (COMPLETE AMBIENT) dataset dimension: (3908, 12)


Unnamed: 0,time,heartRate_cozie,light_cozie,noise_cozie,thermal_cozie,user_id,room,humidity_sensing,light_sensing,noise_sensing,temperature_sensing,temperature_mbient
14,2019-09-30 05:38:46.872000+00:00,62.0,10.0,10.0,10.0,cresh35,0.0,55.0346,200.0,47.0,23.0508,27.25
15,2019-09-30 07:07:37.348000+00:00,73.0,10.0,10.0,10.0,cresh39,,,,,,27.75
16,2019-09-30 07:16:21.293000+00:00,66.0,10.0,10.0,11.0,cresh36,,,,,,29.625
17,2019-09-30 07:59:54.490000+00:00,66.0,10.0,10.0,10.0,cresh36,,,,,,28.75
18,2019-09-30 08:43:06.012000+00:00,68.0,10.0,10.0,10.0,cresh35,,,,,,30.5
19,2019-09-30 08:53:55.188000+00:00,80.0,10.0,10.0,10.0,cresh39,3.0,69.6409,31.0,52.0,26.0001,28.875
20,2019-09-30 09:00:21.186000+00:00,70.0,10.0,10.0,11.0,cresh39,3.0,68.8295,32.0,47.0,26.0454,29.125
21,2019-09-30 09:03:45.390000+00:00,84.0,10.0,10.0,10.0,cresh39,3.0,68.6628,31.0,46.0,26.0728,28.75
23,2019-09-30 09:27:24.914000+00:00,74.0,10.0,10.0,11.0,cresh39,1.0,90.0784,90.0,66.0,26.4108,28.5
24,2019-09-30 09:35:26.361000+00:00,73.0,10.0,10.0,11.0,cresh36,,,,,,29.375


## Filter only Cresh participants

In [8]:
print("All participants in file:")
print(df_processed['user_id'].unique())

# The participant number is whatever follows the first five characters of `user_id`;
# only participants numbered 30 or lower are kept
participant_number = pd.to_numeric(df_processed['user_id'].str.slice(5))
df_processed = df_processed[participant_number.le(30)]

print("Only Cresh participants:")
print(df_processed['user_id'].unique())


All participants in file:
['cresh35' 'cresh39' 'cresh36' 'cresh40' 'cresh37' 'cresh07' 'cresh10'
 'cresh08' 'cresh12' 'cresh09' 'cresh06' 'cresh02' 'cresh14' 'cresh13'
 'cresh15' 'cresh03' 'cresh05' 'cresh11' 'cresh01' 'cresh04' 'cresh22'
 'cresh16' 'cresh19' 'cresh21' 'cresh26' 'cresh29' 'cresh28' 'cresh27'
 'cresh18' 'cresh23' 'cresh20' 'cresh30' 'cresh17' 'cresh24' 'cresh25'
 'cresh41']
Only Cresh participants:
['cresh07' 'cresh10' 'cresh08' 'cresh12' 'cresh09' 'cresh06' 'cresh02'
 'cresh14' 'cresh13' 'cresh15' 'cresh03' 'cresh05' 'cresh11' 'cresh01'
 'cresh04' 'cresh22' 'cresh16' 'cresh19' 'cresh21' 'cresh26' 'cresh29'
 'cresh28' 'cresh27' 'cresh18' 'cresh23' 'cresh20' 'cresh30' 'cresh17'
 'cresh24' 'cresh25']


# Only votes indoor

In [9]:
# A vote is kept when it has a room and sensing readings.
# Per the original note, the sensing features go missing together, so checking a single
# one of them (`humidity_sensing`) is enough.
df_processed_indoor = df_processed.dropna(subset=['room', 'humidity_sensing'])

print("First Batch within building dataset dimension: {}".format(df_processed_indoor.shape))
df_processed_indoor.head(10)


First Batch within building dataset dimension: (1573, 12)


Unnamed: 0,time,heartRate_cozie,light_cozie,noise_cozie,thermal_cozie,user_id,room,humidity_sensing,light_sensing,noise_sensing,temperature_sensing,temperature_mbient
118,2019-10-04 05:20:19.351000+00:00,70.0,10.0,10.0,10.0,cresh07,4.0,54.3496,74.0,46.0,23.751,26.375
120,2019-10-04 05:21:01.479000+00:00,83.0,10.0,10.0,11.0,cresh10,4.0,54.3496,74.0,46.0,23.751,26.75
121,2019-10-04 05:27:55.857000+00:00,,10.0,10.0,10.0,cresh10,4.0,54.5601,70.0,44.0,23.6015,26.0
122,2019-10-04 05:30:27.209000+00:00,61.0,10.0,10.0,10.0,cresh07,4.0,54.4616,71.0,46.0,23.6015,26.25
123,2019-10-04 05:30:59.243000+00:00,92.0,10.0,10.0,11.0,cresh10,4.0,54.4616,71.0,46.0,23.6015,26.25
124,2019-10-04 05:51:17.360000+00:00,70.0,10.0,10.0,10.0,cresh07,4.0,54.8265,73.0,47.0,23.5289,25.25
128,2019-10-04 06:04:18.331000+00:00,76.0,10.0,10.0,10.0,cresh07,4.0,55.1793,65.0,47.0,23.6605,28.75
129,2019-10-04 06:04:34.742000+00:00,76.0,10.0,11.0,11.0,cresh10,4.0,55.1793,65.0,47.0,23.6605,27.75
136,2019-10-04 06:32:11.533000+00:00,114.0,10.0,10.0,10.0,cresh08,6.0,80.5197,44.0,62.0,27.7627,29.125
137,2019-10-04 06:35:38.733000+00:00,78.0,11.0,11.0,11.0,cresh07,1.0,78.8651,159.0,65.0,27.6248,31.5


# Convert `time` into a feature

Ref: http://blog.davidkaleko.com/feature-engineering-cyclical-features.html

In [10]:
# Parse the timestamps and express them in Singapore time
local_time = df_processed_indoor['time'].apply(pd.Timestamp).dt.tz_convert('Asia/Singapore')

# Position inside the daily cycle (24 * 60 = 1440 minutes) and the weekly cycle (7 days),
# expressed as angles in radians
minute_of_day = local_time.dt.hour * 60 + local_time.dt.minute
day_angle = minute_of_day * (2. * np.pi/1440)
week_angle = local_time.dt.dayofweek * (2. * np.pi/7)

# Replace the raw `time` column with the sine/cosine encoding of both cycles;
# the auxiliary minute-of-day and day-of-week values are never stored in the frame
df_processed_indoor_time = (
    df_processed_indoor
    .drop(['time'], axis=1)
    .assign(
        hour_sin=np.sin(day_angle),
        hour_cos=np.cos(day_angle),
        day_of_week_sin=np.sin(week_angle),
        day_of_week_cos=np.cos(week_angle),
    )
)

print(df_processed_indoor_time.head(10))


     heartRate_cozie  light_cozie  noise_cozie  thermal_cozie  user_id  room  \
118             70.0         10.0         10.0           10.0  cresh07   4.0   
120             83.0         10.0         10.0           11.0  cresh10   4.0   
121              NaN         10.0         10.0           10.0  cresh10   4.0   
122             61.0         10.0         10.0           10.0  cresh07   4.0   
123             92.0         10.0         10.0           11.0  cresh10   4.0   
124             70.0         10.0         10.0           10.0  cresh07   4.0   
128             76.0         10.0         10.0           10.0  cresh07   4.0   
129             76.0         10.0         11.0           11.0  cresh10   4.0   
136            114.0         10.0         10.0           10.0  cresh08   6.0   
137             78.0         11.0         11.0           11.0  cresh07   1.0   

     humidity_sensing  light_sensing  noise_sensing  temperature_sensing  \
118           54.3496           74.0       

# Remove NaNs

In [11]:
# Discard every instance that still has a missing value in any column
df_processed_indoor_time = df_processed_indoor_time.dropna()

print(df_processed_indoor_time.shape)
print(df_processed_indoor_time.head(10))


(1474, 15)
     heartRate_cozie  light_cozie  noise_cozie  thermal_cozie  user_id  room  \
118             70.0         10.0         10.0           10.0  cresh07   4.0   
120             83.0         10.0         10.0           11.0  cresh10   4.0   
122             61.0         10.0         10.0           10.0  cresh07   4.0   
123             92.0         10.0         10.0           11.0  cresh10   4.0   
124             70.0         10.0         10.0           10.0  cresh07   4.0   
128             76.0         10.0         10.0           10.0  cresh07   4.0   
129             76.0         10.0         11.0           11.0  cresh10   4.0   
136            114.0         10.0         10.0           10.0  cresh08   6.0   
137             78.0         11.0         11.0           11.0  cresh07   1.0   
138             73.0         10.0         10.0           11.0  cresh12   4.0   

     humidity_sensing  light_sensing  noise_sensing  temperature_sensing  \
118           54.3496           

# Create Feature Sets

<img src="../img/tiers.png">

## Measured variables are features

### Feature Set1: Time + Sensing

In [12]:
# Feature Set 1 leaves out the heart rate, the room and the mbient temperature
df_fs1 = df_processed_indoor_time.drop(columns=['heartRate_cozie', 'room', 'temperature_mbient'])
feature_set1 = df_fs1.columns.values

print("Feature Set1: {} \n Size: {}".format(feature_set1, df_fs1.shape))

Feature Set1: ['light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id' 'humidity_sensing'
 'light_sensing' 'noise_sensing' 'temperature_sensing' 'hour_sin'
 'hour_cos' 'day_of_week_sin' 'day_of_week_cos'] 
 Size: (1474, 12)


### Feature Set2: Time + Sensing + Heart Rate + mbient

In [13]:
# Feature Set 2 keeps every processed column except the room
df_fs2 = df_processed_indoor_time.drop(columns=['room'])
feature_set2 = df_fs2.columns.values

print("Feature Set2: {} \n Size: {}".format(feature_set2, df_fs2.shape))

Feature Set2: ['heartRate_cozie' 'light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id'
 'humidity_sensing' 'light_sensing' 'noise_sensing' 'temperature_sensing'
 'temperature_mbient' 'hour_sin' 'hour_cos' 'day_of_week_sin'
 'day_of_week_cos'] 
 Size: (1474, 14)


### Feature Set3: Time + Sensing + Heart Rate + mbient + room + preference history

In [14]:
df_fs3 = df_processed_indoor_time.copy()

# Remap and calculate the preference history, first per user and then per room.
# NOTE(review): `normalise_total_cozie` comes from `cozie_functions`; the meaning of its
# third argument (0) is not visible from this notebook - confirm against the helper.
comfy_columns = ["thermaly_comfy", 'aurally_comfy', 'visually_comfy']
grouped_user_df = normalise_total_cozie(df_fs3, 'user_id', 0).drop(comfy_columns, axis=1)
grouped_room_df = normalise_total_cozie(df_fs3, 'room', 0).drop(comfy_columns, axis=1)

# Map the grouped results back onto every row: each preference column is turned into a
# dictionary and looked up with `.map`, using the row's user and then its room as the key.
# New columns are named "<user|room>_grouped_<cooler|warmer|...>".
preferences = ['prefer_cooler', 'prefer_warmer', 'prefer_dimmer', 'prefer_brighter','prefer_quieter', 'prefer_louder']
grouping_specs = (
    ("user_grouped_", 'user_id', grouped_user_df),
    ("room_grouped_", 'room', grouped_room_df),
)
for label_prefix, key_column, grouped_df in grouping_specs:
    for preference in preferences:
        map_dict = grouped_df[preference].to_dict()
        label = label_prefix + preference.split("_")[1]
        df_fs3[label] = df_fs3[key_column].map(map_dict)

# Rows whose user or room could not be mapped to a thermal preference are discarded
df_fs3 = df_fs3.dropna(subset=["user_grouped_cooler", "user_grouped_warmer", "room_grouped_cooler", "room_grouped_warmer"])

feature_set3 = df_fs3.columns.values

print("Feature Set3: {} \n Size: {}".format(feature_set3, df_fs3.shape))


Feature Set3: ['heartRate_cozie' 'light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id'
 'room' 'humidity_sensing' 'light_sensing' 'noise_sensing'
 'temperature_sensing' 'temperature_mbient' 'hour_sin' 'hour_cos'
 'day_of_week_sin' 'day_of_week_cos' 'user_grouped_cooler'
 'user_grouped_warmer' 'user_grouped_dimmer' 'user_grouped_brighter'
 'user_grouped_quieter' 'user_grouped_louder' 'room_grouped_cooler'
 'room_grouped_warmer' 'room_grouped_dimmer' 'room_grouped_brighter'
 'room_grouped_quieter' 'room_grouped_louder'] 
 Size: (1474, 27)


### Feature Set4: Time + Heart Rate + mbient + room + preference history

In [15]:
# Feature Set 4 is Feature Set 3 without the four sensing measurements
sensing_columns = ['humidity_sensing', 'light_sensing', 'noise_sensing', 'temperature_sensing']
df_fs4 = df_fs3.drop(columns=sensing_columns)
feature_set4 = df_fs4.columns.values

print("Feature Set4: {} \n Size: {}".format(feature_set4, df_fs4.shape))

Feature Set4: ['heartRate_cozie' 'light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id'
 'room' 'temperature_mbient' 'hour_sin' 'hour_cos' 'day_of_week_sin'
 'day_of_week_cos' 'user_grouped_cooler' 'user_grouped_warmer'
 'user_grouped_dimmer' 'user_grouped_brighter' 'user_grouped_quieter'
 'user_grouped_louder' 'room_grouped_cooler' 'room_grouped_warmer'
 'room_grouped_dimmer' 'room_grouped_brighter' 'room_grouped_quieter'
 'room_grouped_louder'] 
 Size: (1474, 23)


### Feature Set5: Time + Heart Rate + room + preference history

In [16]:
# Feature Set 5 is Feature Set 4 without the mbient temperature
df_fs5 = df_fs4.drop(columns=['temperature_mbient'])
feature_set5 = df_fs5.columns.values

print("Feature Set5: {} \n Size: {}".format(feature_set5, df_fs5.shape))

Feature Set5: ['heartRate_cozie' 'light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id'
 'room' 'hour_sin' 'hour_cos' 'day_of_week_sin' 'day_of_week_cos'
 'user_grouped_cooler' 'user_grouped_warmer' 'user_grouped_dimmer'
 'user_grouped_brighter' 'user_grouped_quieter' 'user_grouped_louder'
 'room_grouped_cooler' 'room_grouped_warmer' 'room_grouped_dimmer'
 'room_grouped_brighter' 'room_grouped_quieter' 'room_grouped_louder'] 
 Size: (1474, 22)


### Feature Set6: Time + room + preference history

In [17]:
# Feature Set 6 is Feature Set 5 without the heart rate
df_fs6 = df_fs5.drop(columns=['heartRate_cozie'])
feature_set6 = df_fs6.columns.values

# The label names Feature Set 6; it used to read "Feature Set5", copied from the previous cell
print("Feature Set6: {} \n Size: {}".format(feature_set6, df_fs6.shape))

Feature Set5: ['light_cozie' 'noise_cozie' 'thermal_cozie' 'user_id' 'room' 'hour_sin'
 'hour_cos' 'day_of_week_sin' 'day_of_week_cos' 'user_grouped_cooler'
 'user_grouped_warmer' 'user_grouped_dimmer' 'user_grouped_brighter'
 'user_grouped_quieter' 'user_grouped_louder' 'room_grouped_cooler'
 'room_grouped_warmer' 'room_grouped_dimmer' 'room_grouped_brighter'
 'room_grouped_quieter' 'room_grouped_louder'] 
 Size: (1474, 21)


# Save Dataframes

In [18]:
def save_df(dataframe, file_name):
    """Write `dataframe` to `data-processed-preferences/<raw_data_date>_<file_name>.csv`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to export; its index is not written to the file.
    file_name : str
        Short name of the dataset, placed between the date prefix and the
        `.csv` extension.

    Notes
    -----
    The date prefix is read from the notebook-level variable `raw_data_date`,
    which must be defined before this function is called.
    The output directory is created when it does not exist yet, so a fresh
    checkout can run the notebook without preparing the folder by hand.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    output_dir = "data-processed-preferences/"
    os.makedirs(output_dir, exist_ok=True)

    new_name = raw_data_date + "_" + file_name + ".csv"
    dataframe.to_csv(output_dir + new_name, index=False)
    

In [19]:
# Export the six feature sets; file names are numbered fs1 ... fs6 in the same order
dataframes = [df_fs1, df_fs2, df_fs3, df_fs4, df_fs5, df_fs6]
dataframes_names = ['fs{}'.format(number) for number in range(1, len(dataframes) + 1)]
for df_name, df in zip(dataframes_names, dataframes):
    save_df(df, df_name)
    

In [20]:
# Every feature set above has 1474 rows (see the sizes printed for each one)