# Casas smarthome ADS Coursework

In [235]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [236]:
import datetime
import os
import re
import seaborn as sns
from collections import Counter
from datetime import datetime



In [237]:
sensor_codes = ['M001', 'M002', 'M003', 'M004', 'M005', 'M006', 'M007', 
                'M008', 'M009', 'M010', 'M011', 'M012', 'M013', 'M014',
                'M015', 'M016', 'M017', 'M018', 'M019', 'M020', 'M021', 
                'M022', 'M023', 'M024', 'M025', 'M026', 'M027', 'M028',
                'M029', 'M030', 'D001', 'D002', 'D003', 'D004', 'T001',
                'T002', 'T003', 'T004', 'T005']

# Data cleaning

In [238]:
def load_dataset(filename):
    timestamps = []
    sensors = []
    values = []
    activities = []
    current_activity = ''  # empty

    with open(filename, 'rb') as features:
        database = features.readlines()
        
        for i, line in enumerate(database):  # each line
            f_info = line.decode().split()  # find fields
            try:
                if 'M' == f_info[2][0] or 'D' == f_info[2][0] or 'T' == f_info[2][0]:
                    if str(np.array(f_info[2])) in sensor_codes:
                        sensors.append(str(np.array(f_info[2])))
                    else:
                        continue
                       
                    if 'OFF' in f_info[3]:
                        values.append('OFF')
                    elif 'ON' in f_info[3]:
                        values.append('ON')
                    else:
                        try:
                            values.append(float(str(np.array(f_info[3]))))
                        except ValueError:
                            del sensors[-1]
                            continue
                            
                    # choose only M D T sensors, avoiding unexpected errors
                    if not ('.' in str(np.array(f_info[0])) + str(np.array(f_info[1]))):
                        f_info[1] = f_info[1] + '.000000'
                        
                    try:
                        timestamps.append(datetime.strptime(str(np.array(f_info[0])) + str(np.array(f_info[1])),
                                                        "%Y-%m-%d%H:%M:%S.%f"))
                    except ValueError:
                        del sensors[-1]
                        del values[-1]
                        continue
                        
                    if len(f_info) == 4:  # if activity does not exist
                        activities.append(current_activity)
                    else:  # if activity exists
                        des = str(' '.join(np.array(f_info[4:])))
                        if 'begin' in des:
                            current_activity = re.sub('begin', '', des)
                            if current_activity[-1] == ' ':  # if white space at the end
                                current_activity = current_activity[:-1]  # delete white space
                            activities.append(current_activity)
                        if 'end' in des:
                            activities.append(current_activity)
                            current_activity = ''
            except IndexError:
                print(i, line)
                
    features.close()
    
    return pd.DataFrame(np.transpose(np.array([timestamps, sensors, values, activities])), 
                        columns = ['Timestamps', 'Sensors', 'Values', 'Activities'])
    

In [239]:
aruba_dataset = load_dataset("./datasets/aruba/data")

In [240]:
#copy_aruba = aruba_dataset[:5000].copy();
copy_aruba = aruba_dataset.copy();

In [241]:
def add_transition_labels(aruba_set):
    new_activity = ''
    previous_activity = ''
    
    for i, entry in enumerate(aruba_set['Activities']): 
        if entry == '':
            if new_activity != '':
                aruba_set.at[i, 'Activities'] = new_activity
            else:
                for next_entry in aruba_set['Activities'][i:]:
                    if next_entry != '':
                        new_activity = 'Transition_' + previous_activity + '_' + next_entry
                        aruba_set.at[i, 'Activities'] = new_activity
                        break            
        else:
            previous_activity = entry
            new_activity = ''
    return aruba_set

In [242]:
complete_aruba = add_transition_labels(copy_aruba)

In [243]:
complete_aruba.head(10)

Unnamed: 0,Timestamps,Sensors,Values,Activities
0,2010-11-04 00:03:50.209589,M003,ON,Sleeping
1,2010-11-04 00:03:57.399391,M003,OFF,Sleeping
2,2010-11-04 00:15:08.984841,T002,21.5,Sleeping
3,2010-11-04 00:30:19.185547,T003,21.0,Sleeping
4,2010-11-04 00:30:19.385336,T004,21.0,Sleeping
5,2010-11-04 00:35:22.245870,T005,20.5,Sleeping
6,2010-11-04 00:40:25.428962,T005,21.0,Sleeping
7,2010-11-04 00:45:28.658171,T005,20.5,Sleeping
8,2010-11-04 01:05:42.269469,T001,20.0,Sleeping
9,2010-11-04 01:15:48.936777,T002,21.0,Sleeping


# Pre-processing

In [244]:
def process_time_data(dataset):
    weekday = []
    seconds = []
    daytime = []
    
    for instance in dataset["Timestamps"]:
        weekday.append(instance.day_of_week)
        seconds.append(instance.hour * 3600 + instance.minute * 60 + instance.second)
        if instance.hour >= 0 and instance.hour <= 6:
            daytime.append('Morning_Night')
        elif instance.hour >= 6 and instance.hour <= 12:
            daytime.append('Morning_Day')
        elif instance.hour >= 12 and instance.hour <= 18:
            daytime.append('Afternoon')
        else: 
            daytime.append('Evening')
        
    dataset["Weekday"] = weekday
    dataset["Seconds"] = seconds
    dataset["Daytime"] = daytime
    return dataset


In [245]:
copy_complete_aruba = complete_aruba.copy();

In [246]:
timed_aruba = process_time_data(copy_complete_aruba)

In [247]:
timed_aruba.head()


Unnamed: 0,Timestamps,Sensors,Values,Activities,Weekday,Seconds,Daytime
0,2010-11-04 00:03:50.209589,M003,ON,Sleeping,3,230,Morning_Night
1,2010-11-04 00:03:57.399391,M003,OFF,Sleeping,3,237,Morning_Night
2,2010-11-04 00:15:08.984841,T002,21.5,Sleeping,3,908,Morning_Night
3,2010-11-04 00:30:19.185547,T003,21.0,Sleeping,3,1819,Morning_Night
4,2010-11-04 00:30:19.385336,T004,21.0,Sleeping,3,1819,Morning_Night


In [248]:
timed_aruba.describe()

Unnamed: 0,Weekday,Seconds
count,1709857.0,1709857.0
mean,2.988941,50176.61
std,2.02184,18949.67
min,0.0,0.0
25%,1.0,34860.0
50%,3.0,51721.0
75%,5.0,65661.0
max,6.0,86399.0


In [249]:
data = timed_aruba.drop(columns=['Timestamps'])

In [250]:
data.head()

Unnamed: 0,Sensors,Values,Activities,Weekday,Seconds,Daytime
0,M003,ON,Sleeping,3,230,Morning_Night
1,M003,OFF,Sleeping,3,237,Morning_Night
2,T002,21.5,Sleeping,3,908,Morning_Night
3,T003,21.0,Sleeping,3,1819,Morning_Night
4,T004,21.0,Sleeping,3,1819,Morning_Night


In [251]:
sensors_col = data['Sensors'].copy()
values_col = data['Values'].copy()

print(sensors_col[:5])
print(values_col[:5])

0    M003
1    M003
2    T002
3    T003
4    T004
Name: Sensors, dtype: object
0      ON
1     OFF
2    21.5
3    21.0
4    21.0
Name: Values, dtype: object


In [252]:
data2 = data.drop(columns=['Sensors', 'Values'])
data2.head()

Unnamed: 0,Activities,Weekday,Seconds,Daytime
0,Sleeping,3,230,Morning_Night
1,Sleeping,3,237,Morning_Night
2,Sleeping,3,908,Morning_Night
3,Sleeping,3,1819,Morning_Night
4,Sleeping,3,1819,Morning_Night


In [253]:
temperatures = []
for x in values_col:
    try:
        temperatures.append(float(x))
    except ValueError:
        pass

mean_temp = np.array(temperatures).mean()
print(mean_temp)

23.018337522518657


In [254]:
def init_df_state(sensor_codes, mean_temp):
    state_df = pd.DataFrame()
    for sensor in sensor_codes:
        if sensor[0] == 'T':
            state_df.at[0, sensor] = mean_temp
        else:
            state_df.at[0, sensor] = 0 #OFF
            
    return state_df

In [255]:
state_df = init_df_state(sensor_codes, mean_temp)
state_df

Unnamed: 0,M001,M002,M003,M004,M005,M006,M007,M008,M009,M010,...,M030,D001,D002,D003,D004,T001,T002,T003,T004,T005
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,23.018338,23.018338,23.018338,23.018338,23.018338


In [262]:
def compute_sensor_state(current_state, sensors, values):
    state = pd.DataFrame(columns=current_state.columns)
    #state = current_state.copy()
    
    for i in range(0, len(sensors)):
        if values[i] == 'ON' or values[i] == 'OPEN':
            current_state.at[0, sensors[i]] = 1
        elif values[i] == 'OFF' or values[i] == 'CLOSE':
            current_state.at[0, sensors[i]] = 0
        else:
            current_state.at[0, sensors[i]] = float(values[i])
            
        state = state.append(pd.Series(current_state.iloc[0]), ignore_index=True)
       
    
    return state
    

In [257]:
print('talent')


talent


In [None]:
state = compute_sensor_state(state_df, sensors_col, values_col)

In [None]:
state.shape

In [None]:
state.describe()

In [None]:
data2.shape
data2

In [None]:
state

In [None]:
new_data = pd.concat([data2, state], axis=1).reindex(data2.index)

In [None]:
new_data.head()

In [None]:
#fig = plt.figure(figsize=(30,2))
#plt.pie(new_data['Activities'])
#plt.show()
from collections import Counter

count = Counter(new_data['Activities'])
print(count)

In [None]:
keys = []
values = []
for key, value in count.items():
    keys.append(key)
    values.append(value)
    
print(keys)
print(values)

In [None]:
fig = plt.figure(figsize=(10,10))
plt.pie(values, labels=keys,colors=sns.color_palette('bright')[0:20], autopct='%.0f%%')
plt.show()

In [None]:
fig = plt.figure(figsize=(30,5))
plt.xticks(rotation='vertical', fontsize=20)
plt.bar(keys, values)