In [1]:
import pandas as pd
from tpot import TPOTRegressor, TPOTClassifier
from sklearn.model_selection import train_test_split

In [8]:
# Read the raw log CSV into the working DataFrame.
data = pd.read_csv(filepath_or_buffer='tstat_log.csv')

# Feature Engineering

In [9]:
# must change all features to numerical values
# https://github.com/rhiever/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb

# extract time series attributes
# https://github.com/blue-yonder/tsfresh

# date and time 
# https://github.com/crsmithdev/arrow

# scikit-plot
# https://github.com/reiinakano/scikit-plot

# Keep only the rows whose ts2_tmode is not 'Heat'.
# The explicit .copy() makes `data` an independent frame instead of a
# selection derived from the frame that was read from disk, so the in-place
# drops and .loc assignments that follow do not raise SettingWithCopyWarning.
data = data[data.ts2_tmode != 'Heat'].copy()

# Remove the columns that are not needed, in a single call.
unneeded_columns = [
    'ts1_fmode',
    'ts1_fstate',
    'ts1_hold',
    'ts1_tstat_name',
    'ts1_t_heat',
    'ts2_t_heat',
    'ts2_fmode',
    'ts2_hold',
    'ts2_fstate',
    'wu_windchill_f',
    'ts2_tstat_name',
    'wu_station',
    'wu_precip_today_in',
    'tstat_id',
]
data.drop(unneeded_columns, axis=1, inplace=True)

# Replace NaN with the -999 sentinel in the listed columns.
# The filled columns are assigned back to the frame rather than calling
# fillna(inplace=True) on data[col]: in-place fillna on a column selection is
# chained assignment, which may act on a temporary object and is a no-op under
# pandas Copy-on-Write.
sentinel_columns = ['wu_heat_index_f', 'ts1_t_cool', 'ts2_t_cool']
data[sentinel_columns] = data[sentinel_columns].fillna(-999)

# Remove the ts1_* measurement columns (the other thermostat) in one call.
other_thermostat_columns = ['ts1_t_cool', 'ts1_temp', 'ts1_tstate']
data.drop(other_thermostat_columns, axis=1, inplace=True)

# A reading of -999.0 inches of rain per hour is a placeholder; store 0 instead.
precip_is_sentinel = data['wu_precip_1hr_in'].eq(-999.0)
data.loc[precip_is_sentinel, 'wu_precip_1hr_in'] = 0

# Convert the humidity percentage strings (e.g. '63%') to numbers.
# The vectorized .str accessor is used instead of map(lambda x: x.strip('%'))
# so that a missing value is passed through as NaN rather than raising
# AttributeError (a float NaN has no .strip method).
humidity_text = data['wu_relative_humidity'].str.strip('%')
data['wu_relative_humidity'] = pd.to_numeric(humidity_text)

# One-hot encode the categorical columns.
# wu_weather keeps the bare category names as column names; the thermostat
# columns get '<source column>_' prepended via `prefix`.
wu_weather_encoded = pd.get_dummies(data['wu_weather'])
ts1_tmode_encoded = pd.get_dummies(data['ts1_tmode'], prefix='ts1_tmode')
ts2_tmode_encoded = pd.get_dummies(data['ts2_tmode'], prefix='ts2_tmode')
ts2_tstate_encoded = pd.get_dummies(data['ts2_tstate'], prefix='ts2_tstate')

# The source columns are no longer needed once their indicators exist.
encoded_sources = ['wu_weather', 'ts1_tmode', 'ts2_tmode', 'ts2_tstate']
data.drop(encoded_sources, axis=1, inplace=True)

# NOTE: ts1_tmode_encoded is built above but is not part of this concat;
# only the weather and ts2_* indicator columns are appended to the frame.
# (ts1_tstate is not encoded at all.)
data = pd.concat(
    [data, wu_weather_encoded, ts2_tmode_encoded, ts2_tstate_encoded],
    axis=1,
)

# Build weekday and hour-of-day indicator columns from log_timestamp.

# Parse the timestamp strings. The deprecated infer_datetime_format argument
# is omitted; it does not affect the parsed values.
log_timestamp = pd.to_datetime(data['log_timestamp'])

# One indicator column per weekday name. Series.dt.weekday_name was removed in
# pandas 1.0; dt.day_name() is its replacement and returns the same names.
ts_weekday_name = pd.get_dummies(log_timestamp.dt.day_name())

# One indicator column per hour of day, named 'hour_<n>'.
ts_hour = pd.get_dummies(log_timestamp.dt.hour)
ts_hour.columns = ['hour_' + str(s) for s in ts_hour.columns]

data = pd.concat([data, ts_weekday_name, ts_hour], axis=1)

# The raw timestamp string is not kept once the features are derived.
data.drop('log_timestamp', axis=1, inplace=True)

In [10]:
# Build the target/response column, named 'class'.
#
# A row is labelled 1 when its cooling set point (ts2_t_cool) is lower than
# the set point of the row immediately before it, and 0 otherwise. The first
# row has no predecessor, so its comparison is False and it is labelled 0.
# NOTE(review): ts2_t_cool may contain the -999 fill value written earlier in
# the notebook; it is compared here like any other set point - confirm this
# is intended.
prior_setpoint = data['ts2_t_cool'].shift(1)
data['class'] = (prior_setpoint > data['ts2_t_cool']).astype(float)

# Any NaN still left anywhere in the frame becomes 0.0.
data.fillna(0.0, inplace=True)
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27835,27836,27837,27838,27839,27840,27841,27842,27843,27844
ts2_t_cool,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,...,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
ts2_temp,74.5,74.0,74.0,74.0,74.0,74.0,74.0,74.5,74.5,75.0,...,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
wu_UV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wu_dewpoint_f,74.8,75.2,75.2,75.2,75.2,75.2,75.2,75.6,75.7,75.4,...,45.9,45.5,49.0,45.1,45.5,45.3,45.1,44.6,45.0,44.8
wu_heat_index_f,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
wu_precip_1hr_in,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wu_relative_humidity,63.0,64.0,64.0,64.0,64.0,64.0,64.0,65.0,66.0,65.0,...,79.0,79.0,73.0,79.0,80.0,80.0,80.0,79.0,80.0,80.0
wu_temp_f,89.1,88.9,88.9,88.9,88.9,88.9,88.9,88.7,88.5,88.5,...,52.2,51.8,57.0,51.4,51.4,51.3,51.1,50.9,50.9,50.7
wu_wind_degrees,256.0,242.0,199.0,215.0,196.0,196.0,206.0,268.0,228.0,228.0,...,28.0,61.0,18.0,310.0,275.0,317.0,352.0,0.0,348.0,239.0
wu_wind_gust_mph,7.4,2.5,4.9,0.0,0.0,0.0,2.5,0.0,2.5,4.9,...,2.5,0.0,0.0,2.5,0.0,0.0,2.5,0.0,2.5,4.9


In [11]:
# Write the prepared frame to disk without the index column.
data.to_csv(path_or_buf='tstat_log_prepared.csv', index=False)