In [1]:
import pandas as pd
from tpot import TPOTRegressor, TPOTClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('tstat_log.csv')

# Feature Engineering

In [3]:
# must change all features to numerical values
# https://github.com/rhiever/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb

# extract time series attributes
# https://github.com/blue-yonder/tsfresh

# date and time 
# https://github.com/crsmithdev/arrow

# scikit-plot
# https://github.com/reiinakano/scikit-plot

# drop unneeded columns
data.drop('ts1_fmode', axis=1, inplace=True)
data.drop('ts1_fstate', axis=1, inplace=True)
data.drop('ts1_hold', axis=1, inplace=True)
data.drop('ts1_tstat_name', axis=1, inplace=True)
data.drop('ts1_t_heat', axis=1, inplace=True)
data.drop('ts2_fmode', axis=1, inplace=True)
data.drop('ts2_hold', axis=1, inplace=True)
data.drop('ts2_fstate', axis=1, inplace=True)
data.drop('wu_windchill_f', axis=1, inplace=True)
data.drop('ts2_tstat_name', axis=1, inplace=True)
data.drop('wu_station', axis=1, inplace=True)
data.drop('wu_precip_today_in', axis=1, inplace=True)
data.drop('tstat_id', axis=1, inplace=True)

# replace -999.0 inches of rain per hour with 0
data.loc[data['wu_precip_1hr_in'] == -999.0, 'wu_precip_1hr_in'] = 0

# replace NaN with -999.0
data['wu_heat_index_f'].fillna(-999, inplace=True)
data['ts1_t_cool'].fillna(-999, inplace=True)

# convert percent to integer
data['wu_relative_humidity'] = pd.to_numeric(data['wu_relative_humidity'].map(lambda x: x.strip('%')))

# one hot encode categorical data
wu_weather_encoded = pd.get_dummies(data['wu_weather'])
data.drop('wu_weather', axis=1, inplace=True)

ts1_tmode_encoded = pd.get_dummies(data['ts1_tmode'])
ts1_tmode_encoded.columns = ['ts1_tmode_' + s for s in ts1_tmode_encoded.columns]
data.drop('ts1_tmode', axis=1, inplace=True)

ts1_tstate_encoded = pd.get_dummies(data['ts1_tstate'])
ts1_tstate_encoded.columns = ['ts1_tstate_' + s for s in ts1_tstate_encoded.columns]
data.drop('ts1_tstate', axis=1, inplace=True)

ts2_tmode_encoded = pd.get_dummies(data['ts2_tmode'])
ts2_tmode_encoded.columns = ['ts2_tstate_' + s for s in ts2_tmode_encoded.columns]
data.drop('ts2_tmode', axis=1, inplace=True)

ts2_tstate_encoded = pd.get_dummies(data['ts2_tstate'])
ts2_tstate_encoded.columns = ['ts2_tstate_' + s for s in ts2_tstate_encoded.columns]
data.drop('ts2_tstate', axis=1, inplace=True)

data = pd.concat([data, wu_weather_encoded, ts1_tmode_encoded, ts1_tstate_encoded, ts2_tmode_encoded, ts2_tstate_encoded], axis=1)

# create features from log_timestamp
# convert string to datetime
log_timestamp = pd.to_datetime(data['log_timestamp'],infer_datetime_format=True)
# get weekday
ts_weekday_name = pd.get_dummies(log_timestamp.dt.weekday_name)
# get hour
ts_hour = pd.get_dummies(log_timestamp.dt.hour)
ts_hour.columns = ['hour_' + str(s) for s in ts_hour.columns]
data = pd.concat([data, ts_weekday_name, ts_hour], axis=1)

data.drop('log_timestamp', axis=1, inplace=True)

In [4]:
# the target class/response variable is named class

# identify when the ac set point was lowered manually by comparing to the prior set point

# create a new shifted _t_ column to represent the prior setpoint
data['ts1_t_cool_prior'] = data.ts1_t_cool.shift(-1)
data.loc[data.ts1_t_cool_prior > data.ts1_t_cool,'class'] = 1
data.loc[data.ts1_t_cool_prior <= data.ts1_t_cool,'class'] = 0
data.drop('ts1_t_cool_prior', axis=1, inplace=True)
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23163,23164,23165,23166,23167,23168,23169,23170,23171,23172
ts1_t_cool,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,...,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
ts1_temp,75.0,75.5,75.5,76.0,75.5,75.0,75.0,75.0,75.5,75.5,...,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,74.5,74.5
ts2_temp,75.5,76.0,76.0,76.0,76.0,75.5,75.0,75.0,75.5,76.0,...,74.5,74.5,74.5,75.0,75.0,75.0,75.0,75.0,75.0,75.0
wu_UV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wu_dewpoint_f,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,68.0,...,54.7,54.7,54.5,54.9,54.7,54.7,54.5,54.5,54.3,54.3
wu_heat_index_f,84.0,84.0,84.0,84.0,84.0,83.0,83.0,83.0,83.0,83.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
wu_precip_1hr_in,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wu_relative_humidity,61.0,61.0,62.0,62.0,63.0,63.0,63.0,64.0,64.0,66.0,...,89.0,89.0,89.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0
wu_temp_f,81.6,81.6,81.4,81.4,81.2,80.9,80.9,80.7,80.5,80.3,...,57.9,57.9,57.7,57.7,57.6,57.6,57.4,57.4,57.2,57.2
wu_wind_degrees,90.0,112.0,22.0,202.0,22.0,292.0,22.0,112.0,112.0,22.0,...,322.0,349.0,336.0,329.0,19.0,358.0,316.0,31.0,8.0,348.0


In [5]:

# data['wu_weather'].value_counts()
# data.dtypes

In [6]:
target_column_name = 'class'
target = data[target_column_name]
data.drop(target_column_name, axis=1, inplace=True)

target.fillna(0.0, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    train_size=0.75, test_size=0.25)



In [7]:
#print(target.isnull().any())
data.isnull().any()
y_train.shape
y_test.shape
y_test.isnull().any()


False

In [14]:
tpot = TPOTClassifier(generations=1, population_size=5, verbosity=2, n_jobs=-1)
tpot.fit(X_train, y_train)

                                                                            

Generation 1 - Current best internal CV score: 0.9964900363855304

Best pipeline: DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=6, min_samples_leaf=14, min_samples_split=16)




TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=1, max_eval_time_mins=5,
        max_time_mins=None, mutation_rate=0.9, n_jobs=4, offspring_size=5,
        periodic_checkpoint_folder=None, population_size=5,
        random_state=None, scoring=None, subsample=1.0, verbosity=2,
        warm_start=False)

In [15]:
print(tpot.score(X_test, y_test))

0.996375560925


In [16]:
tpot.export('tstat_pipeline.py')

True