In [51]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [52]:
# Load the balanced thermostat log; every column is parsed as float64.
tpot_data = pd.read_csv('tstat_log_prepared_balanced.csv', sep=',', dtype=np.float64)

# Separate the predictors from the 'class' label by name. The predictor names
# are taken from the same frame that yields the feature matrix, so names and
# matrix columns stay aligned wherever 'class' sits in the CSV (slicing
# columns[:-1] is only correct when 'class' happens to be the last column).
feature_frame = tpot_data.drop('class', axis=1)
features = feature_frame.values

# A fixed random_state keeps the train/test partition reproducible.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['class'].values, random_state=42)

# Last expression of the cell: display the predictor names.
feature_frame.columns

Index(['ts2_t_cool', 'ts2_temp', 'wu_UV', 'wu_dewpoint_f', 'wu_heat_index_f',
       'wu_precip_1hr_in', 'wu_relative_humidity', 'wu_temp_f',
       'wu_wind_degrees', 'wu_wind_gust_mph',
       ...
       'Wednesday_21', 'Wednesday_22', 'Wednesday_23', 'Wednesday_3',
       'Wednesday_4', 'Wednesday_5', 'Wednesday_6', 'Wednesday_7',
       'Wednesday_8', 'Wednesday_9'],
      dtype='object', length=190)

In [53]:
# Candidate models that were tried on this data. Each disabled line ends with
# the score the author recorded for that candidate; enable exactly one
# assignment to `exported_pipeline` at a time.
#exported_pipeline = LogisticRegression(C=25.0, dual=False, penalty="l2") # 0.96799999999999997
#exported_pipeline = LinearSVC(C=15.0, dual=False, loss="squared_hinge", penalty="l1", tol=0.01) # 0.96799999999999997
#exported_pipeline = ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=20, min_samples_split=18, n_estimators=100) # 0.94399999999999995
#exported_pipeline = RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.6000000000000001, min_samples_leaf=9, min_samples_split=19, n_estimators=100) # 0.98399999999999999
#exported_pipeline = RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.45, min_samples_leaf=18, min_samples_split=5, n_estimators=100) # 0.99037632864119507

# Currently selected model: XGBoost classifier, one keyword per line so the
# hyper-parameters are easy to scan and diff.
exported_pipeline = XGBClassifier(
    learning_rate=0.001,
    max_depth=5,
    min_child_weight=7,
    n_estimators=100,
    nthread=1,
    subsample=0.7500000000000001,
)


In [54]:
# Train the selected model on the training split. fit() is the cell's last
# expression, so the fitted estimator's repr is shown as the cell output.
exported_pipeline.fit(X=training_features, y=training_target)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.001, max_delta_step=0, max_depth=5,
       min_child_weight=7, missing=None, n_estimators=100, nthread=1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True,
       subsample=0.7500000000000001)

In [55]:
# Print (feature name, importance) for every feature whose importance in the
# fitted model exceeds the threshold below.
IMPORTANCE_THRESHOLD = 0.01

# Derive the names the same way the feature matrix is built (drop 'class' by
# name) instead of assuming 'class' is the last column of the frame.
feature_names = tpot_data.drop('class', axis=1).columns
importances = exported_pipeline.feature_importances_

# zip() silently truncates to the shorter input; fail loudly if `tpot_data`
# no longer matches the data the model was fitted on.
assert len(feature_names) == len(importances), \
    "feature names ({}) and importances ({}) differ in length".format(
        len(feature_names), len(importances))

for name, importance in zip(feature_names, importances):
    if importance > IMPORTANCE_THRESHOLD:
        # Printed as a tuple to keep the original output format.
        print((name, importance))

('ts2_t_cool', 0.29673591)
('ts2_temp', 0.29673591)
('wu_UV', 0.026706232)
('wu_dewpoint_f', 0.011869436)
('wu_heat_index_f', 0.03560831)
('wu_relative_humidity', 0.06231454)
('wu_temp_f', 0.03560831)
('wu_wind_degrees', 0.13649851)
('Clear', 0.011869436)
('Partly Cloudy', 0.080118693)


In [56]:
# Predict labels for the held-out test split with the fitted model; `results`
# is reused by the later cells that list misclassifications and build the
# confusion matrix.
results = exported_pipeline.predict(testing_features)

In [57]:
# List every test sample whose predicted label differs from its true label.
print('bad predictions')
print('target','prediction')

row_template = "Actual outcome :: {} and Predicted outcome :: {}"

# The feature rows are zipped alongside the labels (as before) but only the
# label pair is kept for reporting.
misclassified = [
    (actual, predicted)
    for _features, actual, predicted in zip(testing_features, testing_target, results)
    if predicted != actual
]

for actual, predicted in misclassified:
    print(row_template.format(actual, predicted))

bad predictions
target prediction
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 1.0 and Predicted outcome :: 0.0
Actual outcome :: 1.0 and Predicted outcome :: 0.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0
Actual outcome :: 0.0 and Predicted outcome :: 1.0


In [58]:
# Score the fitted model on the held-out test split; the value is the cell's
# last expression and is displayed as its output.
exported_pipeline.score(X=testing_features, y=testing_target)

0.970873786407767

In [59]:
# Confusion matrix for the test split: rows are true labels, columns are
# predicted labels.
test_confusion = confusion_matrix(y_true=testing_target, y_pred=results)
print(" Confusion matrix\n", test_confusion)

 Confusion matrix
 [[244   7]
 [  2  56]]


In [60]:
# After training on the balanced data, evaluate the already-fitted model on
# the unbalanced data set. The model is NOT refit in this cell; only the test
# part of the new split is scored.
#
# NOTE(review): this cell rebinds tpot_data / features / training_* /
# testing_* / results, so any earlier cell re-run after this one will operate
# on the unbalanced data instead of the balanced data.
# NOTE(review): if the balanced CSV was sampled from this file, some rows
# scored here may also have been used for training -- confirm before relying
# on these numbers.
tpot_data = pd.read_csv('tstat_log_prepared.csv', sep=',', dtype=np.float64)
features = tpot_data.drop('class', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['class'].values, random_state=42)

# Features are passed to the model positionally, so at minimum the column
# count must match what the model was fitted on.
fitted_feature_count = len(exported_pipeline.feature_importances_)
assert features.shape[1] == fitted_feature_count, \
    "unbalanced data has {} feature columns, model was fitted on {}".format(
        features.shape[1], fitted_feature_count)

results = exported_pipeline.predict(testing_features)

print(exported_pipeline.score(testing_features, testing_target))
unbalanced_confusion = confusion_matrix(testing_target, results)
print(" Confusion matrix\n", unbalanced_confusion)

# Accuracy alone is dominated by the majority class on unbalanced data, so
# also report precision and recall for the positive (larger-label) class.
if unbalanced_confusion.shape == (2, 2):
    tn, fp, fn, tp = unbalanced_confusion.ravel()
    precision = tp / (tp + fp) if (tp + fp) else float('nan')
    recall = tp / (tp + fn) if (tp + fn) else float('nan')
    print("Positive-class precision: {:.3f}, recall: {:.3f}".format(precision, recall))

0.965527147371
 Confusion matrix
 [[6661  234]
 [   6   61]]
