# Predict event type

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing
import sklearn.model_selection

In [2]:
bin_columns = ('Day', 'Date', 'Time', 'People_In', 'People_Out', 'IsEvent')

bin_data = pd.read_csv('data/building_event_binary.txt', names=bin_columns)
print bin_data.dtypes, "\n"
print bin_data.head(), "\n"
bin_data.describe()

Day           object
Date          object
Time          object
People_In      int64
People_Out     int64
IsEvent       object
dtype: object 

      Day      Date      Time  People_In  People_Out  IsEvent
0  Sunday  07/24/05  00:00:00          0           0  noevent
1  Sunday  07/24/05  00:30:00          1           0  noevent
2  Sunday  07/24/05  01:00:00          0           0  noevent
3  Sunday  07/24/05  01:30:00          0           0  noevent
4  Sunday  07/24/05  02:00:00          0           0  noevent 



Unnamed: 0,People_In,People_Out
count,5040.0,5040.0
mean,3.7375,3.888294
std,6.010372,6.834565
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,5.0,5.25
max,54.0,62.0


In [3]:
mult_columns = ('Day', 'Date', 'Time', 'People_In', 'People_Out', 'EventType')

mult_data = pd.read_csv('data/building_event_multiclass.txt', names=mult_columns)
print mult_data.dtypes, "\n"
print mult_data.head(), "\n"
mult_data.describe()

Day           object
Date          object
Time          object
People_In      int64
People_Out     int64
EventType     object
dtype: object 

       Day      Date      Time  People_In  People_Out EventType
0  Tuesday  07/26/05  11:30:00          9          26    eventA
1  Tuesday  07/26/05  12:00:00          8          13    eventA
2  Tuesday  07/26/05  12:30:00          6           3    eventA
3  Tuesday  07/26/05  13:00:00         25          12    eventA
4  Tuesday  07/26/05  13:30:00         12          12    eventA 



Unnamed: 0,People_In,People_Out
count,176.0,176.0
mean,13.647727,14.801136
std,9.398528,9.666449
min,0.0,0.0
25%,7.75,7.0
50%,12.0,13.0
75%,18.25,21.0
max,49.0,55.0


In [4]:
# Encode the data with the label encoder
# Each column needs to be encoded separately - disadvantage of using Label Encoding
# Don't want to One-Hot all of the dates though
# Only doing it this way because of limited number of columns
# Better way here: http://stackoverflow.com/a/31939145

le_both_day = preprocessing.LabelEncoder() # seven days are standard
le_bin_date = preprocessing.LabelEncoder()
le_mult_date = preprocessing.LabelEncoder()
le_both_time = preprocessing.LabelEncoder() # time steps are each half hour
le_bin_isevent = preprocessing.LabelEncoder()
le_mult_eventtype = preprocessing.LabelEncoder()

In [5]:
bin_data_dummies = pd.DataFrame(data=None, columns=bin_data.columns, index=bin_data.index)
bin_data_dummies.Day = le_both_day.fit_transform(bin_data.Day)
bin_data_dummies.Date = le_bin_date.fit_transform(bin_data.Date)
bin_data_dummies.Time = le_both_time.fit_transform(bin_data.Time)
bin_data_dummies.People_In = bin_data.People_In
bin_data_dummies.People_Out = bin_data.People_Out
bin_data_dummies.IsEvent = le_bin_isevent.fit_transform(bin_data.IsEvent)

print bin_data_dummies.dtypes, "\n"
print bin_data_dummies.head(), "\n"
bin_data_dummies.describe()

Day           int64
Date          int64
Time          int64
People_In     int64
People_Out    int64
IsEvent       int64
dtype: object 

   Day  Date  Time  People_In  People_Out  IsEvent
0    3     0     0          0           0        1
1    3     0     1          1           0        1
2    3     0     2          0           0        1
3    3     0     3          0           0        1
4    3     0     4          0           0        1 



Unnamed: 0,Day,Date,Time,People_In,People_Out,IsEvent
count,5040.0,5040.0,5040.0,5040.0,5040.0,5040.0
mean,3.0,52.0,23.5,3.7375,3.888294,0.965079
std,2.000198,30.312522,13.854774,6.010372,6.834565,0.183597
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,26.0,11.75,0.0,0.0,1.0
50%,3.0,52.0,23.5,1.0,0.0,1.0
75%,5.0,78.0,35.25,5.0,5.25,1.0
max,6.0,104.0,47.0,54.0,62.0,1.0


In [6]:
mult_data_dummies = pd.DataFrame(data=None, columns=mult_data.columns, index=mult_data.index)
mult_data_dummies.Day = le_both_day.fit_transform(mult_data.Day)
mult_data_dummies.Date = le_mult_date.fit_transform(mult_data.Date)
mult_data_dummies.Time = le_both_time.fit_transform(mult_data.Time)
mult_data_dummies.People_In = mult_data.People_In
mult_data_dummies.People_Out = mult_data.People_Out
mult_data_dummies.EventType = le_mult_eventtype.fit_transform(mult_data.EventType)

print mult_data_dummies.dtypes, "\n"
print mult_data_dummies.head(), "\n"
mult_data_dummies.describe()

Day           int64
Date          int64
Time          int64
People_In     int64
People_Out    int64
EventType     int64
dtype: object 

   Day  Date  Time  People_In  People_Out  EventType
0    4     0     6          9          26          0
1    4     0     7          8          13          0
2    4     0     8          6           3          0
3    4     0     9         25          12          0
4    4     0    10         12          12          0 



Unnamed: 0,Day,Date,Time,People_In,People_Out,EventType
count,176.0,176.0,176.0,176.0,176.0,176.0
mean,2.607955,12.039773,9.323864,13.647727,14.801136,0.517045
std,1.839175,7.744019,7.29796,9.398528,9.666449,0.77071
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,6.0,3.0,7.75,7.0,0.0
50%,3.0,9.0,8.0,12.0,13.0,0.0
75%,4.0,18.0,14.25,18.25,21.0,1.0
max,5.0,26.0,30.0,49.0,55.0,2.0


In [7]:
# Prep the data for SVM
X_bin = bin_data_dummies.values[:,:-1]
y_bin = bin_data_dummies.IsEvent.values

X_mult = mult_data_dummies.values[:,:-1]
y_mult = mult_data_dummies.EventType.values

print "Binary Data"
print "X: \n", X_bin[:5]
print "X shape: ", X_bin.shape
print "y: ", y_bin[:5]
print "y shape: ", y_bin.shape
print "\n"
print "Multiclass Data"
print "X: \n", X_mult[:5]
print "X shape: ", X_mult.shape
print "y: ", y_mult[:5]
print "y shape: ", y_mult.shape

Binary Data
X: 
[[3 0 0 0 0]
 [3 0 1 1 0]
 [3 0 2 0 0]
 [3 0 3 0 0]
 [3 0 4 0 0]]
X shape:  (5040, 5)
y:  [1 1 1 1 1]
y shape:  (5040,)


Multiclass Data
X: 
[[ 4  0  6  9 26]
 [ 4  0  7  8 13]
 [ 4  0  8  6  3]
 [ 4  0  9 25 12]
 [ 4  0 10 12 12]]
X shape:  (176, 5)
y:  [0 0 0 0 0]
y shape:  (176,)


In [8]:
params = {'kernel': 'rbf', 'probability': True, 'class_weight': 'balanced'} 
bin_data_classifier = SVC(**params)
bin_data_classifier.fit(X_bin, y_bin)
accuracy = sklearn.model_selection.cross_val_score(bin_data_classifier, X_bin, y_bin, scoring='accuracy', cv=3)
print "Accuracy of the classifier: " + str(round(100 * accuracy.mean(), 2)) + "%"

Accuracy of the classifier: 96.43%


In [9]:
params = {'kernel': 'rbf', 'probability': True, 'class_weight': 'balanced'} 
mult_data_classifier = SVC(**params)
mult_data_classifier.fit(X_mult, y_mult)
accuracy = sklearn.model_selection.cross_val_score(mult_data_classifier, X_mult, y_mult, scoring='accuracy', cv=3)
print "Accuracy of the classifier: " + str(round(100 * accuracy.mean(), 2)) + "%"

Accuracy of the classifier: 64.77%


In [12]:
# I feel like I am paying for using Pandas here, Ouch!
# A lot of work just for transforming ONE point

bin_input_data = pd.DataFrame(data=None, columns=bin_data.columns, index=bin_data.index)
bin_input_data.Day = 'Tuesday'
bin_input_data.Date = '10/04/05'
bin_input_data.Time = '12:30:00'
bin_input_data.People_In = 21
bin_input_data.People_Out = 23

mult_input_data = pd.DataFrame(data=None, columns=mult_data.columns, index=mult_data.index)
mult_input_data.Day = bin_input_data.Day
mult_input_data.Date = bin_input_data.Date
mult_input_data.Time = bin_input_data.Time
mult_input_data.People_In = bin_input_data.People_In
mult_input_data.People_Out = bin_input_data.People_Out

bin_input_data.Day = le_both_day.transform(bin_input_data.Day)
bin_input_data.Date = le_bin_date.transform(bin_input_data.Date)
bin_input_data.Time = le_both_time.transform(bin_input_data.Time)

mult_input_data.Day = le_both_day.transform(mult_input_data.Day)
mult_input_data.Date = le_mult_date.transform(mult_input_data.Date)
mult_input_data.Time = le_both_time.transform(mult_input_data.Time)

X_input_bin = bin_input_data.values[0,:-1].reshape(1, -1)

X_input_mult = mult_input_data.values[0,:-1].reshape(1, -1)

print X_input_bin

 [[4 72 8 21 23]]


In [13]:
output_class_bin = bin_data_classifier.predict(X_input_bin)
output_class_mult = mult_data_classifier.predict(X_input_mult)
print "Output class:", le_bin_isevent.inverse_transform(output_class_bin)[0]
print "Output class:", le_mult_eventtype.inverse_transform(output_class_mult)[0]

Output class: noevent
Output class: eventA
