# Testing Algorithms on Room Occupancy Dataset
Dataset: https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# The data is spread over three CSV files; load the first one.
df1 = pd.read_csv(filepath_or_buffer='datasets/room1.csv')

In [6]:
# Second of the three CSV files.
df2 = pd.read_csv(filepath_or_buffer='datasets/room2.csv')

In [7]:
# Third of the three CSV files.
df3 = pd.read_csv(filepath_or_buffer='datasets/room3.csv')

In [15]:
# Stack the three per-file frames into one table with a fresh 0..n-1 index.
# pd.concat is used instead of chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in pandas 2.0, while concat works on
# every pandas version and builds the result in a single pass.
data = pd.concat([df1, df2, df3], ignore_index=True)

In [18]:
# Randomly discard a fixed fraction of the rows whose Occupancy is below 1.
# NOTE(review): the fraction was presumably chosen to balance the two classes — confirm.
# random_state pins the sample so a Restart & Run All reproduces the same rows.
unoccupied_rows = data.query('Occupancy < 1')
rows_to_drop = unoccupied_rows.sample(frac=.65876152832, random_state=0).index
data = data.drop(rows_to_drop)

In [27]:
# Drop rows with missing values BEFORE separating the label, so that the
# feature frame and the label series always contain exactly the same rows.
# (Capturing the label first and dropping rows from the features afterwards
# would leave the two with different lengths whenever a row is removed.)
data = data.dropna()

# Store the label ...
room_label = data['Occupancy']

# ... and remove it from the feature frame.
data = data.drop(columns=['Occupancy'])

In [28]:
# Preview the first five rows of the feature frame.
data.head(n=5)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio
0,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783
2,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779
3,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772
4,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757


In [32]:
# Turn the timestamp string into one integer by stripping every non-digit
# character, e.g. '2015-02-04 17:51:00' -> 20150204175100.
# - The pattern is a raw string and regex=True is explicit: since pandas 2.0
#   Series.str.replace treats the pattern as a literal by default, which would
#   leave the separators in place and make the integer cast fail.
# - 'int64' is explicit because the 14-digit value does not fit in the 32-bit
#   integer that plain `int` maps to on some platforms.
data['date'] = data['date'].str.replace(r'\D', '', regex=True).astype('int64')


In [34]:
# Standardise the continuous columns with a StandardScaler.
# NOTE(review): the scaler is fit on the full table, i.e. before the train/test
# split below, so test rows influence the scaling statistics — confirm this is intended.
continuous_cols = [
    'date',
    'Temperature',
    'Humidity',
    'Light',
    'CO2',
    'HumidityRatio',
]
features = data[continuous_cols]
scaler = StandardScaler()
scaler.fit(features.values)
data[continuous_cols] = scaler.transform(features.values)

In [38]:
# Short aliases used by the splitting cells: X = feature frame, Y = label series.
X, Y = data, room_label

In [39]:
# Create 5 independent train/test partitions (5000 training rows each; the
# remaining rows form the test set).
# Each split gets its own fixed random_state: the five trials stay different
# from each other, but a Restart & Run All reproduces exactly the same splits.
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y, train_size=5000, random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, Y, train_size=5000, random_state=1)
X3_train, X3_test, y3_train, y3_test = train_test_split(X, Y, train_size=5000, random_state=2)
X4_train, X4_test, y4_train, y4_test = train_test_split(X, Y, train_size=5000, random_state=3)
X5_train, X5_test, y5_train, y5_test = train_test_split(X, Y, train_size=5000, random_state=4)

In [40]:
# Collect the five splits into lists so the trials can be indexed 0..4.

# Feature frames
train_X_sets = [
    X1_train,
    X2_train,
    X3_train,
    X4_train,
    X5_train,
]
test_X_sets = [
    X1_test,
    X2_test,
    X3_test,
    X4_test,
    X5_test,
]

# Matching label series
train_y_sets = [
    y1_train,
    y2_train,
    y3_train,
    y4_train,
    y5_train,
]
test_y_sets = [
    y1_test,
    y2_test,
    y3_test,
    y4_test,
    y5_test,
]

### Logistic Regression

In [43]:
# Single-step pipeline; the grid search swaps the 'classifier' step and its
# hyperparameters for each candidate.
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

# Regularisation strengths tried by the penalised configurations:
# 11 log-spaced values from 1e-8 to 1e4.
c_grid = np.logspace(-8, 4, 11)

# Candidate configurations. They are split into three sub-grids because
# lbfgs cannot do l1, and penalty='none' expects no C value.
search_space = [
    # saga: l1 and l2 penalties
    {
        'classifier': [LogisticRegression(max_iter=5000)],
        'classifier__solver': ['saga'],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': c_grid,
    },
    # lbfgs: l2 penalty only
    {
        'classifier': [LogisticRegression(max_iter=5000)],
        'classifier__solver': ['lbfgs'],
        'classifier__penalty': ['l2'],
        'classifier__C': c_grid,
    },
    # unpenalised, both solvers
    {
        'classifier': [LogisticRegression(max_iter=5000)],
        'classifier__solver': ['lbfgs', 'saga'],
        'classifier__penalty': ['none'],
    },
]

# Grid search scored on three metrics at once. refit=False: the search only
# records cv_results_; the best model per metric is refit in a later cell.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=StratifiedKFold(n_splits=5),
    scoring=['accuracy', 'roc_auc', 'f1'],
    refit=False,
    verbose=0,
)

In [44]:
# Run the 5-fold cross-validated grid search once per trial split.
# GridSearchCV.fit returns the search object itself, so appending clf.fit(...)
# directly would store the SAME object five times and every list entry would
# expose only the last trial's cv_results_. Cloning the unfitted search gives
# each trial its own independent result.
logreg_models = []

for i in range(5):
    logreg_models.append(clone(clf).fit(train_X_sets[i], train_y_sets[i]))

In [45]:
# Per-trial prediction containers for logistic regression: one train/test
# pair for each metric the model was selected on (ROC AUC, accuracy, F1).
logreg_train_roc_pred, logreg_test_roc_pred = [], []
logreg_train_acc_pred, logreg_test_acc_pred = [], []
logreg_train_f1_pred, logreg_test_f1_pred = [], []

In [46]:
# For every trial: look up the hyperparameters that ranked first in
# cross-validation under each metric, refit the pipeline on the whole training
# split with them, and store its outputs on the training and test splits.
for i in range(5):
    cv_results = logreg_models[i].cv_results_
    proc = cv_results['params'][np.argmin(cv_results['rank_test_roc_auc'])]
    pacc = cv_results['params'][np.argmin(cv_results['rank_test_accuracy'])]
    pf1 = cv_results['params'][np.argmin(cv_results['rank_test_f1'])]

    # Model selected on ROC AUC.
    # ROC AUC is a ranking metric, so store the probability of the positive
    # class rather than hard 0/1 labels; scoring hard labels evaluates a
    # single threshold only and understates the AUC.
    # Column 1 of predict_proba is the second class — assumes a binary 0/1 label.
    pipe.set_params(**proc)
    pipe.fit(train_X_sets[i], train_y_sets[i])

    logreg_train_roc_pred.append(pipe.predict_proba(train_X_sets[i])[:, 1])
    logreg_test_roc_pred.append(pipe.predict_proba(test_X_sets[i])[:, 1])

    # Model selected on accuracy: hard class labels.
    pipe.set_params(**pacc)
    pipe.fit(train_X_sets[i], train_y_sets[i])

    logreg_train_acc_pred.append(pipe.predict(train_X_sets[i]))
    logreg_test_acc_pred.append(pipe.predict(test_X_sets[i]))

    # Model selected on F1: hard class labels.
    pipe.set_params(**pf1)
    pipe.fit(train_X_sets[i], train_y_sets[i])

    logreg_train_f1_pred.append(pipe.predict(train_X_sets[i]))
    logreg_test_f1_pred.append(pipe.predict(test_X_sets[i]))

In [47]:
# Per-trial score containers for logistic regression, one list per
# metric (ROC AUC, accuracy, F1) and per split (train, test).
logreg_train_roc_scores, logreg_train_acc_scores, logreg_train_f1_scores = [], [], []
logreg_test_roc_scores, logreg_test_acc_scores, logreg_test_f1_scores = [], [], []

In [48]:
# Score every trial: each metric is evaluated on the predictions of the
# model that was selected for that metric.
for i in range(5):
    y_train_i, y_test_i = train_y_sets[i], test_y_sets[i]

    # training split
    logreg_train_roc_scores.append(roc_auc_score(y_train_i, logreg_train_roc_pred[i]))
    logreg_train_acc_scores.append(accuracy_score(y_train_i, logreg_train_acc_pred[i]))
    logreg_train_f1_scores.append(f1_score(y_train_i, logreg_train_f1_pred[i]))

    # test split
    logreg_test_roc_scores.append(roc_auc_score(y_test_i, logreg_test_roc_pred[i]))
    logreg_test_acc_scores.append(accuracy_score(y_test_i, logreg_test_acc_pred[i]))
    logreg_test_f1_scores.append(f1_score(y_test_i, logreg_test_f1_pred[i]))

In [49]:
# Average each metric over the five trials (training split first, then test).
logreg_train_mean_roc, logreg_train_mean_acc, logreg_train_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (logreg_train_roc_scores, logreg_train_acc_scores, logreg_train_f1_scores)
]
logreg_test_mean_roc, logreg_test_mean_acc, logreg_test_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (logreg_test_roc_scores, logreg_test_acc_scores, logreg_test_f1_scores)
]

In [50]:
# Single summary number per split: the mean over all trials and all three metrics.
logreg_test_all_scores = [
    logreg_test_roc_scores,
    logreg_test_acc_scores,
    logreg_test_f1_scores,
]
logreg_train_all_scores = [
    logreg_train_roc_scores,
    logreg_train_acc_scores,
    logreg_train_f1_scores,
]
logreg_metric_mean_test = np.mean(logreg_test_all_scores)
logreg_metric_mean_train = np.mean(logreg_train_all_scores)

### Random Forest

In [53]:
# Base random forest (entropy split criterion) that the grid search tunes.
pipe2 = RandomForestClassifier(criterion='entropy')

# Hyperparameter grid: a fixed forest size and four candidate values for the
# number of features considered at each split.
params = [
    {
        'n_estimators': [1024],
        'max_features': [1, 2, 4, 6],
    }
]

# Grid search scored on three metrics at once. refit=False: the search only
# records cv_results_; the best model per metric is refit in a later cell.
clf2 = GridSearchCV(
    estimator=pipe2,
    param_grid=params,
    cv=StratifiedKFold(n_splits=5),
    scoring=['accuracy', 'roc_auc', 'f1'],
    refit=False,
    verbose=0,
)

In [58]:
# Run the 5-fold cross-validated grid search once per trial split.
# GridSearchCV.fit returns the search object itself, so appending clf2.fit(...)
# directly would store the SAME object five times and every list entry would
# expose only the last trial's cv_results_. Cloning the unfitted search gives
# each trial its own independent result.
rf_models = []

for i in range(5):
    rf_models.append(clone(clf2).fit(train_X_sets[i], train_y_sets[i]))

In [59]:
# Per-trial prediction containers for the random forest: one train/test
# pair for each metric the model was selected on (ROC AUC, accuracy, F1).
rf_train_roc_pred, rf_test_roc_pred = [], []
rf_train_acc_pred, rf_test_acc_pred = [], []
rf_train_f1_pred, rf_test_f1_pred = [], []

In [60]:
# For every trial: look up the hyperparameters that ranked first in
# cross-validation under each metric, refit the forest on the whole training
# split with them, and store its outputs on the training and test splits.
for i in range(5):
    cv_results = rf_models[i].cv_results_
    proc = cv_results['params'][np.argmin(cv_results['rank_test_roc_auc'])]
    pacc = cv_results['params'][np.argmin(cv_results['rank_test_accuracy'])]
    pf1 = cv_results['params'][np.argmin(cv_results['rank_test_f1'])]

    # Model selected on ROC AUC.
    # ROC AUC is a ranking metric, so store the probability of the positive
    # class rather than hard 0/1 labels; scoring hard labels evaluates a
    # single threshold only and understates the AUC.
    # Column 1 of predict_proba is the second class — assumes a binary 0/1 label.
    pipe2.set_params(**proc)
    pipe2.fit(train_X_sets[i], train_y_sets[i])

    rf_train_roc_pred.append(pipe2.predict_proba(train_X_sets[i])[:, 1])
    rf_test_roc_pred.append(pipe2.predict_proba(test_X_sets[i])[:, 1])

    # Model selected on accuracy: hard class labels.
    pipe2.set_params(**pacc)
    pipe2.fit(train_X_sets[i], train_y_sets[i])

    rf_train_acc_pred.append(pipe2.predict(train_X_sets[i]))
    rf_test_acc_pred.append(pipe2.predict(test_X_sets[i]))

    # Model selected on F1: hard class labels.
    pipe2.set_params(**pf1)
    pipe2.fit(train_X_sets[i], train_y_sets[i])

    rf_train_f1_pred.append(pipe2.predict(train_X_sets[i]))
    rf_test_f1_pred.append(pipe2.predict(test_X_sets[i]))

In [61]:
# Per-trial score containers for the random forest, one list per
# metric (ROC AUC, accuracy, F1) and per split (train, test).
rf_train_roc_scores, rf_train_acc_scores, rf_train_f1_scores = [], [], []
rf_test_roc_scores, rf_test_acc_scores, rf_test_f1_scores = [], [], []

In [62]:
# Score every trial: each metric is evaluated on the predictions of the
# model that was selected for that metric.
for i in range(5):
    y_train_i, y_test_i = train_y_sets[i], test_y_sets[i]

    # training split
    rf_train_roc_scores.append(roc_auc_score(y_train_i, rf_train_roc_pred[i]))
    rf_train_acc_scores.append(accuracy_score(y_train_i, rf_train_acc_pred[i]))
    rf_train_f1_scores.append(f1_score(y_train_i, rf_train_f1_pred[i]))

    # test split
    rf_test_roc_scores.append(roc_auc_score(y_test_i, rf_test_roc_pred[i]))
    rf_test_acc_scores.append(accuracy_score(y_test_i, rf_test_acc_pred[i]))
    rf_test_f1_scores.append(f1_score(y_test_i, rf_test_f1_pred[i]))

In [63]:
# Average each metric over the five trials (training split first, then test).
rf_train_mean_roc, rf_train_mean_acc, rf_train_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (rf_train_roc_scores, rf_train_acc_scores, rf_train_f1_scores)
]
rf_test_mean_roc, rf_test_mean_acc, rf_test_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (rf_test_roc_scores, rf_test_acc_scores, rf_test_f1_scores)
]

In [64]:
# Single summary number per split: the mean over all trials and all three metrics.
rf_test_all_scores = [
    rf_test_roc_scores,
    rf_test_acc_scores,
    rf_test_f1_scores,
]
rf_train_all_scores = [
    rf_train_roc_scores,
    rf_train_acc_scores,
    rf_train_f1_scores,
]
rf_metric_mean_test = np.mean(rf_test_all_scores)
rf_metric_mean_train = np.mean(rf_train_all_scores)

### kNN

In [65]:
# Base k-nearest-neighbours classifier that the grid search tunes.
pipe3 = KNeighborsClassifier()

# Candidate neighbourhood sizes between 1 and 500.
# NOTE(review): np.arange is called with a float step and dtype=int; the NumPy
# docs warn that this combination is not numerically stable — verify the
# resulting k values are the ones intended.
step = 500/26
k = np.arange(1, 500, step, dtype=int)

# Every k is combined with both weighting schemes and both distance metrics.
k_params = [
    {
        'n_neighbors': k,
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
]

# Grid search scored on three metrics at once. refit=False: the search only
# records cv_results_; the best model per metric is refit in a later cell.
clf3 = GridSearchCV(
    estimator=pipe3,
    param_grid=k_params,
    cv=StratifiedKFold(n_splits=5),
    scoring=['accuracy', 'roc_auc', 'f1'],
    refit=False,
    verbose=0,
)

In [66]:
# Run the 5-fold cross-validated grid search once per trial split.
# GridSearchCV.fit returns the search object itself, so appending clf3.fit(...)
# directly would store the SAME object five times and every list entry would
# expose only the last trial's cv_results_. Cloning the unfitted search gives
# each trial its own independent result.
knn_models = []

for i in range(5):
    knn_models.append(clone(clf3).fit(train_X_sets[i], train_y_sets[i]))

In [67]:
# Per-trial prediction containers for kNN: one train/test pair for each
# metric the model was selected on (ROC AUC, accuracy, F1).
knn_train_roc_pred, knn_test_roc_pred = [], []
knn_train_acc_pred, knn_test_acc_pred = [], []
knn_train_f1_pred, knn_test_f1_pred = [], []

In [68]:
# For every trial: look up the hyperparameters that ranked first in
# cross-validation under each metric, refit the kNN model on the whole training
# split with them, and store its outputs on the training and test splits.
for i in range(5):
    cv_results = knn_models[i].cv_results_
    proc = cv_results['params'][np.argmin(cv_results['rank_test_roc_auc'])]
    pacc = cv_results['params'][np.argmin(cv_results['rank_test_accuracy'])]
    pf1 = cv_results['params'][np.argmin(cv_results['rank_test_f1'])]

    # Model selected on ROC AUC.
    # ROC AUC is a ranking metric, so store the probability of the positive
    # class rather than hard 0/1 labels; scoring hard labels evaluates a
    # single threshold only and understates the AUC.
    # Column 1 of predict_proba is the second class — assumes a binary 0/1 label.
    pipe3.set_params(**proc)
    pipe3.fit(train_X_sets[i], train_y_sets[i])

    knn_train_roc_pred.append(pipe3.predict_proba(train_X_sets[i])[:, 1])
    knn_test_roc_pred.append(pipe3.predict_proba(test_X_sets[i])[:, 1])

    # Model selected on accuracy: hard class labels.
    pipe3.set_params(**pacc)
    pipe3.fit(train_X_sets[i], train_y_sets[i])

    knn_train_acc_pred.append(pipe3.predict(train_X_sets[i]))
    knn_test_acc_pred.append(pipe3.predict(test_X_sets[i]))

    # Model selected on F1: hard class labels.
    pipe3.set_params(**pf1)
    pipe3.fit(train_X_sets[i], train_y_sets[i])

    knn_train_f1_pred.append(pipe3.predict(train_X_sets[i]))
    knn_test_f1_pred.append(pipe3.predict(test_X_sets[i]))

In [69]:
# Per-trial score containers for kNN, one list per metric
# (ROC AUC, accuracy, F1) and per split (train, test).
knn_train_roc_scores, knn_train_acc_scores, knn_train_f1_scores = [], [], []
knn_test_roc_scores, knn_test_acc_scores, knn_test_f1_scores = [], [], []

In [70]:
# Score every trial: each metric is evaluated on the predictions of the
# model that was selected for that metric.
for i in range(5):
    y_train_i, y_test_i = train_y_sets[i], test_y_sets[i]

    # training split
    knn_train_roc_scores.append(roc_auc_score(y_train_i, knn_train_roc_pred[i]))
    knn_train_acc_scores.append(accuracy_score(y_train_i, knn_train_acc_pred[i]))
    knn_train_f1_scores.append(f1_score(y_train_i, knn_train_f1_pred[i]))

    # test split
    knn_test_roc_scores.append(roc_auc_score(y_test_i, knn_test_roc_pred[i]))
    knn_test_acc_scores.append(accuracy_score(y_test_i, knn_test_acc_pred[i]))
    knn_test_f1_scores.append(f1_score(y_test_i, knn_test_f1_pred[i]))

In [71]:
# Average each metric over the five trials (training split first, then test).
knn_train_mean_roc, knn_train_mean_acc, knn_train_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (knn_train_roc_scores, knn_train_acc_scores, knn_train_f1_scores)
]
knn_test_mean_roc, knn_test_mean_acc, knn_test_mean_f1 = [
    np.mean(trial_scores)
    for trial_scores in (knn_test_roc_scores, knn_test_acc_scores, knn_test_f1_scores)
]

In [72]:
# Single summary number per split: the mean over all trials and all three metrics.
knn_test_all_scores = [
    knn_test_roc_scores,
    knn_test_acc_scores,
    knn_test_f1_scores,
]
knn_train_all_scores = [
    knn_train_roc_scores,
    knn_train_acc_scores,
    knn_train_f1_scores,
]
knn_metric_mean_test = np.mean(knn_test_all_scores)
knn_metric_mean_train = np.mean(knn_train_all_scores)

In [128]:
# Show the cross-validation results stored in rf_models[2] as a table.
pd.DataFrame(data=rf_models[2].cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_roc_auc,rank_test_roc_auc,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,3.451501,0.106281,0.389609,0.06922,1,1024,"{'max_features': 1, 'n_estimators': 1024}",0.993,0.994,0.99,...,0.000888,2,0.992639,0.993724,0.989518,0.991561,0.988433,0.991175,0.001952,1
1,4.89433,0.337298,0.378635,0.065,2,1024,"{'max_features': 2, 'n_estimators': 1024}",0.995,0.992,0.99,...,0.000779,1,0.994742,0.991649,0.989518,0.990496,0.988433,0.990968,0.002166,2
2,6.513855,0.505217,0.320225,0.04185,4,1024,"{'max_features': 4, 'n_estimators': 1024}",0.996,0.989,0.99,...,0.001135,3,0.995789,0.988482,0.989518,0.990496,0.986301,0.990117,0.00316,4
3,8.246004,0.333807,0.298458,0.021756,6,1024,"{'max_features': 6, 'n_estimators': 1024}",0.996,0.989,0.99,...,0.000945,4,0.995789,0.988482,0.989518,0.990496,0.988433,0.990544,0.002731,3


In [138]:
# Hyperparameters that ranked first on F1 for the logistic-regression trial at index 1.
# The params list and the rank array are both read from the same trial; the
# lookup must not depend on whatever value a leftover loop variable happens to hold.
logreg_models[1].cv_results_['params'][np.argmin(logreg_models[1].cv_results_['rank_test_f1'])]

{'classifier': LogisticRegression(C=630.9573444801918, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=5000, multi_class='auto', n_jobs=None, penalty='l1',
                    random_state=None, solver='saga', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 630.9573444801918,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}