In [None]:
# Install gcsfs, which a later cell uses to open the dataset from Google Cloud Storage.
# NOTE(review): the version is not pinned; consider pinning it for reproducible runs.
!pip install gcsfs

In [16]:
# scikit-learn classifier implementations
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
import pandas as pd
import gcsfs

In [17]:
# Read the occupancy CSV from Google Cloud Storage through gcsfs.
occupancy_csv_path = 'omina-test-set/occupancy-data/occupancy_data.csv'
fs = gcsfs.GCSFileSystem(project='omina-gcp-resource')
with fs.open(occupancy_csv_path) as f:
    # The file handle is closed when the block exits; the parsed frame stays in `df`.
    df = pd.read_csv(f)

In [21]:
# Alternative loader for the same CSV object, using the Google Datalab storage API.
# NOTE(review): this rebinds `df`, replacing the frame read through gcsfs in the earlier cell.
import google.datalab.storage as storage
import pandas as pd
from io import BytesIO

# Handle to the bucket and to the CSV object stored inside it.
mybucket = storage.Bucket('omina-test-set')
data_csv = mybucket.object('occupancy-data/occupancy_data.csv')

# The %gcs line magic reads the object at `uri` and stores its content in the variable `data`.
uri = data_csv.uri
%gcs read --object $uri --variable data

# Wrap the downloaded content in a file-like object so pandas can parse it as CSV.
df = pd.read_csv(BytesIO(data))
# df.head()

In [22]:
# Parse the timestamps and index the frame by them so rows can be sliced by date string.
df['date'] = pd.to_datetime(df['date'])
df.index = df['date']
# Drop the leftover 'Unnamed: 0' column (presumably the row index saved with the CSV).
# errors='ignore' keeps this cell re-runnable: the original `del` raised KeyError
# on a second execution because the column was already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [23]:
# Order rows chronologically. The index holds the same timestamps as the 'date'
# column, and sorting on the index avoids the "'date' is both an index level and a
# column label" ambiguity that sort_values(by='date') raises on newer pandas.
df = df.sort_index()
# Daily grouping of the rows, keyed on the 'date' column.
df_group = df.groupby(pd.Grouper(key = 'date', freq = 'D'))
# NSM: whole seconds elapsed since midnight, as a float. Computed with vectorised
# datetime accessors instead of a per-row Python lambda; sub-second parts are
# ignored, exactly as in the original timestamp subtraction.
df['NSM'] = (
    df['date'].dt.hour * 3600 + df['date'].dt.minute * 60 + df['date'].dt.second
).astype(float)
# WS: 1 when the timestamp falls on Saturday or Sunday (dayofweek 5 or 6), else 0.
df['WS'] = (pd.DatetimeIndex(df.index).dayofweek >= 5).astype(int)
df.head()

Unnamed: 0_level_0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NSM,WS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-02 14:19:00,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1,51540.0,0
2015-02-02 14:19:59,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1,51599.0,0
2015-02-02 14:21:00,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1,51660.0,0
2015-02-02 14:22:00,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1,51720.0,0
2015-02-02 14:23:00,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1,51780.0,0


In [111]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Plot every column for 3-4 February 2015, one subplot per column.
df.loc['2015-02-03':'2015-02-04'].plot(
    subplots=True,
    figsize=(15, 8),
    linewidth=5,
    fontsize=20,
)
plt.legend(loc='best')

In [113]:
# Pairwise correlation of the numeric columns only. The selection is explicit
# because newer pandas no longer silently drops non-numeric columns (such as the
# datetime 'date' column) inside DataFrame.corr().
corr = df.select_dtypes(include=['number']).corr()

In [None]:
# Annotated heatmap of the correlation matrix. `annot` must be the boolean True:
# the string "True" is not a boolean, and seaborn versions that accept array-like
# annotations treat it as annotation data rather than as a flag.
sns.heatmap(corr, annot=True, cmap="YlGnBu")

In [None]:
# Grid of pairwise scatter plots for the columns of the frame.
sns.pairplot(data=df)

In [117]:
# True if any cell in the frame is missing.
missing_mask = df.isnull().values
missing_mask.any()

False

In [24]:
# Separate the target column from all remaining columns.
X = df.drop(["Occupancy"], axis=1)
y = df.loc[:, "Occupancy"]

In [27]:
# Split by timestamp range (both endpoints inclusive) on the datetime index.
# Training window.
df_train = df.loc['2015-02-04 17:51:00':'2015-02-10 09:33:00']

# First test window: the period before the training window.
df_test = df.loc['2015-02-02 14:19:00':'2015-02-04 10:43:00']

# Second test window: the period after the training window.
df_test1 = df.loc['2015-02-11 14:48:00':'2015-02-18 09:19:00']

In [28]:
# Model inputs: the sensor readings plus the two engineered time features.
feature_names = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'NSM', 'WS']

# Build one (features, target) pair per split.
X_train, y_train = df_train[feature_names], df_train['Occupancy']
X_test, y_test = df_test[feature_names], df_test['Occupancy']
X_test1, y_test1 = df_test1[feature_names], df_test1['Occupancy']

Training each classifier and calculating its mean accuracy on the training data

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [36]:
# Fixed seed so the randomised estimators (random forest, decision tree) give the
# same fitted model, and therefore the same metrics, on every run.
RANDOM_STATE = 42


def _fit_and_score_train(model, features, labels):
    """Fit ``model`` and return its accuracy on the same data, in percent.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
    features : feature matrix used for both fitting and scoring
    labels : target values aligned with ``features``

    Returns
    -------
    float
        ``model.score(features, labels) * 100`` rounded to two decimals.
        This is a training-set score, not an estimate of generalisation.
    """
    model.fit(features, labels)
    return round(model.score(features, labels) * 100, 2)


# Random Forest in train data #
random_forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
acc_random_forest = _fit_and_score_train(random_forest, X_train, y_train)
print("Mean Accuracy for Random Forest in train data: ",acc_random_forest, "%")

# Logistic Regression
logreg = LogisticRegression()
acc_log = _fit_and_score_train(logreg, X_train, y_train)
print("Mean Accuracy for Logistic Regression in train data: ",acc_log, "%")

#Decision Tree
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
acc_decision_tree = _fit_and_score_train(decision_tree, X_train, y_train)
print("Mean Accuracy for Decision Tree in train data: ",acc_decision_tree, "%")

# Gaussian Naive Bayes
gaussian = GaussianNB()
acc_gaussian = _fit_and_score_train(gaussian, X_train, y_train)
print("Mean Accuracy for Gaussian Naive Bayes in train data: ",acc_gaussian, "%")


('Mean Accuracy for Random Forest in train data: ', 100.0, '%')
('Mean Accuracy for Logistic Regression in train data: ', 98.32, '%')
('Mean Accuracy for Decision Tree in train data: ', 100.0, '%')
('Mean Accuracy for Gaussian Naive Bayes in train data: ', 97.89, '%')


In [45]:
#Random Forest prediction
y_pred_random_forest = random_forest.predict(X_test)
#Decision Tree Prediction
y_pred_decision_tree = decision_tree.predict(X_test)
# Logistic Regression Prediction
y_pred_logistic_regression = logreg.predict(X_test)

In [46]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_random_forest)
print('Confusion Matrix for Random Forest: ')
print(cm)

Confusion Matrix for Random Forest: 
[[1644   49]
 [  58  914]]


In [47]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_decision_tree)
print('Confusion Matrix for Decision Tree: ')
print(cm)

Confusion Matrix for Decision Tree: 
[[1642   51]
 [  93  879]]


In [49]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_logistic_regression)
print('Confusion Matrix for Logistic Regression: ')
print(cm)

Confusion Matrix for Logistic Regression: 
[[1638   55]
 [   3  969]]


In [55]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
# F1 score on the first test window, treating class 1 as the positive class.
#Calculate F1 Score for Random Forest
f1_score_random_forest = f1_score(y_test, y_pred_random_forest, pos_label=1)
print("F1 Score for Random Forest : ", f1_score_random_forest)
#Calculate F1 Score for Decision Tree
f1_score_decision_tree = f1_score(y_test, y_pred_decision_tree, pos_label=1)
print("\n")
print("F1 Score for Decision Tree : ", f1_score_decision_tree)
print("\n")
#Calculate F1 Score for Logistic Regression
f1_score_logistic_regression = f1_score(y_test, y_pred_logistic_regression, pos_label=1)
# Label fixed: this line previously printed the logistic regression score under
# the heading "Decision Tree".
print("F1 Score for Logistic Regression : ", f1_score_logistic_regression)

('F1 Score for Random Forest : ', 0.944702842377261)


('F1 Score for Decision Tree : ', 0.9242902208201893)


('F1 Score for Decision Tree : ', 0.9709418837675351)
