In [None]:
!pip install gcsfs

In [2]:
# For scientific calculations
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
import pandas as pd
import gcsfs

In [4]:
fs = gcsfs.GCSFileSystem(project='omina-gcp-resource')
with fs.open('omina-test-set/occupancy-data/occupancy_data.csv') as f:
    df = pd.read_csv(f)

In [5]:
import google.datalab.storage as storage
import pandas as pd
from io import BytesIO

mybucket = storage.Bucket('omina-test-set')
data_csv = mybucket.object('occupancy-data/occupancy_data.csv')

uri = data_csv.uri
%gcs read --object $uri --variable data

df = pd.read_csv(BytesIO(data))
# df.head()

In [6]:
df['date'] = pd.to_datetime(df['date'])
df.index = df['date']
del df['Unnamed: 0']

In [None]:
df = df.sort_values(by='date')
df_group = df.groupby(pd.Grouper(key = 'date', freq = 'D'))
df['NSM'] = df.date.apply(lambda x: x - x.replace(hour=0, minute=0, second=0)).dt.total_seconds()
df['WS'] = ((pd.DatetimeIndex(df.index).dayofweek) // 5 == 1).astype(int)
# df.head()

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
df['2015-02-03' : '2015-02-04'].plot(subplots=True, figsize=(15,8), linewidth=5, fontsize=20)
plt.legend(loc='best')

In [10]:
corr = df.corr()

In [None]:
sns.heatmap(corr, annot="True", cmap="YlGnBu")

In [None]:
sns.pairplot(df)

In [13]:
df.isnull().values.any()

False

In [14]:
y = df["Occupancy"]
X = df.drop("Occupancy", axis=1)

In [15]:
df_train = df['2015-02-04 17:51:00' : '2015-02-10 09:33:00']

df_test = df['2015-02-02 14:19:00' : '2015-02-04 10:43:00']

df_test1 = df['2015-02-11 14:48:00' : '2015-02-18 09:19:00']

In [16]:
feature_names = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'NSM', 'WS']
X_train = df_train[feature_names]
y_train = df_train['Occupancy']
X_test = df_test[feature_names]
y_test = df_test['Occupancy']
X_test1 = df_test1[feature_names]
y_test1 = df_test1['Occupancy']

Training the model with classifiers and calculating the mean accuracy

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [18]:
# Random Forest in train data #  
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print("Mean Accuracy for Random Forest in train data: ",round(acc_random_forest,2), "%")

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
print("Mean Accuracy for Logistic Regression in train data: ",round(acc_log,2), "%")

#Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print("Mean Accuracy for Decision Tree in train data: ",round(acc_decision_tree,2), "%")

# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
print("Mean Accuracy for Gaussian Naive Bayes in train data: ",round(acc_gaussian,2), "%")


('Mean Accuracy for Random Forest in train data: ', 100.0, '%')
('Mean Accuracy for Logistic Regression in train data: ', 98.32, '%')
('Mean Accuracy for Decision Tree in train data: ', 100.0, '%')
('Mean Accuracy for Gaussian Naive Bayes in train data: ', 97.89, '%')


In [25]:
#Random Forest prediction
y_pred_random_forest = random_forest.predict(X_test)
#Decision Tree Prediction
y_pred_decision_tree = decision_tree.predict(X_test)
# Logistic Regression Prediction
y_pred_logistic_regression = logreg.predict(X_test)
# Gaussian Naive Bayes Prediction
y_pred_gausian_naive_bayes = gaussian.predict(X_test)


In [26]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_gausian_naive_bayes)
print('Confusion Matrix for Gaussian Naive Bayes: ')
print(cm)

Confusion Matrix for Gaussian Naive Bayes: 
[[1638   55]
 [   5  967]]


In [21]:
cm = confusion_matrix(y_test, y_pred_decision_tree)
print('Confusion Matrix for Decision Tree: ')
print(cm)

Confusion Matrix for Decision Tree: 
[[1642   51]
 [ 150  822]]


In [22]:
cm = confusion_matrix(y_test, y_pred_logistic_regression)
print('Confusion Matrix for Logistic Regression: ')
print(cm)

Confusion Matrix for Logistic Regression: 
[[1638   55]
 [   3  969]]


In [23]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
#Calculate F1 Score for Random Forest
f1_score_random_forest = f1_score(y_test, y_pred_random_forest, pos_label=1)

print("F1 Score for Random Forest : ", f1_score_random_forest)
#Calculate F1 Score for Decision Tree
f1_score_decision_tree = f1_score(y_test, y_pred_decision_tree, pos_label=1)
print("\n")
print("F1 Score for Decision Tree : ", f1_score_decision_tree)
print("\n")
#Calculate F1 Score for Logistic Regression
f1_score_logistic_regression = f1_score(y_test, y_pred_logistic_regression, pos_label=1)
print("F1 Score for Decision Tree : ", f1_score_logistic_regression)
print("\n")
#Calculate F1 Score for Gaussian 
f1_score_gaussian = f1_score(y_test, y_pred_gausian_naive_bayes, pos_label=1)
print("F1 Score for Gaussian Naive Bays : ", f1_score_gaussian)

('F1 Score for Random Forest : ', 0.9414204250907204)


('F1 Score for Decision Tree : ', 0.8910569105691056)


('F1 Score for Decision Tree : ', 0.9709418837675351)


('F1 Score for Gaussian Naive Bays : ', 0.9709418837675351)


In [29]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_test, y_pred_gausian_naive_bayes, target_names=target_names))

             precision    recall  f1-score   support

    class 0       1.00      0.97      0.98      1693
    class 1       0.95      0.99      0.97       972

avg / total       0.98      0.98      0.98      2665



  .format(len(labels), len(target_names))


In [33]:
# Implement K-Fold cross validation for Random Classifier and Decision Tree
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

labels = df["Occupancy"]
train = df.drop("Occupancy", axis=1)
kf=KFold(n_splits=10, shuffle=True, random_state=False)
  
for train_id, test_id in kf.split(train,labels):
    print("train_id id :", train_id)
    XX_train, XX_test = train.values[train_id], train.values[test_id]
    yy_train, yy_test = labels.values[train_id], labels.values[test_id]
    gaussian.fit(XX_train,yy_train)

('train_id id :', array([    0,     1,     2, ..., 20557, 20558, 20559]))


TypeError: float() argument must be a string or a number