# **DATA PREPROCESSING**

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
data=pd.read_csv('data.csv')
data

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,20.200000,33.000000,0.0,444.00,0.004834,0
1,22.166667,21.463333,518.0,898.00,0.003540,1
2,20.200000,21.200000,0.0,439.00,0.003097,0
3,19.700000,19.500000,0.0,452.00,0.002760,0
4,19.890000,26.600000,0.0,441.00,0.003816,0
...,...,...,...,...,...,...
10803,19.390000,30.745000,0.0,436.00,0.004279,0
10804,20.290000,33.200000,0.0,454.00,0.004891,0
10805,19.600000,19.500000,13.5,454.50,0.002743,0
10806,21.500000,24.100000,0.0,612.75,0.003818,0


## Prepare train and test dataset

In [6]:
#creating our X feature variables and y outcome variable
#X = data.drop('diagnosis',axis=1)
X = data.drop('Occupancy',axis=1)
y = data['Occupancy']

In [7]:
#splitting our data into training and testing data set
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

# **Random Forest Classification**

In [8]:
#Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier()
clf.fit(X_train,y_train)


RandomForestClassifier()

In [9]:
#To predict occupancy using the test's set features
clf_pred=clf.predict(X_test)
print(clf_pred)

[0 0 0 ... 0 0 0]


In [10]:
# Transfer the array of predicted popularity of test data into dataframe
clf_pred_df = pd.DataFrame(data=clf_pred, columns=['Occupancy'])
clf_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [11]:
RFprob_each = clf.predict_proba(X_test)
RFprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [12]:
pos_prob = []
for inner in RFprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.98]

In [13]:
X_test

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
5291,19.890000,26.2900,0.0,440.50,0.003771
8986,19.426667,26.8900,0.0,460.00,0.003748
1367,19.890000,19.0000,0.0,442.50,0.002721
4943,21.500000,19.4725,507.5,655.50,0.003081
9903,19.500000,26.3900,0.0,467.00,0.003695
...,...,...,...,...,...
7069,21.500000,20.7000,0.0,740.00,0.003277
2641,19.100000,31.4450,0.0,433.00,0.004298
5534,20.315000,33.0000,0.0,487.00,0.004869
1608,20.700000,25.0000,0.0,493.25,0.003770


In [14]:
X_testdf = X_test.copy()
X_testdf.insert(len(X_testdf.columns),'y_test Occupancy',y_test)
X_testdf=X_testdf.reset_index(drop=True)
X_testdf

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y_test Occupancy
0,19.890000,26.2900,0.0,440.50,0.003771,0
1,19.426667,26.8900,0.0,460.00,0.003748,0
2,19.890000,19.0000,0.0,442.50,0.002721,0
3,21.500000,19.4725,507.5,655.50,0.003081,1
4,19.500000,26.3900,0.0,467.00,0.003695,0
...,...,...,...,...,...,...
2157,21.500000,20.7000,0.0,740.00,0.003277,0
2158,19.100000,31.4450,0.0,433.00,0.004298,0
2159,20.315000,33.0000,0.0,487.00,0.004869,0
2160,20.700000,25.0000,0.0,493.25,0.003770,0


In [15]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'RForest', clf_pred_df['Occupancy'])
X_testdf

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y_test Occupancy,RForest
0,19.890000,26.2900,0.0,440.50,0.003771,0,0
1,19.426667,26.8900,0.0,460.00,0.003748,0,0
2,19.890000,19.0000,0.0,442.50,0.002721,0,0
3,21.500000,19.4725,507.5,655.50,0.003081,1,1
4,19.500000,26.3900,0.0,467.00,0.003695,0,0
...,...,...,...,...,...,...,...
2157,21.500000,20.7000,0.0,740.00,0.003277,0,0
2158,19.100000,31.4450,0.0,433.00,0.004298,0,0
2159,20.315000,33.0000,0.0,487.00,0.004869,0,0
2160,20.700000,25.0000,0.0,493.25,0.003770,0,0


# **Naive Bayes**

In [16]:
from sklearn.naive_bayes import GaussianNB

gaus = GaussianNB()
gaus.fit(X_train, y_train)

GaussianNB()

In [17]:
# To predict popularity using test set's features
NB_pred= gaus.predict(X_test)
print(NB_pred)

[0 0 0 ... 0 0 0]


In [18]:
# Transfer the array of predicted popularity of test data into dataframe
NB_pred_df = pd.DataFrame(data=NB_pred, columns=['Occupancy'])
NB_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [19]:
NBprob_each = gaus.predict_proba(X_test)
NBprob_each

array([[1.00000000e+00, 3.40573526e-13],
       [1.00000000e+00, 1.32983410e-13],
       [1.00000000e+00, 2.02730699e-13],
       ...,
       [1.00000000e+00, 2.13148885e-12],
       [1.00000000e+00, 2.09679370e-12],
       [1.00000000e+00, 2.27007486e-12]])

In [20]:
pos_prob = []
for inner in NBprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[3.405735260051437e-13,
 1.3298340963987958e-13,
 2.027306986286595e-13,
 0.9999981409042105,
 1.5377414229026886e-13,
 1.1697629437314746e-13,
 0.0020972231628014868,
 6.360807778718353e-05,
 2.288089355136886e-12,
 0.9999998506073672]

In [21]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'NBGaus', NB_pred_df['Occupancy'])
X_testdf

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y_test Occupancy,RForest,NBGaus
0,19.890000,26.2900,0.0,440.50,0.003771,0,0,0
1,19.426667,26.8900,0.0,460.00,0.003748,0,0,0
2,19.890000,19.0000,0.0,442.50,0.002721,0,0,0
3,21.500000,19.4725,507.5,655.50,0.003081,1,1,1
4,19.500000,26.3900,0.0,467.00,0.003695,0,0,0
...,...,...,...,...,...,...,...,...
2157,21.500000,20.7000,0.0,740.00,0.003277,0,0,0
2158,19.100000,31.4450,0.0,433.00,0.004298,0,0,0
2159,20.315000,33.0000,0.0,487.00,0.004869,0,0,0
2160,20.700000,25.0000,0.0,493.25,0.003770,0,0,0


# **KNN Classification**

In [22]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [23]:
# To predict popularity using test set's features
KNN_pred= knn.predict(X_test)
print(KNN_pred)

[0 0 0 ... 0 0 0]


In [24]:
# Transfer the array of predicted popularity of test data into dataframe
KNpredct_df = pd.DataFrame(data=KNN_pred, columns=['Occupancy'])
KNpredct_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [25]:
KNprob_each = knn.predict_proba(X_test)
KNprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [26]:
pos_prob = []
for inner in KNprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]

In [27]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'KNeigh', KNpredct_df['Occupancy'])
X_testdf


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y_test Occupancy,RForest,NBGaus,KNeigh
0,19.890000,26.2900,0.0,440.50,0.003771,0,0,0,0
1,19.426667,26.8900,0.0,460.00,0.003748,0,0,0,0
2,19.890000,19.0000,0.0,442.50,0.002721,0,0,0,0
3,21.500000,19.4725,507.5,655.50,0.003081,1,1,1,1
4,19.500000,26.3900,0.0,467.00,0.003695,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2157,21.500000,20.7000,0.0,740.00,0.003277,0,0,0,0
2158,19.100000,31.4450,0.0,433.00,0.004298,0,0,0,0
2159,20.315000,33.0000,0.0,487.00,0.004869,0,0,0,0
2160,20.700000,25.0000,0.0,493.25,0.003770,0,0,0,0


# **Decision Tree**

In [28]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [29]:
#To predict occupancy using the test's set features
model_pred=model.predict(X_test)
print(model_pred)

[0 0 0 ... 0 0 0]


In [30]:
# Transfer the array of predicted popularity of test data into dataframe
model_pred_df = pd.DataFrame(data=model_pred, columns=['Occupancy'])
model_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [31]:
DTprob_each = model.predict_proba(X_test)
DTprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [32]:
pos_prob = []
for inner in DTprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]

In [33]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'DecisionT', clf_pred_df['Occupancy'])
X_testdf


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y_test Occupancy,RForest,NBGaus,KNeigh,DecisionT
0,19.890000,26.2900,0.0,440.50,0.003771,0,0,0,0,0
1,19.426667,26.8900,0.0,460.00,0.003748,0,0,0,0,0
2,19.890000,19.0000,0.0,442.50,0.002721,0,0,0,0,0
3,21.500000,19.4725,507.5,655.50,0.003081,1,1,1,1,1
4,19.500000,26.3900,0.0,467.00,0.003695,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2157,21.500000,20.7000,0.0,740.00,0.003277,0,0,0,0,0
2158,19.100000,31.4450,0.0,433.00,0.004298,0,0,0,0,0
2159,20.315000,33.0000,0.0,487.00,0.004869,0,0,0,0,0
2160,20.700000,25.0000,0.0,493.25,0.003770,0,0,0,0,0


# **Evaluation for Classifiers**

In [34]:
#Import relevant metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [35]:
#RANDOM FOREST
acRF=accuracy_score(y_test,clf_pred)
fscoreRF=f1_score(y_test,clf_pred)
conmatrixRF = confusion_matrix(y_test,clf_pred)

#NAIVE BAYES
acNB=accuracy_score(y_test,NB_pred)
fscoreNB = f1_score(y_test,NB_pred)
conmatrixNB = confusion_matrix(y_test,NB_pred)

#K NEIGHBOURS
acKNN=accuracy_score(y_test,KNN_pred)
fscoreKNN=f1_score(y_test,KNN_pred)
conmatrixKNN = confusion_matrix(y_test,KNN_pred)

#DECISION TREE
acDTree = accuracy_score(y_test,model_pred)
fscoreDTree = f1_score(y_test,model_pred)
conmatrixDTree = confusion_matrix(y_test,model_pred)


In [36]:
df = pd.DataFrame(
    {
        "KNeigh": [acKNN, fscoreKNN, conmatrixKNN],
        "Naive Bayes": [acNB, fscoreNB, conmatrixNB],
        "Random Forest": [acRF, fscoreRF, conmatrixRF],
        "Decision Tree": [acDTree, fscoreDTree, conmatrixDTree]
    },
    columns=["KNeigh", "Naive Bayes", "Random Forest", "Decision Tree"],
    index = ["Accuracy", "Fscore", "Confusion Matrix"])
df



Unnamed: 0,KNeigh,Naive Bayes,Random Forest,Decision Tree
Accuracy,0.986586,0.96716,0.989362,0.987974
Fscore,0.972089,0.934683,0.977561,0.974708
Confusion Matrix,"[[1628, 22], [7, 505]]","[[1583, 67], [4, 508]]","[[1638, 12], [11, 501]]","[[1635, 15], [11, 501]]"
