# **Reading files and shuffling data in the files**

## Reading Files

In [39]:
import pandas as pd
import numpy as np

In [40]:
#Read in txt file of training and testing record
train_df_occ = pd.read_csv("train_df_occ.csv")
test_df_occ = pd.read_csv("test_df_occ.csv")

test_df = pd.read_csv("test_df.csv")

#View briefly train dataset
train_df_occ.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,19.39,31.2,0.0,434.5,0.004343,0
1,20.1,18.79,0.0,439.0,0.002726,0
2,19.245,31.55,0.0,439.0,0.004352,0
3,20.29,32.9,0.0,482.0,0.004846,0
4,19.7,19.39,0.0,439.0,0.002744,0


In [41]:
test_df_occ.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,21.0,26.15,0.0,608.75,0.004019,0
1,20.29,22.6,0.0,435.2,0.003321,0
2,20.315,22.7675,0.0,434.5,0.003351,0
3,20.5,22.39,0.0,435.0,0.003333,0
4,20.79,24.79,433.0,672.666667,0.003759,1


In [42]:
train_df_occ.info()
train_df_occ.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8143 entries, 0 to 8142
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Temperature    8143 non-null   float64
 1   Humidity       8143 non-null   float64
 2   Light          8143 non-null   float64
 3   CO2            8143 non-null   float64
 4   HumidityRatio  8143 non-null   float64
 5   Occupancy      8143 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 381.8 KB


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,8143.0,8143.0,8143.0,8143.0,8143.0,8143.0
mean,20.619084,25.731507,119.519375,606.546243,0.003863,0.21233
std,1.016916,5.531211,194.755805,314.320877,0.000852,0.408982
min,19.0,16.745,0.0,412.75,0.002674,0.0
25%,19.7,20.2,0.0,439.0,0.003078,0.0
50%,20.39,26.2225,0.0,453.5,0.003801,0.0
75%,21.39,30.533333,256.375,638.833333,0.004352,0.0
max,23.18,39.1175,1546.333333,2028.5,0.006476,1.0


## Preparing Train and Test datasets

### train_df_occ = train dataset with occupancy
### train_df = train dataset WITHOUT occupancy

### test_df_occ = test dataset with occupancy
### test_df = test WITHOUT occupancy

In [43]:
y_test = test_df_occ['Occupancy']
y_test.head()

0    0
1    0
2    0
3    0
4    1
Name: Occupancy, dtype: int64

In [44]:
# Extract label class occupancy from training set
occlabel_df = train_df_occ['Occupancy'] #y_train
occlabel_df.head()

0    0
1    0
2    0
3    0
4    0
Name: Occupancy, dtype: int64

In [45]:
# Remove label class from training set
train_df = train_df_occ.drop(columns=['Occupancy'])
train_df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
0,19.39,31.2,0.0,434.5,0.004343
1,20.1,18.79,0.0,439.0,0.002726
2,19.245,31.55,0.0,439.0,0.004352
3,20.29,32.9,0.0,482.0,0.004846
4,19.7,19.39,0.0,439.0,0.002744


In [46]:
test_df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
0,21.0,26.15,0.0,608.75,0.004019
1,20.29,22.6,0.0,435.2,0.003321
2,20.315,22.7675,0.0,434.5,0.003351
3,20.5,22.39,0.0,435.0,0.003333
4,20.79,24.79,433.0,672.666667,0.003759


In [47]:
test_df

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
0,21.000,26.150000,0.000000,608.750000,0.004019
1,20.290,22.600000,0.000000,435.200000,0.003321
2,20.315,22.767500,0.000000,434.500000,0.003351
3,20.500,22.390000,0.000000,435.000000,0.003333
4,20.790,24.790000,433.000000,672.666667,0.003759
...,...,...,...,...,...
2660,21.000,25.390000,444.000000,718.000000,0.003901
2661,20.890,24.890000,0.000000,550.000000,0.003798
2662,22.600,25.525000,433.500000,882.500000,0.004328
2663,22.500,26.963333,641.333333,1033.250000,0.004546


In [48]:
# Transfer type of ddtaframe to numpy array for training and test sets and label set
X_train = np.array(train_df)
print(X_train[0:5])
X_test = np.array(test_df)
print(X_test[0:5])
y_train = np.array(occlabel_df)
print(y_train[0:5])

[[1.93900000e+01 3.12000000e+01 0.00000000e+00 4.34500000e+02
  4.34283617e-03]
 [2.01000000e+01 1.87900000e+01 0.00000000e+00 4.39000000e+02
  2.72622321e-03]
 [1.92450000e+01 3.15500000e+01 0.00000000e+00 4.39000000e+02
  4.35214376e-03]
 [2.02900000e+01 3.29000000e+01 0.00000000e+00 4.82000000e+02
  4.84624669e-03]
 [1.97000000e+01 1.93900000e+01 0.00000000e+00 4.39000000e+02
  2.74445413e-03]]
[[2.10000000e+01 2.61500000e+01 0.00000000e+00 6.08750000e+02
  4.01895738e-03]
 [2.02900000e+01 2.26000000e+01 0.00000000e+00 4.35200000e+02
  3.32093209e-03]
 [2.03150000e+01 2.27675000e+01 0.00000000e+00 4.34500000e+02
  3.35087850e-03]
 [2.05000000e+01 2.23900000e+01 0.00000000e+00 4.35000000e+02
  3.33308443e-03]
 [2.07900000e+01 2.47900000e+01 4.33000000e+02 6.72666667e+02
  3.75947872e-03]]
[0 0 0 0 0]


# **Random Forest Classification**

### X_train = train dataset WITHOUT occupancy

### X_test = test dataset WITHOUT occupancy
### y_train = 'Occupancy' label from train dataset

In [49]:
#Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier()
clf.fit(X_train,y_train)


RandomForestClassifier()

In [50]:
#To predict occupancy using the test's set features
clf_pred=clf.predict(X_test)
print(clf_pred)

[0 0 0 ... 1 1 1]


In [51]:
# Transfer the array of predicted popularity of test data into dataframe
clf_pred_df = pd.DataFrame(data=clf_pred, columns=['Occupancy'])
clf_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,1


In [52]:
RFprob_each = clf.predict_proba(X_test)
RFprob_each

array([[0.89, 0.11],
       [1.  , 0.  ],
       [1.  , 0.  ],
       ...,
       [0.43, 0.57],
       [0.02, 0.98],
       [0.08, 0.92]])

In [53]:
pos_prob = []
for inner in RFprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.11, 0.0, 0.0, 0.0, 0.97, 0.05, 0.0, 0.1, 0.0, 1.0]

In [54]:
# Insert result into test_wtpopl, check the prediction correctness
test_df_occ.insert(6, 'RForest', clf_pred_df['Occupancy'])

In [55]:
test_df_occ.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,RForest
0,21.0,26.15,0.0,608.75,0.004019,0,0
1,20.29,22.6,0.0,435.2,0.003321,0,0
2,20.315,22.7675,0.0,434.5,0.003351,0,0
3,20.5,22.39,0.0,435.0,0.003333,0,0
4,20.79,24.79,433.0,672.666667,0.003759,1,1


# **Naive Bayes**

In [56]:
from sklearn.naive_bayes import GaussianNB

gaus = GaussianNB()
gaus.fit(X_train, y_train)

GaussianNB()

In [57]:
# To predict popularity using test set's features
NB_pred= gaus.predict(X_test)
print(NB_pred)

[0 0 0 ... 1 1 1]


In [58]:
# Transfer the array of predicted popularity of test data into dataframe
NB_pred_df = pd.DataFrame(data=NB_pred, columns=['Occupancy'])
NB_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,1


In [59]:
NBprob_each = gaus.predict_proba(X_test)
NBprob_each

array([[1.00000000e+00, 3.18153644e-27],
       [1.00000000e+00, 1.35757883e-28],
       [1.00000000e+00, 1.48579287e-28],
       ...,
       [8.40105717e-07, 9.99999160e-01],
       [5.58773638e-10, 9.99999999e-01],
       [3.31017838e-11, 1.00000000e+00]])

In [60]:
pos_prob = []
for inner in NBprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[3.1815364386792845e-27,
 1.3575788346625578e-28,
 1.4857928730460305e-28,
 2.7402122165463186e-28,
 0.9996926369762421,
 1.9804236089662067e-25,
 1.2294353152168009e-28,
 2.2794406655064257e-27,
 2.8392492354582773e-28,
 0.9999999999940563]

In [61]:
# Insert result into test_wtpopl, check the prediction correctness
test_df_occ.insert(6, 'NBGaus', NB_pred_df['Occupancy'])

In [62]:
test_df_occ

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NBGaus,RForest
0,21.000,26.150000,0.000000,608.750000,0.004019,0,0,0
1,20.290,22.600000,0.000000,435.200000,0.003321,0,0,0
2,20.315,22.767500,0.000000,434.500000,0.003351,0,0,0
3,20.500,22.390000,0.000000,435.000000,0.003333,0,0,0
4,20.790,24.790000,433.000000,672.666667,0.003759,1,1,1
...,...,...,...,...,...,...,...,...
2660,21.000,25.390000,444.000000,718.000000,0.003901,0,1,1
2661,20.890,24.890000,0.000000,550.000000,0.003798,0,0,0
2662,22.600,25.525000,433.500000,882.500000,0.004328,1,1,1
2663,22.500,26.963333,641.333333,1033.250000,0.004546,1,1,1


# **KNN Clasification**


In [63]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [64]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [65]:
# To predict popularity using test set's features
KNN_pred= knn.predict(X_test)
print(KNN_pred)

[0 0 0 ... 1 1 1]


In [66]:
# Transfer the array of predicted popularity of test data into dataframe
KNpredct_df = pd.DataFrame(data=KNN_pred, columns=['Occupancy'])
KNpredct_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,1


In [67]:
KNprob_each = knn.predict_proba(X_test)
KNprob_each

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [0.        , 1.        ],
       [0.42857143, 0.57142857],
       [0.        , 1.        ]])

In [68]:
pos_prob = []
for inner in KNprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]

[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]

In [69]:
# Insert result into test_wtpopl, check the prediction correctness
test_df_occ.insert(len(test_df_occ.columns), 'KNeigh', KNpredct_df['Occupancy'])

In [70]:
test_df_occ

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NBGaus,RForest,KNeigh
0,21.000,26.150000,0.000000,608.750000,0.004019,0,0,0,0
1,20.290,22.600000,0.000000,435.200000,0.003321,0,0,0,0
2,20.315,22.767500,0.000000,434.500000,0.003351,0,0,0,0
3,20.500,22.390000,0.000000,435.000000,0.003333,0,0,0,0
4,20.790,24.790000,433.000000,672.666667,0.003759,1,1,1,1
...,...,...,...,...,...,...,...,...,...
2660,21.000,25.390000,444.000000,718.000000,0.003901,0,1,1,1
2661,20.890,24.890000,0.000000,550.000000,0.003798,0,0,0,0
2662,22.600,25.525000,433.500000,882.500000,0.004328,1,1,1,1
2663,22.500,26.963333,641.333333,1033.250000,0.004546,1,1,1,1


# **Decision Tree**

In [71]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [72]:
#To predict occupancy using the test's set features
model_pred=model.predict(X_test)
print(model_pred)

[0 0 0 ... 0 1 1]


In [73]:
# Transfer the array of predicted popularity of test data into dataframe
model_pred_df = pd.DataFrame(data=model_pred, columns=['Occupancy'])
model_pred_df.head(10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,1


In [74]:
DTprob_each = model.predict_proba(X_test)
DTprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [75]:
pos_prob = []
for inner in DTprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]

In [76]:
# Insert result into test_wtpopl, check the prediction correctness
test_df_occ.insert(len(test_df_occ.columns), 'DecisionT', clf_pred_df['Occupancy'])

In [77]:
test_df_occ

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NBGaus,RForest,KNeigh,DecisionT
0,21.000,26.150000,0.000000,608.750000,0.004019,0,0,0,0,0
1,20.290,22.600000,0.000000,435.200000,0.003321,0,0,0,0,0
2,20.315,22.767500,0.000000,434.500000,0.003351,0,0,0,0,0
3,20.500,22.390000,0.000000,435.000000,0.003333,0,0,0,0,0
4,20.790,24.790000,433.000000,672.666667,0.003759,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
2660,21.000,25.390000,444.000000,718.000000,0.003901,0,1,1,1,1
2661,20.890,24.890000,0.000000,550.000000,0.003798,0,0,0,0,0
2662,22.600,25.525000,433.500000,882.500000,0.004328,1,1,1,1,1
2663,22.500,26.963333,641.333333,1033.250000,0.004546,1,1,1,1,1


# **Evaluation of Accuracy Score for Classifiers**

## Random Forest Scores

In [78]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,clf_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

0.9500938086303939


## Naive Bayes Scores

In [79]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,NB_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

0.9774859287054409


## KNN Scores

In [80]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,KNN_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

0.9609756097560975


## Decision Tree Scores

In [81]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,model_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

0.9069418386491557
