# **Reading files and shuffling data in the files**

## Reading Files

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three CSV inputs: the train and test splits that include the
# 'Occupancy' label, and the test features without the label.
train_df_occ, test_df_occ, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_df_occ.csv", "test_df_occ.csv", "test_df.csv")
)

# Preview the first rows of the training split.
train_df_occ.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,19.39,25.89,0.0,471.0,0.003599,0
1,20.5,33.5,0.0,645.0,0.005,0
2,20.2,19.1,0.0,447.0,0.002789,0
3,21.29,33.145,464.0,1421.75,0.005195,1
4,20.5,31.163333,441.0,970.333333,0.004649,1


In [3]:
# Preview the first rows of the test split that includes 'Occupancy'.
test_df_occ.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,20.718,25.1,0.0,512.4,0.00379,0
1,21.0,23.29,0.0,509.333333,0.003577,0
2,21.6,28.39,0.0,895.0,0.00453,0
3,23.0,26.2,651.666667,1006.166667,0.004553,1
4,22.39,24.912,418.6,782.8,0.004169,1


In [4]:
# info() prints column dtypes and non-null counts; describe() is the cell's
# displayed value and gives per-column summary statistics.
train_df_occ.info()
train_df_occ.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8143 entries, 0 to 8142
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Temperature    8143 non-null   float64
 1   Humidity       8143 non-null   float64
 2   Light          8143 non-null   float64
 3   CO2            8143 non-null   float64
 4   HumidityRatio  8143 non-null   float64
 5   Occupancy      8143 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 381.8 KB


Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,8143.0,8143.0,8143.0,8143.0,8143.0,8143.0
mean,20.619084,25.731507,119.519375,606.546243,0.003863,0.21233
std,1.016916,5.531211,194.755805,314.320877,0.000852,0.408982
min,19.0,16.745,0.0,412.75,0.002674,0.0
25%,19.7,20.2,0.0,439.0,0.003078,0.0
50%,20.39,26.2225,0.0,453.5,0.003801,0.0
75%,21.39,30.533333,256.375,638.833333,0.004352,0.0
max,23.18,39.1175,1546.333333,2028.5,0.006476,1.0


## Preparing Train and Test datasets

### train_df_occ = train dataset with occupancy
### train_df = train dataset WITHOUT occupancy

### test_df_occ = test dataset with occupancy
### test_df = test dataset WITHOUT occupancy

In [5]:
# Extract label class occupancy from training set
occlabel_df = train_df_occ['Occupancy']
occlabel_df.head()

0    0
1    0
2    0
3    1
4    1
Name: Occupancy, dtype: int64

In [6]:
# Build the feature-only training frame by dropping the label column;
# train_df_occ itself is left unchanged.
train_df = train_df_occ.drop('Occupancy', axis='columns')
train_df.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
0,19.39,25.89,0.0,471.0,0.003599
1,20.5,33.5,0.0,645.0,0.005
2,20.2,19.1,0.0,447.0,0.002789
3,21.29,33.145,464.0,1421.75,0.005195
4,20.5,31.163333,441.0,970.333333,0.004649


In [7]:
# Preview the feature-only test frame loaded from test_df.csv.
test_df.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
0,20.718,25.1,0.0,512.4,0.00379
1,21.0,23.29,0.0,509.333333,0.003577
2,21.6,28.39,0.0,895.0,0.00453
3,23.0,26.2,651.666667,1006.166667,0.004553
4,22.39,24.912,418.6,782.8,0.004169


In [8]:
# Convert the training features, test features and training labels from
# pandas objects to NumPy arrays.
train_array = np.array(train_df)
test_array = np.array(test_df)
occlabel_array = np.array(occlabel_df)

# Print the first five entries of each array as a sanity check.
print(train_array[:5], test_array[:5], occlabel_array[:5], sep="\n")

[[1.93900000e+01 2.58900000e+01 0.00000000e+00 4.71000000e+02
  3.59944168e-03]
 [2.05000000e+01 3.35000000e+01 0.00000000e+00 6.45000000e+02
  5.00026877e-03]
 [2.02000000e+01 1.91000000e+01 0.00000000e+00 4.47000000e+02
  2.78867884e-03]
 [2.12900000e+01 3.31450000e+01 4.64000000e+02 1.42175000e+03
  5.19526883e-03]
 [2.05000000e+01 3.11633333e+01 4.41000000e+02 9.70333333e+02
  4.64888703e-03]]
[[2.07180000e+01 2.51000000e+01 0.00000000e+00 5.12400000e+02
  3.78983396e-03]
 [2.10000000e+01 2.32900000e+01 0.00000000e+00 5.09333333e+02
  3.57688025e-03]
 [2.16000000e+01 2.83900000e+01 0.00000000e+00 8.95000000e+02
  4.53045755e-03]
 [2.30000000e+01 2.62000000e+01 6.51666667e+02 1.00616667e+03
  4.55297916e-03]
 [2.23900000e+01 2.49120000e+01 4.18600000e+02 7.82800000e+02
  4.16930286e-03]]
[0 0 0 1 1]


# **Random Forest Classification**

### train_array = train dataset WITHOUT occupancy

### test_array = test dataset WITHOUT occupancy
### occlabel_array = 'Occupancy' label from train dataset

In [9]:
# Random Forest classification: fit on the training features and labels.
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap sampling and random feature
# selection), so fix random_state to make predictions, probabilities and
# accuracy reproducible across Restart Kernel -> Run All.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_array, occlabel_array)


RandomForestClassifier()

In [10]:
# Predict occupancy for the test set from its features.
clf_pred = clf.predict(X=test_array)
print(clf_pred)

[0 0 0 ... 0 1 1]


In [11]:
# Wrap the predicted occupancy array in a single-column DataFrame.
clf_pred_df = pd.DataFrame({'Occupancy': clf_pred})
clf_pred_df.head(n=10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,0


In [12]:
# Per-sample class probabilities from the Random Forest: one row per test
# sample, one column per class (column order follows clf.classes_).
RFprob_each = clf.predict_proba(X=test_array)
RFprob_each

array([[0.93, 0.07],
       [0.95, 0.05],
       [0.97, 0.03],
       ...,
       [0.99, 0.01],
       [0.14, 0.86],
       [0.02, 0.98]])

In [13]:
# Collect the value at index 1 of every probability row into a plain list,
# then show the first ten.
pos_prob = [row[1] for row in RFprob_each]
pos_prob[:10]


[0.07, 0.05, 0.03, 0.08, 0.67, 0.97, 0.04, 1.0, 0.05, 0.01]

In [14]:
# Attach the Random Forest predictions to test_df_occ, directly after the true
# 'Occupancy' column, so predictions can be compared with the ground truth.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
if 'RForest' in test_df_occ.columns:
    test_df_occ['RForest'] = clf_pred_df['Occupancy']
else:
    test_df_occ.insert(
        test_df_occ.columns.get_loc('Occupancy') + 1,
        'RForest',
        clf_pred_df['Occupancy'],
    )

In [15]:
# Check that the 'RForest' prediction column now sits next to 'Occupancy'.
test_df_occ.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,RForest
0,20.718,25.1,0.0,512.4,0.00379,0,0
1,21.0,23.29,0.0,509.333333,0.003577,0,0
2,21.6,28.39,0.0,895.0,0.00453,0,0
3,23.0,26.2,651.666667,1006.166667,0.004553,1,0
4,22.39,24.912,418.6,782.8,0.004169,1,1


# **Naive Bayes**

In [16]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the same training arrays used for
# the Random Forest.
gaus = GaussianNB()
gaus.fit(X=train_array, y=occlabel_array)

GaussianNB()

In [17]:
# Predict occupancy for the test set from its features.
NB_pred = gaus.predict(X=test_array)
print(NB_pred)

[0 0 0 ... 0 1 1]


In [18]:
# Wrap the predicted occupancy array in a single-column DataFrame.
NB_pred_df = pd.DataFrame({'Occupancy': NB_pred})
NB_pred_df.head(n=10)

Unnamed: 0,Occupancy
0,0
1,0
2,0
3,1
4,1
5,1
6,0
7,1
8,0
9,0


In [19]:
# Per-sample class probabilities from Gaussian Naive Bayes: one row per test
# sample, one column per class (column order follows gaus.classes_).
NBprob_each = gaus.predict_proba(X=test_array)
NBprob_each

array([[1.00000000e+00, 7.56127438e-28],
       [1.00000000e+00, 1.50335414e-27],
       [1.00000000e+00, 5.78086413e-25],
       ...,
       [1.00000000e+00, 6.11733779e-26],
       [2.86429338e-04, 9.99713571e-01],
       [3.68655426e-06, 9.99996313e-01]])

In [20]:
# Collect the value at index 1 of every probability row into a plain list,
# then show the first ten. This rebinds pos_prob, replacing any earlier value.
pos_prob = [row[1] for row in NBprob_each]
pos_prob[:10]


[7.5612743766558035e-28,
 1.5033541399112353e-27,
 5.780864125403542e-25,
 0.9999999986607833,
 0.9999867676628953,
 0.9999999999997851,
 3.760977597995607e-27,
 0.9999999994984936,
 9.761362252782035e-28,
 1.0947216864358288e-27]

In [21]:
# Attach the Naive Bayes predictions to test_df_occ, directly after the true
# 'Occupancy' column, so predictions can be compared with the ground truth.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
if 'NBGaus' in test_df_occ.columns:
    test_df_occ['NBGaus'] = NB_pred_df['Occupancy']
else:
    test_df_occ.insert(
        test_df_occ.columns.get_loc('Occupancy') + 1,
        'NBGaus',
        NB_pred_df['Occupancy'],
    )

In [22]:
# Check that the 'NBGaus' prediction column now sits next to 'Occupancy'.
test_df_occ.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NBGaus,RForest
0,20.718,25.1,0.0,512.4,0.00379,0,0,0
1,21.0,23.29,0.0,509.333333,0.003577,0,0,0
2,21.6,28.39,0.0,895.0,0.00453,0,0,0
3,23.0,26.2,651.666667,1006.166667,0.004553,1,1,0
4,22.39,24.912,418.6,782.8,0.004169,1,1,1


# **Evaluation of Accuracy Score for Classifiers**

In [23]:
from sklearn.metrics import accuracy_score

# Ground-truth occupancy labels of the test set.
y_true = test_df_occ.loc[:, 'Occupancy']
y_true.head(5)

0    0
1    0
2    0
3    1
4    1
Name: Occupancy, dtype: int64

In [24]:
# Predictions to evaluate are read from the classifier's column in
# test_df_occ. Only the Gaussian Naive Bayes column is selected here; use
# 'RForest' instead to score the Random Forest.
y_pred = test_df_occ.loc[:, 'NBGaus']
y_pred.head(5)

0    0
1    0
2    0
3    1
4    1
Name: NBGaus, dtype: int64

In [25]:
# Fraction of test samples whose prediction matches the true label.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.9774859287054409

In [26]:
# With normalize=False the score is the number of correctly classified
# samples rather than a fraction.
accuracy_score(y_true=y_true, y_pred=y_pred, normalize=False)

2605

In [27]:
# NOTE(review): this looks like a leftover debugging cell that is not part of
# the analysis; consider deleting it.
print("hello")

hello
