# Classifier Prediction
Author: Andy Malinsky

In [179]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Bidirectional, GRU, Dense, Input, Dropout, Activation

# Setting seed for reproducibility
np.random.seed(24)

## Load and Prepare Data

In [2]:
# Load dataset
ro_data = pd.read_pickle("ro_data.pkl")
ro_data.head()

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,...,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count,Datetime,time_diff,is_large_gap,group
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,...,0.06,390,0.769231,0,0,1,2017-12-22 10:49:41,NaT,False,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,...,0.06,390,0.646154,0,0,1,2017-12-22 10:50:12,0 days 00:00:31,False,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,...,0.06,390,0.519231,0,0,1,2017-12-22 10:50:42,0 days 00:00:30,False,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,...,0.09,390,0.388462,0,0,1,2017-12-22 10:51:13,0 days 00:00:31,False,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,...,0.06,390,0.253846,0,0,1,2017-12-22 10:51:44,0 days 00:00:31,False,1


In [194]:
# Using the second half of group 1 as testing data because it has a good distribution of occupancy counts
group_1_data = ro_data[ro_data['group'] == 1]
split_index = len(group_1_data) // 2  # Midpoint of group 1

# Define training and testing sets
test_data = group_1_data.iloc[split_index:]  # Second half of group 1 as test set
train_data = ro_data[~ro_data.index.isin(test_data.index)]  # All other data as training set

# Features selected based on correlation analysis in eda-data-clean.ipynb notebook
filtered_columns = [
    'S1_Temp', 'S1_Light', 
    'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound',
    'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR',
]
target_column = ['Room_Occupancy_Count']
num_classes = len(ro_data['Room_Occupancy_Count'].unique())

X_train, y_train = train_data[filtered_columns], train_data[target_column]
X_test, y_test = test_data[filtered_columns], test_data[target_column]

# Reshape target variable to match LSTM output requirements
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Display the new training and test set sizes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7476, 10), (2653, 10), (7476, 1), (2653, 1))

In [200]:
# Create sequences for GRU model
def create_sequences(x_data, y_data, seq_length):
    seq_arrays = []
    seq_labs = []
    
    for start in range(0, len(x_data) - seq_length - 1):
        seq_arrays.append(x_data[start:start + seq_length])
        seq_labs.append(y_data[start + seq_length + 1][-1])

    # Convert to numpy arrays and floats for keras layers      
    seq_arrays = np.array(seq_arrays, dtype = object).astype(np.float32)
    seq_labs = np.array(seq_labs, dtype = object).astype(np.float32) 
    
    return seq_arrays, seq_labs

In [229]:
# Create sequences for model input
seq_length = 5 # define sequence length
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, seq_length)
X_test_seq, y_test_seq = create_sequences(X_train_scaled, y_train, seq_length)

y_train_seq = y_train_seq.reshape(-1, 1)
y_test_seq = y_test_seq.reshape(-1, 1)
X_train_seq.shape, y_train_seq.shape, X_test_seq.shape, y_test_seq.shape

((7470, 5, 10), (7470, 1), (7470, 5, 10), (7470, 1))

In [230]:
X_train_seq.shape, y_train_seq.shape

((7470, 5, 10), (7470, 1))

In [205]:
pd.DataFrame(y_test).value_counts()

0
0    1871
2     398
1     254
3     130
Name: count, dtype: int64

In [97]:
# View Occupancy Count Distributions
# print(y_train['Room_Occupancy_Count'].value_counts())
# print(y_test['Room_Occupancy_Count'].value_counts())

## Model Training

In [236]:
X_train_seq.shape

(7470, 5, 10)

In [232]:
# Define path to save model
model_path = 'BiGRU_model1.keras'

# Create a Bidirectional GRU model
model = Sequential()
model.add(Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2])))

model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(GRU(64)))
model.add(Dropout(0.2))

model.add(Dense(units=num_classes))
model.add(Activation(activation='softmax'))

# Compile the model
optimizer = keras.optimizers.Adam(learning_rate = 0.01)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])
model.summary()

In [233]:
# Train the model
history = model.fit(X_train_seq, y_train_seq, epochs=10)

Epoch 1/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.8043 - mse: 0.8043
Epoch 2/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.7873 - mse: 0.7873
Epoch 3/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.7736 - mse: 0.7736
Epoch 4/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.7408 - mse: 0.7408
Epoch 5/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.7837 - mse: 0.7837
Epoch 6/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.8156 - mse: 0.8156
Epoch 7/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.7933 - mse: 0.7933
Epoch 8/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.7580 - mse: 0.7580
Epoch 9/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms

In [234]:
print(history.history.keys())

dict_keys(['loss', 'mse'])


In [237]:
# Test the model
scores_test = model.evaluate(X_test_seq, y_test_seq, verbose=2)
print('\nMSE: {}'.format(scores_test[1]))

234/234 - 1s - 4ms/step - loss: 0.7826 - mse: 0.7826

MSE: 0.7826472520828247
