In [None]:
import pandas as pd
import numpy as np
from hmmlearn import hmm

# Load the raw occupancy readings (one row per room per 15-minute slot,
# judging by the columns used below: room_id, start_timestamp,
# number_of_astronauts).
data = pd.read_csv('output_dataset.csv')

# Convert the timestamp to datetime format
data['start_timestamp'] = pd.to_datetime(data['start_timestamp'])

# Derive the time-of-day features once, on the full frame, so every split
# inherits them. Doing this on the slices afterwards assigns into views of
# `data` and triggers pandas' SettingWithCopyWarning.
data['hour'] = data['start_timestamp'].dt.hour
data['minute'] = data['start_timestamp'].dt.minute
# Numerical time-of-day encoding: minutes elapsed since midnight (0..1439).
data['timestamp'] = (data['hour'] * 60) + data['minute']

# Chronological split boundaries (upper bounds are exclusive).
TRAIN_END = pd.Timestamp('2022-09-01')
RETRAIN_END = pd.Timestamp('2022-10-01')
TEST_END = pd.Timestamp('2022-12-02')

# Split the data into training, retrain, and testing sets. Each split is an
# explicit copy so later cells can modify it without touching `data`.
train_data = data.loc[data['start_timestamp'] < TRAIN_END].copy()
retrain_data = data.loc[
    (data['start_timestamp'] >= TRAIN_END) & (data['start_timestamp'] < RETRAIN_END)
].copy()
test_data = data.loc[
    (data['start_timestamp'] >= RETRAIN_END) & (data['start_timestamp'] < TEST_END)
].copy()


In [24]:
# Sanity check: dump the training split as plain text to confirm that the
# derived 'hour', 'minute' and 'timestamp' columns are present.
print(train_data)

        room_id     start_timestamp  number_of_astronauts  hour  minute  \
0             5 2022-01-01 00:00:00                    18     0       0   
1             5 2022-01-01 00:15:00                    18     0      15   
2             5 2022-01-01 00:30:00                    18     0      30   
3             5 2022-01-01 00:45:00                    18     0      45   
4             5 2022-01-01 01:00:00                    18     1       0   
...         ...                 ...                   ...   ...     ...   
163103        3 2022-08-31 22:45:00                     0    22      45   
163104        3 2022-08-31 23:00:00                     0    23       0   
163105        3 2022-08-31 23:15:00                     0    23      15   
163106        3 2022-08-31 23:30:00                     0    23      30   
163107        3 2022-08-31 23:45:00                     0    23      45   

        timestamp  
0               0  
1              15  
2              30  
3              45  

In [14]:
# Train the HMM model
n_components = 10
# A fixed seed makes the random parameter initialisation, and therefore the
# fitted model, reproducible across kernel restarts.
model = hmm.GaussianHMM(n_components=n_components, random_state=42)

# The training frame stacks several rooms on top of each other. Order it by
# room and time and tell hmmlearn where each room's sequence ends; otherwise
# the jump from the last reading of one room to the first reading of the next
# is learned as if it were a real state transition.
train_sorted = train_data.sort_values(['room_id', 'start_timestamp'])
train_lengths = train_sorted.groupby('room_id', sort=False).size().to_numpy()

# Observations are 2-D: (minutes since midnight, number of astronauts).
model.fit(
    train_sorted[['timestamp', 'number_of_astronauts']].to_numpy(dtype=float),
    lengths=train_lengths,
)


GaussianHMM(n_components=10)

In [None]:
# Retrain the model on the training and retraining data
# Both inputs already carry the 'timestamp' column, so it is not recomputed.
full_train_data = pd.concat([train_data, retrain_data])

# Keep each room as its own observation sequence (sorted by time) so that
# room boundaries are not treated as state transitions.
full_train_sorted = full_train_data.sort_values(['room_id', 'start_timestamp'])
full_train_lengths = full_train_sorted.groupby('room_id', sort=False).size().to_numpy()

# NOTE(review): with hmmlearn's default init_params, fit() re-initialises the
# parameters, so this looks like a fresh fit on the larger dataset rather than
# an incremental update of the earlier model; confirm that is intended.
model.fit(
    full_train_sorted[['timestamp', 'number_of_astronauts']].to_numpy(dtype=float),
    lengths=full_train_lengths,
)


In [15]:
# Define the time range for which we want to generate predictions
start_time = pd.Timestamp('2022-12-01 00:00:00')
end_time = pd.Timestamp('2022-12-01 23:59:59')
timestamps = pd.date_range(start_time, end_time, freq='15min')

# Same time-of-day encoding used for training: minutes since midnight.
minute_of_day = (timestamps.hour * 60 + timestamps.minute).to_numpy()
minute_features = minute_of_day.reshape(-1, 1).astype(float)

# `model` was fitted on two features (timestamp, number_of_astronauts), so it
# cannot score timestamp-only input, and model.predict() would return hidden
# state indices rather than astronaut counts anyway. Instead:
#   1. build the timestamp-only marginal of the fitted HMM (same start and
#      transition probabilities, emission restricted to feature 0),
#   2. infer the posterior over hidden states for the requested day,
#   3. predict the posterior-weighted mean of each state's astronaut count.
# This marginalisation is exact for the diagonal covariance that GaussianHMM
# uses by default, because features are independent given the state.
state_covars = np.asarray(model.covars_)
# covars_ is exposed as full (state, feature, feature) matrices in current
# hmmlearn; also accept a (state, feature) layout defensively.
timestamp_var = state_covars[:, 0, 0] if state_covars.ndim == 3 else state_covars[:, 0]

time_model = hmm.GaussianHMM(n_components=model.n_components, covariance_type='diag')
time_model.startprob_ = model.startprob_
time_model.transmat_ = model.transmat_
time_model.means_ = model.means_[:, [0]]
time_model.covars_ = timestamp_var.reshape(-1, 1)

state_posteriors = time_model.predict_proba(minute_features)
expected_astronauts = state_posteriors @ model.means_[:, 1]
# Head counts are non-negative integers.
predicted_astronauts = np.clip(np.round(expected_astronauts), 0, None).astype(int)

# Generate predictions for each room at 15-minute intervals for a day.
# The model has no room feature, so every room receives the same daily
# profile. The frame is built in one shot instead of concatenating inside a
# loop.
room_ids = data['room_id'].unique()
predictions = pd.DataFrame({
    'room_id': np.repeat(room_ids, len(timestamps)),
    'start_timestamp': np.tile(timestamps.to_numpy(), len(room_ids)),
})
predictions['hour'] = predictions['start_timestamp'].dt.hour
predictions['minute'] = predictions['start_timestamp'].dt.minute
predictions['timestamp'] = (predictions['hour'] * 60) + predictions['minute']
predictions['predicted_astronauts'] = np.tile(predicted_astronauts, len(room_ids))

# Save the predictions to a CSV file
predictions.to_csv('predictions.csv', index=False)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Calculate the MAE and MSE for the predictions
# `test_data` spans roughly two months while `predictions` covers a single
# day, so the two columns cannot be compared positionally (sklearn raises on
# the length mismatch). Align them on (room_id, start_timestamp) and score
# only the rows present in both.
prediction_keys = predictions[['room_id', 'start_timestamp', 'predicted_astronauts']].copy()
# Normalise key and value dtypes so the merge and metrics do not fail on
# object-typed columns.
prediction_keys['room_id'] = prediction_keys['room_id'].astype(test_data['room_id'].dtype)
prediction_keys['start_timestamp'] = pd.to_datetime(prediction_keys['start_timestamp'])
prediction_keys['predicted_astronauts'] = pd.to_numeric(prediction_keys['predicted_astronauts'])

evaluated = test_data.merge(prediction_keys, on=['room_id', 'start_timestamp'], how='inner')
if evaluated.empty:
    raise ValueError(
        'No overlapping (room_id, start_timestamp) rows between test_data and predictions'
    )

mae = mean_absolute_error(evaluated['number_of_astronauts'], evaluated['predicted_astronauts'])
mse = mean_squared_error(evaluated['number_of_astronauts'], evaluated['predicted_astronauts'])
print('MAE:', mae)
print('MSE:', mse)

In [None]:
# Plot the actual vs predicted number of astronauts for each room
for room_id in data['room_id'].unique():
    room_predictions = (
        predictions.loc[predictions['room_id'] == room_id]
        .sort_values('start_timestamp')
    )

    # Only show actual readings inside the predicted window; plotting the
    # whole test period would squash the predicted day into an unreadable
    # sliver at the edge of the axis.
    window_start = room_predictions['start_timestamp'].min()
    window_end = room_predictions['start_timestamp'].max()
    in_window = test_data['start_timestamp'].between(window_start, window_end)
    room_data = (
        test_data.loc[(test_data['room_id'] == room_id) & in_window]
        .sort_values('start_timestamp')
    )

    # Explicit figure/axes per room instead of the implicit pyplot state.
    fig, ax = plt.subplots()
    ax.plot(room_data['start_timestamp'], room_data['number_of_astronauts'], label='Actual')
    ax.plot(room_predictions['start_timestamp'], room_predictions['predicted_astronauts'], label='Predicted')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Number of Astronauts')
    ax.set_title('Room ' + str(room_id))
    ax.legend()
    fig.autofmt_xdate()
    plt.show()
    # Release the figure so a long list of rooms does not accumulate memory.
    plt.close(fig)