In [1]:
import pandas as pd

# Load the raw records from the CSV file
data = pd.read_csv("output.csv")

# Convert "start_timestamp" to Unix epoch seconds (float64).
# Subtracting the epoch and dividing by a one-second Timedelta avoids relying
# on the platform-dependent width of `int` and on the column being stored at
# nanosecond resolution, both of which `.astype(int) / 10**9` assumes.
# `utc=True` keeps naive timestamps on the same epoch scale as before
# (they are interpreted as UTC) and also accepts timezone-aware input.
parsed_timestamps = pd.to_datetime(data["start_timestamp"], utc=True)
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
data["start_timestamp"] = (parsed_timestamps - unix_epoch) / pd.Timedelta(seconds=1)

# Print the first few rows of the data
print(data.head())


   room_id  start_timestamp  number_of_astronauts
0        1     1.641026e+09                     5
1        1     1.641027e+09                     7
2        1     1.641029e+09                     3
3        1     1.641030e+09                     3
4        1     1.641112e+09                     5


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each partition
for label, subset in (
    ("Number of rows in training set:", train_data),
    ("Number of rows in testing set:", test_data),
):
    print(label, len(subset))


Number of rows in training set: 7235
Number of rows in testing set: 1809


In [3]:
from sklearn.ensemble import RandomForestRegressor

# Column to predict, and the predictor columns fed to the model.
# NOTE(review): only the timestamp is used as a predictor; the loaded data
# also has a "room_id" column that is not part of the feature list.
target = "number_of_astronauts"
features = ["start_timestamp"]

# Training inputs, taken from the training partition only
X_train = train_data[features]
y_train = train_data[target]

# Random forest with 100 trees; the seed keeps the fit reproducible
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit on the training partition (the fitted estimator is the cell's output)
model.fit(X_train, y_train)


RandomForestRegressor(random_state=42)

In [4]:
import numpy as np

# Bounds of the prediction grid: earliest and latest observed timestamps
# (epoch seconds, as stored in `data`)
start_time, end_time = data["start_timestamp"].min(), data["start_timestamp"].max()

# 15-minute steps from start_time up to, but not including, end_time.
# The grid therefore only spans the range covered by the loaded data.
time_intervals = np.arange(start_time, end_time, 15 * 60)

# One row per grid point, stored as whole epoch seconds
predictions = pd.DataFrame({"start_timestamp": time_intervals.astype(int)})

# Predict the target for every grid point using the fitted model
predictions["number_of_astronauts"] = model.predict(predictions[features])

# Turn the epoch seconds back into datetimes for readability
predictions = predictions.assign(
    start_timestamp=pd.to_datetime(predictions["start_timestamp"], unit="s")
)

# Show the first few predictions
print(predictions.head())


      start_timestamp  number_of_astronauts
0 2022-01-01 05:30:00              4.049762
1 2022-01-01 05:45:00              4.049762
2 2022-01-01 06:00:00              4.049762
3 2022-01-01 06:15:00              4.520429
4 2022-01-01 06:30:00              4.602095


In [5]:
import warnings

# Define the start and end dates for the export (both days are included)
start_date = pd.to_datetime("2023-01-01")
end_date = pd.to_datetime("2023-12-31")

# `end_date` parses to midnight at the START of 2023-12-31, so comparing with
# `<= end_date` would drop every interval later on that final day. Use a
# half-open window that ends at the start of the following day instead.
end_exclusive = end_date.normalize() + pd.Timedelta(days=1)

# Filter the predictions for the date range [start_date, end_exclusive)
in_export_window = (
    (predictions["start_timestamp"] >= start_date)
    & (predictions["start_timestamp"] < end_exclusive)
)
export_data = predictions[in_export_window]

# Surface an empty selection instead of silently writing a header-only file
if export_data.empty:
    warnings.warn(
        f"No predictions fall between {start_date.date()} and {end_date.date()}; "
        "predictions.csv will contain only the header row."
    )

# Export the predictions to a CSV file
export_data.to_csv("predictions.csv", index=False)
