# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Read the dataset from the CSV file in the working directory.
data = pd.read_csv('gym_data.csv')

# The target is the 'number_people' column; the features are every other
# column except the raw 'date' and 'timestamp' columns.
y = data['number_people']
X = data.drop(columns=['number_people', 'date', 'timestamp'])

# One-hot encode the 'day_of_week' and 'month' columns.
X = pd.get_dummies(X, columns=['day_of_week', 'month'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def train_rf_model(n_estimators):
    """Fit a random forest regressor and score it on the held-out split.

    The model is fitted on the module-level ``X_train``/``y_train`` and
    evaluated on the module-level ``X_test``/``y_test``.

    Args:
        n_estimators: Number of trees passed to ``RandomForestRegressor``.

    Returns:
        A ``(mse, r2)`` tuple: the mean squared error and the R2 score of
        the predictions on ``X_test`` compared against ``y_test``.
    """
    # The fixed random_state keeps the forest reproducible between runs.
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    predictions = forest.fit(X_train, y_train).predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Fit and score one model for each forest size.
n_estimators_list = [10, 50, 100, 200, 500]
results = [train_rf_model(tree_count) for tree_count in n_estimators_list]
mse_scores = [mse_value for mse_value, _ in results]
r2_scores = [r2_value for _, r2_value in results]

# Plot each metric against the number of estimators, side by side.
panels = [
    (mse_scores, 'Effect of n_estimators on MSE', 'Mean Squared Error'),
    (r2_scores, 'Effect of n_estimators on R2 Score', 'R2 Score'),
]

plt.figure(figsize=(12, 5))
for position, (scores, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(n_estimators_list, scores, marker='o')
    plt.title(title)
    plt.xlabel('Number of estimators')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

# Report the metrics for every forest size.
for n_estimators, (mse, r2) in zip(n_estimators_list, results):
    print(f"n_estimators: {n_estimators}, MSE: {mse:.4f}, R2 Score: {r2:.4f}")