In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor

data = pd.read_csv('../data/df.csv')
   
data = data.dropna()

data = pd.get_dummies(data, columns=['target'], drop_first=False)

X = data[['month', 'cost', 'target_family', 'target_old', 'target_youth']]
y_visitors = data['visitors']
y_vicost = data['visit/cost'] 
X_train, X_test, y_train_visitors, y_test_visitors, y_train_vicost, y_test_vicost = train_test_split(
X, y_visitors, y_vicost, test_size=0.2, random_state=42)

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score

data = pd.read_csv('../data/df.csv')
data = data.dropna()

data = pd.get_dummies(data, columns=['target'], drop_first=False)

data['target_family'] = data['target_family'].astype(bool).astype(int)
data['target_old'] = data['target_old'].astype(bool).astype(int)
data['target_youth'] = data['target_youth'].astype(bool).astype(int)


X = data[['month', 'cost', 'target_family', 'target_old', 'target_youth']]
y_visitors = data['visitors']
y_vicost = data['visit/cost'] 

X_train, X_test, y_train_visitors, y_test_visitors, y_train_vicost, y_test_vicost = train_test_split(
    X, y_visitors, y_vicost, test_size=0.2, random_state=42)

y_test_vicost = np.nan_to_num(y_test_vicost, nan=0, posinf=0, neginf=0)

log_transformer = FunctionTransformer(np.log1p, validate=True)

rf_visitors = RandomForestRegressor(n_estimators=100, random_state=42)
rf_vicost = RandomForestRegressor(n_estimators=100, random_state=42)

ttr_visitors = TransformedTargetRegressor(
    regressor=rf_visitors, 
    transformer=log_transformer,
    check_inverse=False
)

ttr_vicost = TransformedTargetRegressor(
    regressor=rf_vicost, 
    transformer=log_transformer,
    check_inverse=False
)

ttr_visitors.fit(X_train, y_train_visitors)
ttr_vicost.fit(X_train, y_train_vicost)

y_pred_visitors = ttr_visitors.predict(X_test)
y_pred_vicost = ttr_vicost.predict(X_test)

y_pred_visitors_original = np.expm1(y_pred_visitors)
y_pred_vicost_original = np.expm1(y_pred_vicost)

rmse_visitors = np.sqrt(mean_squared_error(y_test_visitors, y_pred_visitors_original))
rmse_vicost = np.sqrt(mean_squared_error(y_test_vicost, y_pred_vicost_original))

r2_visitors = r2_score(y_test_visitors, y_pred_visitors_original)
r2_vicost = r2_score(y_test_vicost, y_pred_vicost_original)

print("Basic Data Statistics:")
print(f"Mean of visitors: {y_visitors.mean():.2f}, Std of visitors: {y_visitors.std():.2f}, Var of visitors: {y_visitors.var():.2f}")
print(f"Mean of vicost: {y_vicost.mean():.2f}, Std of vicost: {y_vicost.std():.2f}, Var of vicost: {y_vicost.var():.2f}")
print("\n")

print(f'RMSE for visitors model: {rmse_visitors:.2f}')
print(f'RMSE for vicost model: {rmse_vicost:.2f}')
print(f'R^2 for visitors model: {r2_visitors:.2f}')
print(f'R^2 for vicost model: {r2_vicost:.2f}')


Basic Data Statistics:
Mean of visitors: 186847.18, Std of visitors: 164854.73, Var of visitors: 27177080560.74
Mean of vicost: inf, Std of vicost: nan, Var of vicost: nan


RMSE for visitors model: 89126.17
RMSE for vicost model: 0.03
R^2 for visitors model: 0.35
R^2 for vicost model: -2518.85


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [3, 4 ,5,6],
    'min_samples_split': [2, 5, 8, 10]}

grid_search_visitors = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search_vicost = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')

grid_search_visitors.fit(X_train, y_train_visitors)
grid_search_vicost.fit(X_train, y_train_vicost)

best_rf_visitors = grid_search_visitors.best_estimator_
best_rf_vicost = grid_search_vicost.best_estimator_

print("Best hyperparameters for visitors model:", grid_search_visitors.best_params_)
print("Best hyperparameters for vicost model:", grid_search_vicost.best_params_)

ttr_visitors = TransformedTargetRegressor(regressor=best_rf_visitors, transformer=log_transformer, check_inverse=False)
ttr_vicost = TransformedTargetRegressor(regressor=best_rf_vicost, transformer=log_transformer, check_inverse=False)

ttr_visitors.fit(X_train, y_train_visitors)
ttr_vicost.fit(X_train, y_train_vicost)

y_pred_visitors = ttr_visitors.predict(X_test)
y_pred_vicost = ttr_vicost.predict(X_test)

y_pred_visitors_original = np.expm1(y_pred_visitors)
y_pred_vicost_original = np.expm1(y_pred_vicost)

rmse_visitors = np.sqrt(mean_squared_error(y_test_visitors, y_pred_visitors_original))
rmse_vicost = np.sqrt(mean_squared_error(y_test_vicost, y_pred_vicost_original))

print(f'RMSE for visitors model: {rmse_visitors}')
print(f'RMSE for vicost model: {rmse_vicost}')


Best hyperparameters for visitors model: {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best hyperparameters for vicost model: {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 150}
RMSE for visitors model: 92242.30828945633
RMSE for vicost model: 0.026810232053542788


In [24]:

rf_visitors = RandomForestRegressor(n_estimators=200, max_depth=4, min_samples_split=2, random_state=42)
rf_vicost = RandomForestRegressor(n_estimators=150, max_depth=3, min_samples_split=10, random_state=42)

ttr_visitors = TransformedTargetRegressor(
    regressor=rf_visitors, 
    transformer=log_transformer,
    check_inverse=False
)

ttr_vicost = TransformedTargetRegressor(
    regressor=rf_vicost, 
    transformer=log_transformer,
    check_inverse=False
)

ttr_visitors.fit(X_train, y_train_visitors)
ttr_vicost.fit(X_train, y_train_vicost)

y_pred_visitors = ttr_visitors.predict(X_test)
y_pred_vicost = ttr_vicost.predict(X_test)

y_pred_visitors_original = np.expm1(y_pred_visitors)
y_pred_vicost_original = np.expm1(y_pred_vicost)

rmse_visitors = np.sqrt(mean_squared_error(y_test_visitors, y_pred_visitors_original))
rmse_vicost = np.sqrt(mean_squared_error(y_test_vicost, y_pred_vicost_original))

r2_visitors = r2_score(y_test_visitors, y_pred_visitors_original)
r2_vicost = r2_score(y_test_vicost, y_pred_vicost_original)

print("Basic Data Statistics:")
print(f"Mean of visitors: {y_visitors.mean():.2f}, Std of visitors: {y_visitors.std():.2f}, Var of visitors: {y_visitors.var():.2f}")
print(f"Mean of vicost: {y_vicost.mean():.2f}, Std of vicost: {y_vicost.std():.2f}, Var of vicost: {y_vicost.var():.2f}")
print("\n")

print(f'RMSE for visitors model: {rmse_visitors:.2f}')
print(f'RMSE for vicost model: {rmse_vicost:.2f}')
print(f'R^2 for visitors model: {r2_visitors:.2f}')
print(f'R^2 for vicost model: {r2_vicost:.2f}')


Basic Data Statistics:
Mean of visitors: 186847.18, Std of visitors: 164854.73, Var of visitors: 27177080560.74
Mean of vicost: inf, Std of vicost: nan, Var of vicost: nan


RMSE for visitors model: 92242.31
RMSE for vicost model: 0.03
R^2 for visitors model: 0.31
R^2 for vicost model: -2756.80
