In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# Load the dataset and discard every row that contains a missing value.
data = pd.read_csv('../data/df.csv').dropna()

def iqr(df, columns, factor=1.5):
    """Clip outliers in the given columns to their IQR fences.

    For each requested column the 25th (Q1) and 75th (Q3) percentiles are
    computed, and values outside ``[Q1 - factor * IQR, Q3 + factor * IQR]``
    are clipped to the nearest fence. No rows are dropped and the input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clip.
    columns : str or iterable of str
        Name(s) of the numeric column(s) to clip. A single name may be
        passed as a plain string.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the requested columns are clipped.
    """
    # A bare string would otherwise be iterated character by character, and
    # a tuple would be interpreted by pandas as one (multi-level) key.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    df_clipped = df.copy()
    if not columns:
        # Nothing to clip; still hand back an independent copy.
        return df_clipped

    q1 = df[columns].quantile(0.25)
    q3 = df[columns].quantile(0.75)
    spread = q3 - q1

    lower = q1 - factor * spread
    upper = q3 + factor * spread

    for column in columns:
        df_clipped[column] = df[column].clip(lower=lower[column], upper=upper[column])

    return df_clipped

# Cap extreme values of both regression targets at their IQR fences.
# NOTE(review): the fences are computed on the full dataset, before the
# train/test split below, so the held-out targets are clipped as well —
# confirm this is intended.
data = iqr(data, ['visitors', 'visit/cost'])

# One-hot encode the categorical `target` column, keeping every level.
data = pd.get_dummies(data, columns=['target'], drop_first=False)

# Feature matrix and the two regression targets.
X = data[['month', 'cost', 'target_family', 'target_old', 'target_youth']]
y_visitors, y_vicost = data['visitors'], data['visit/cost']

# A single split call keeps the rows of X and of both targets aligned.
(
    X_train, X_test,
    y_train_visitors, y_test_visitors,
    y_train_vicost, y_test_vicost,
) = train_test_split(X, y_visitors, y_vicost, test_size=0.2, random_state=42)


RF

In [23]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

def train_models(X_train, y_train_visitors, y_train_vicost):
    """Fit baseline random-forest and gradient-boosting regressors for both targets.

    Each of the four models is built with ``n_estimators=100`` and
    ``random_state=42`` and fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(rf_visitors, rf_vicost, gb_visitors, gb_vicost)``.
    """
    def _fit(estimator_cls, y_train):
        # A fresh estimator per call, so no two fitted models share an instance.
        return estimator_cls(n_estimators=100, random_state=42).fit(X_train, y_train)

    rf_visitors = _fit(RandomForestRegressor, y_train_visitors)
    rf_vicost = _fit(RandomForestRegressor, y_train_vicost)
    gb_visitors = _fit(GradientBoostingRegressor, y_train_visitors)
    gb_vicost = _fit(GradientBoostingRegressor, y_train_vicost)

    return rf_visitors, rf_vicost, gb_visitors, gb_vicost


rf_visitors, rf_vicost, gb_visitors, gb_vicost = train_models(
    X_train, y_train_visitors, y_train_vicost
)


In [38]:
import numpy as np
from sklearn.model_selection import GridSearchCV

def tune_rf(X_train, y_train_visitors, y_train_vicost):
    """Grid-search random-forest hyperparameters separately for each target.

    Both searches share the same grid over ``n_estimators``, ``max_depth``
    and ``min_samples_split``, use 5-fold cross-validation and are scored
    with negative mean squared error. The best parameters of each search
    are printed.

    Returns
    -------
    tuple
        ``(best_visitors_estimator, best_vicost_estimator)``.
    """
    param_grid_rf = {
        'n_estimators': np.arange(10, 150, 20),
        'max_depth': [3, 4, 5],
        'min_samples_split': np.arange(2, 8),
    }

    def _search(y_train):
        # One independent search (with its own base estimator) per target.
        search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_grid=param_grid_rf,
            cv=5,
            scoring='neg_mean_squared_error',
        )
        search.fit(X_train, y_train)
        return search

    visitors_search = _search(y_train_visitors)
    vicost_search = _search(y_train_vicost)

    print(f"Best visitors RF : {visitors_search.best_params_}")
    print(f"Best visit/cost RF : {vicost_search.best_params_}")

    return visitors_search.best_estimator_, vicost_search.best_estimator_


best_rf_visitors, best_rf_vicost = tune_rf(X_train, y_train_visitors, y_train_vicost)


Best visitors RF : {'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best visit/cost RF : {'max_depth': 3, 'min_samples_split': 7, 'n_estimators': 30}


In [39]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, best_vicost, X_test, y_test_visitors, y_test_vicost):
    """Print test-set MSE and R² for the visitors model and the visit/cost model.

    Parameters
    ----------
    best_visitors, best_vicost
        Fitted models exposing ``predict``; one per target.
    X_test
        Features passed to both models' ``predict``.
    y_test_visitors, y_test_vicost
        True target values compared against the respective predictions.

    Returns
    -------
    None
        The metrics are only printed.
    """
    def _score(model, y_true):
        # Predict once and derive both metrics from the same predictions.
        y_pred = model.predict(X_test)
        return mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)

    # Score both models first so nothing is printed if either one fails.
    mse_visitors, r2_visitors = _score(best_visitors, y_test_visitors)
    mse_vicost, r2_vicost = _score(best_vicost, y_test_vicost)

    report = (
        ("Visitors Model Performance:", mse_visitors, r2_visitors),
        ("Visit/Cost Model Performance:", mse_vicost, r2_vicost),
    )
    for header, mse, r2 in report:
        print(header)
        print(f"  MSE: {mse}")
        print(f"  R²: {r2}")


evaluate_models(best_rf_visitors, best_rf_vicost, X_test, y_test_visitors, y_test_vicost)


Visitors Model Performance:
  MSE: 6273207232.065612
  R²: 0.3321162109236153
Visit/Cost Model Performance:
  MSE: 8.496858353715138e-09
  R²: 0.6071416678754656


xgb

In [27]:
!pip install xgboost



In [29]:
import xgboost as xgb

def train_xgboost(X_train, y_train_visitors, y_train_vicost):
    """Fit one baseline XGBoost regressor per target.

    Both regressors are created with ``n_estimators=100`` and
    ``random_state=42`` and fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(xgb_visitors, xgb_vicost)``.
    """
    fitted = []
    # Visitors first, then visit/cost — the order the results are returned in.
    for y_train in (y_train_visitors, y_train_vicost):
        model = xgb.XGBRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        fitted.append(model)

    xgb_visitors, xgb_vicost = fitted
    return xgb_visitors, xgb_vicost


xgb_visitors, xgb_vicost = train_xgboost(X_train, y_train_visitors, y_train_vicost)

In [None]:
def tune_xgb(X_train, y_train_visitors, y_train_vicost):
    """Grid-search XGBoost hyperparameters separately for each target.

    This function was previously named ``tune_rf``, which shadowed the
    random-forest tuner of the same name defined earlier in the notebook;
    it only ever tuned ``XGBRegressor`` models.

    Both searches share one parameter grid, use 5-fold cross-validation and
    are scored with negative mean squared error. The best parameters of
    each search are printed.

    Returns
    -------
    tuple
        ``(best_visitors_estimator, best_vicost_estimator)``.
    """
    # Explicit value lists replace float-step np.arange calls: with a
    # non-integer step arange can include the stop value and yields inexact
    # values such as 0.039999..., so the grid would not be what it reads as.
    # NOTE: this grid has 10 * 3 * 7 * 5 * 7 = 7350 candidates, each fitted
    # 5 times per target — expect a long run.
    param_grid_xgb = {
        'n_estimators': list(range(10, 210, 20)),
        'max_depth': [2, 3, 4],
        'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09],
        'subsample': [0.3, 0.4, 0.5, 0.6, 0.7],
        'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    }

    xgb_visitors = xgb.XGBRegressor(random_state=42)
    grid_search_xgb_visitors = GridSearchCV(
        estimator=xgb_visitors,
        param_grid=param_grid_xgb,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search_xgb_visitors.fit(X_train, y_train_visitors)

    xgb_vicost = xgb.XGBRegressor(random_state=42)
    grid_search_xgb_vicost = GridSearchCV(
        estimator=xgb_vicost,
        param_grid=param_grid_xgb,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search_xgb_vicost.fit(X_train, y_train_vicost)

    print(f"Best xgb visitors: {grid_search_xgb_visitors.best_params_}")
    print(f"Best xgb visit/cost: {grid_search_xgb_vicost.best_params_}")

    return grid_search_xgb_visitors.best_estimator_, grid_search_xgb_vicost.best_estimator_


best_xgb_visitors, best_xgb_vicost = tune_xgb(X_train, y_train_visitors, y_train_vicost)

In [32]:
# `evaluate_models` is already defined in the random-forest section earlier in
# this notebook; reuse it here instead of redefining an identical copy.
evaluate_models(best_xgb_visitors, best_xgb_vicost, X_test, y_test_visitors, y_test_vicost)

Visitors Model Performance:
  MSE: 6363955590.185793
  R²: 0.32245458888059375
Visit/Cost Model Performance:
  MSE: 1.553278169715059e-08
  R²: 0.28183071239160096


GBR

In [33]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

def train_gradient_boosting(X_train, y_train_visitors, y_train_vicost):
    """Fit one baseline gradient-boosting regressor per target.

    Both regressors are created with ``n_estimators=100`` and
    ``random_state=42`` and fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(gb_visitors, gb_vicost)``.
    """
    def _fit(y_train):
        model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return model

    # Tuple elements are evaluated left to right: visitors, then visit/cost.
    return _fit(y_train_visitors), _fit(y_train_vicost)


gb_visitors, gb_vicost = train_gradient_boosting(X_train, y_train_visitors, y_train_vicost)

# Test-set metrics for the untuned gradient-boosting baselines.
evaluate_models(gb_visitors, gb_vicost, X_test, y_test_visitors, y_test_vicost)


Visitors Model Performance:
  MSE: 7502673106.285818
  R²: 0.2012197976157578
Visit/Cost Model Performance:
  MSE: 9.136792145909058e-09
  R²: 0.5775538706207888


In [36]:
from sklearn.model_selection import GridSearchCV

def tune_gradient_boosting(X_train, y_train_visitors, y_train_vicost):
    """Grid-search gradient-boosting hyperparameters separately for each target.

    Both searches share the same grid over ``n_estimators``, ``max_depth``,
    ``learning_rate`` and ``subsample``, use 5-fold cross-validation and are
    scored with negative mean squared error. The best parameters of each
    search are printed.

    Returns
    -------
    tuple
        ``(best_visitors_estimator, best_vicost_estimator)``.
    """
    param_grid_gb = {
        'n_estimators': [25, 50, 150, 200],
        'max_depth': [2, 3, 4],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.5, 0.8, 1.0],
    }

    searches = []
    # Visitors first, then visit/cost; each search gets its own base estimator.
    for y_train in (y_train_visitors, y_train_vicost):
        search = GridSearchCV(
            estimator=GradientBoostingRegressor(random_state=42),
            param_grid=param_grid_gb,
            cv=5,
            scoring='neg_mean_squared_error',
        )
        search.fit(X_train, y_train)
        searches.append(search)

    visitors_search, vicost_search = searches

    print(f"Best parameters for visitors: {visitors_search.best_params_}")
    print(f"Best parameters for visit/cost: {vicost_search.best_params_}")

    return visitors_search.best_estimator_, vicost_search.best_estimator_


best_gb_visitors, best_gb_vicost = tune_gradient_boosting(X_train, y_train_visitors, y_train_vicost)


Best parameters for visitors: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 25, 'subsample': 1.0}
Best parameters for visit/cost: {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 50, 'subsample': 1.0}


In [37]:
# Test-set metrics for the grid-searched gradient-boosting models.
evaluate_models(
    best_visitors=best_gb_visitors,
    best_vicost=best_gb_vicost,
    X_test=X_test,
    y_test_visitors=y_test_visitors,
    y_test_vicost=y_test_vicost,
)

Visitors Model Performance:
  MSE: 7609133369.842461
  R²: 0.18988539057643083
Visit/Cost Model Performance:
  MSE: 8.812427945560647e-09
  R²: 0.5925510817598997


RANSAC

In [None]:
from sklearn.linear_model import RANSACRegressor

def train_ransac(X_train, y_train_visitors, y_train_vicost):
    """Fit one RANSAC regressor per target.

    Both regressors are created with ``random_state=42`` and otherwise
    default settings, and fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(ransac_visitors, ransac_vicost)``.
    """
    fitted = []
    # Visitors first, then visit/cost — the order the results are returned in.
    for y_train in (y_train_visitors, y_train_vicost):
        regressor = RANSACRegressor(random_state=42)
        regressor.fit(X_train, y_train)
        fitted.append(regressor)

    ransac_visitors, ransac_vicost = fitted
    return ransac_visitors, ransac_vicost


ransac_visitors, ransac_vicost = train_ransac(X_train, y_train_visitors, y_train_vicost)

# Test-set metrics for the RANSAC models.
evaluate_models(ransac_visitors, ransac_vicost, X_test, y_test_visitors, y_test_vicost)
    
