In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
data = pd.read_csv('../data/df_charac.csv')
   
data = data.dropna()

def iqr(df, columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    df_clipped = df.copy()
    
    for column in columns:
        df_clipped[column] = df[column].clip(lower=lower[column], upper=upper[column])
        
    return df_clipped

data = iqr(data, ['visitors', 'visit/cost'])

data = pd.get_dummies(data, columns=['target'], drop_first=False)

X = data[['month', 
          'cost', 
          'target_family', 
          'target_old', 
          'target_youth',
          'Fe_festival_conc',
          'non_festival_conc',
          'non_local',
          'non_foreigner'
          ]]
y_visitors = data['visitors']
y_vicost = data['visit/cost'] 
X_train, X_test, y_train_visitors, y_test_visitors, y_train_vicost, y_test_vicost = train_test_split(
X, y_visitors, y_vicost, test_size=0.2, random_state=42)


In [6]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, best_vicost, X_test, y_test_visitors, y_test_vicost):
    # Visitors 모델 평가
    y_pred_visitors = best_visitors.predict(X_test)
    mse_visitors = mean_squared_error(y_test_visitors, y_pred_visitors)
    r2_visitors = r2_score(y_test_visitors, y_pred_visitors)

    # Visit/Cost 모델 평가
    y_pred_vicost = best_vicost.predict(X_test)
    mse_vicost = mean_squared_error(y_test_vicost, y_pred_vicost)
    r2_vicost = r2_score(y_test_vicost, y_pred_vicost)

    # 결과 출력
    print(f"Visitors Model Performance:")
    print(f"  MSE: {mse_visitors}")
    print(f"  R²: {r2_visitors}")

    print(f"Visit/Cost Model Performance:")
    print(f"  MSE: {mse_vicost}")
    print(f"  R²: {r2_vicost}")


RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

def train_models(X_train, y_train_visitors, y_train_vicost):
    rf_visitors = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_visitors.fit(X_train, y_train_visitors)

    rf_vicost = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_vicost.fit(X_train, y_train_vicost)
    
    return rf_visitors, rf_vicost

rf_visitors, rf_vicost = train_models(X_train, y_train_visitors, y_train_vicost)

evaluate_models(rf_visitors, rf_vicost, X_test, y_test_visitors, y_test_vicost)

In [14]:
import numpy as np
from sklearn.model_selection import GridSearchCV

def tune_rf(X_train, y_train_visitors, y_train_vicost):
    param_grid_rf = {
        'n_estimators': np.arange(50, 90, 20),
        'max_depth': [3,15,17],
        'min_samples_split':  [2, 7,8]}

    rf_visitors = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid_search_rf_visitors = GridSearchCV(estimator=rf_visitors, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')
    grid_search_rf_visitors.fit(X_train, y_train_visitors)
    
    rf_vicost = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid_search_rf_vicost = GridSearchCV(estimator=rf_vicost, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')
    grid_search_rf_vicost.fit(X_train, y_train_vicost)
    
    print(f"Best visitors RF : {grid_search_rf_visitors.best_params_}")
    print(f"Best visit/cost RF : {grid_search_rf_vicost.best_params_}")

    return grid_search_rf_visitors.best_estimator_, grid_search_rf_vicost.best_estimator_

best_rf_visitors, best_rf_vicost = tune_rf(X_train, y_train_visitors, y_train_vicost)


Best visitors RF : {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 70}
Best visit/cost RF : {'max_depth': 3, 'min_samples_split': 7, 'n_estimators': 70}


In [15]:
evaluate_models(best_rf_visitors, best_rf_vicost, X_test, y_test_visitors, y_test_vicost)

Visitors Model Performance:
  MSE: 5610478564.934263
  R²: 0.40267433485593784
Visit/Cost Model Performance:
  MSE: 8.18599080688758e-09
  R²: 0.6215148515716399


xgb

In [16]:
import xgboost as xgb

def train_xgboost(X_train, y_train_visitors, y_train_vicost):
    
    xgb_visitors = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_visitors.fit(X_train, y_train_visitors)

    xgb_vicost = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_vicost.fit(X_train, y_train_vicost)
    
    return xgb_visitors, xgb_vicost

xgb_visitors, xgb_vicost = train_xgboost(X_train, y_train_visitors, y_train_vicost)

In [30]:
def tune_xgb(X_train, y_train_visitors, y_train_vicost):
    param_grid_xgb = {
        'n_estimators': np.arange(90,110,10),
        'max_depth': [10, 11, 12],
        'learning_rate': np.arange(0.02,0.04,0.01),
        'subsample': [0.65,0.7,0.75],
        'colsample_bytree': np.arange(0.6, 0.8, 0.1)}

    xgb_visitors = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    grid_search_xgb_visitors = GridSearchCV(estimator=xgb_visitors, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
    grid_search_xgb_visitors.fit(X_train, y_train_visitors)

    xgb_vicost = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    grid_search_xgb_vicost = GridSearchCV(estimator=xgb_vicost, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
    grid_search_xgb_vicost.fit(X_train, y_train_vicost)
    
    print(f"Best xgb visitors: {grid_search_xgb_visitors.best_params_}")
    print(f"Best xgb visit/cost: {grid_search_xgb_vicost.best_params_}")

    return grid_search_xgb_visitors.best_estimator_, grid_search_xgb_vicost.best_estimator_

best_xgb_visitors, best_xgb_vicost = tune_xgb(X_train, y_train_visitors, y_train_vicost)

KeyboardInterrupt: 

In [28]:
evaluate_models(best_xgb_visitors, best_xgb_vicost, X_test, y_test_visitors, y_test_vicost)

Visitors Model Performance:
  MSE: 5590753632.578279
  R²: 0.40477437110117
Visit/Cost Model Performance:
  MSE: 1.593508656767585e-08
  R²: 0.26322985853942094


GBR

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

def train_gradient_boosting(X_train, y_train_visitors, y_train_vicost):
    gb_visitors = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_visitors.fit(X_train, y_train_visitors)

    gb_vicost = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_vicost.fit(X_train, y_train_vicost)
    
    return gb_visitors, gb_vicost

gb_visitors, gb_vicost = train_gradient_boosting(X_train, y_train_visitors, y_train_vicost)

evaluate_models(gb_visitors, gb_vicost, X_test, y_test_visitors, y_test_vicost)


Visitors Model Performance:
  MSE: 7502673106.285818
  R²: 0.2012197976157578
Visit/Cost Model Performance:
  MSE: 9.136792145909058e-09
  R²: 0.5775538706207888


In [12]:
from sklearn.model_selection import GridSearchCV

def tune_gradient_boosting(X_train, y_train_visitors, y_train_vicost):
    param_grid_gb = {
        'n_estimators': [25, 50, 150, 200],
        'max_depth': [2, 3, 4],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.5, 0.8, 1.0]}
    
    gb_visitors = GradientBoostingRegressor(random_state=42)
    grid_search_gb_visitors = GridSearchCV(estimator=gb_visitors, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error')
    grid_search_gb_visitors.fit(X_train, y_train_visitors)

    gb_vicost = GradientBoostingRegressor(random_state=42)
    grid_search_gb_vicost = GridSearchCV(estimator=gb_vicost, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error')
    grid_search_gb_vicost.fit(X_train, y_train_vicost)

    print(f"Best gb visitors: {grid_search_gb_visitors.best_params_}")
    print(f"Best gb visit/cost: {grid_search_gb_vicost.best_params_}")

    return grid_search_gb_visitors.best_estimator_, grid_search_gb_vicost.best_estimator_

best_gb_visitors, best_gb_vicost = tune_gradient_boosting(X_train, y_train_visitors, y_train_vicost)


Best gb visitors: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 25, 'subsample': 1.0}
Best gb visit/cost: {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 50, 'subsample': 1.0}


In [13]:
evaluate_models(best_gb_visitors, best_gb_vicost, X_test, y_test_visitors, y_test_vicost)

Visitors Model Performance:
  MSE: 7609133369.842461
  R²: 0.18988539057643083
Visit/Cost Model Performance:
  MSE: 8.812427945560647e-09
  R²: 0.5925510817598997


RANSAC

In [15]:
from sklearn.linear_model import RANSACRegressor

def train_ransac(X_train, y_train_visitors, y_train_vicost):
    ransac_visitors = RANSACRegressor(random_state=42)
    ransac_visitors.fit(X_train, y_train_visitors)

    ransac_vicost = RANSACRegressor(random_state=42)
    ransac_vicost.fit(X_train, y_train_vicost)
    
    return ransac_visitors, ransac_vicost

ransac_visitors, ransac_vicost = train_ransac(X_train, y_train_visitors, y_train_vicost)

evaluate_models(ransac_visitors, ransac_vicost, X_test, y_test_visitors, y_test_vicost)
    


Visitors Model Performance:
  MSE: 8816689686.666874
  R²: 0.06132160198017089
Visit/Cost Model Performance:
  MSE: 2.43198254509604e-08
  R²: -0.12444454956067807


In [16]:

def tune_ransac(X_train, y_train_visitors, y_train_vicost):
    param_grid_ransac = {
        'min_samples': [0.5, 0.7, 0.9],
        'residual_threshold': [5, 10, 20],
        'max_trials': [50, 100, 150],
    }

    ransac_visitors = RANSACRegressor(random_state=42, n_jobs=-1)
    grid_search_ransac_visitors = GridSearchCV(estimator=ransac_visitors, param_grid=param_grid_ransac, cv=5, scoring='neg_mean_squared_error')
    grid_search_ransac_visitors.fit(X_train, y_train_visitors)

    ransac_vicost = RANSACRegressor(random_state=42, n_jobs=-1)
    grid_search_ransac_vicost = GridSearchCV(estimator=ransac_vicost, param_grid=param_grid_ransac, cv=5, scoring='neg_mean_squared_error')
    grid_search_ransac_vicost.fit(X_train, y_train_vicost)

    print(f"Best rs visitors: {grid_search_ransac_visitors.best_params_}")
    print(f"Best rs visit/cost: {grid_search_ransac_vicost.best_params_}")

    return grid_search_ransac_visitors.best_estimator_, grid_search_ransac_vicost.best_estimator_

best_rs_visitors, best_rs_vicost = tune_gradient_boosting(X_train, y_train_visitors, y_train_vicost)

Best gb visitors: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 25, 'subsample': 1.0}
Best gb visit/cost: {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 50, 'subsample': 1.0}


In [17]:
evaluate_models(best_rs_visitors, best_rs_vicost, X_test, y_test_visitors, y_test_vicost)

Visitors Model Performance:
  MSE: 7609133369.842461
  R²: 0.18988539057643083
Visit/Cost Model Performance:
  MSE: 8.812427945560647e-09
  R²: 0.5925510817598997
