In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# Read the raw dataset and discard every row that contains a missing value.
data = pd.read_csv('../data/df.csv').dropna()

def iqr(df, columns, factor=1.5):
    """Clip outliers in the given columns to their IQR fences.

    For every requested column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of that
    column in ``df``. Values outside the fences are replaced by the nearest
    fence; no rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : str or iterable of str
        Column name(s) to clip. A single name may be passed as a plain string.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns clipped.
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    df_clipped = df.copy()
    if not columns:
        # Nothing to clip; still return a copy so callers can mutate it freely.
        return df_clipped

    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR

    for column in columns:
        df_clipped[column] = df[column].clip(lower=lower[column], upper=upper[column])

    return df_clipped

# Clip outliers in both regression targets, then one-hot encode the
# categorical `target` column.
# NOTE(review): the clipping bounds are computed on the full frame before the
# train/test split below, so the held-out targets are clipped as well —
# confirm this is intended.
data = (
    data
    .pipe(iqr, ['visitors', 'visit/cost'])
    .pipe(pd.get_dummies, columns=['target'], drop_first=False)
)

# Feature matrix and the two regression targets.
X = data.loc[:, ['month', 'cost', 'target_family', 'target_old', 'target_youth']]
y_visitors = data.loc[:, 'visitors']
y_vicost = data.loc[:, 'visit/cost']

# One shared 80/20 split so both targets are evaluated on the same rows.
(
    X_train, X_test,
    y_train_visitors, y_test_visitors,
    y_train_vicost, y_test_vicost,
) = train_test_split(X, y_visitors, y_vicost, test_size=0.2, random_state=42)


Model evaluation function (`evaluate_models`)

In [4]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, best_vicost, X_test, y_test_visitors, y_test_vicost):
    """Print test-set MSE and R² for the visitors and visit/cost models.

    Both models predict from the same ``X_test``; each prediction is scored
    against its own target. All metrics are computed before anything is
    printed. Nothing is returned.
    """
    targets = (
        ("Visitors", best_visitors, y_test_visitors),
        ("Visit/Cost", best_vicost, y_test_vicost),
    )

    # Score every model first, so a failing prediction prints nothing.
    scores = []
    for label, estimator, truth in targets:
        predicted = estimator.predict(X_test)
        error = mean_squared_error(truth, predicted)
        fit = r2_score(truth, predicted)
        scores.append((label, error, fit))

    # Report the results.
    for label, error, fit in scores:
        print(f"{label} Model Performance:")
        print(f"  MSE: {error}")
        print(f"  R²: {fit}")

#evaluate_models(best_rf_visitors, best_rf_vicost, X_test, y_test_visitors, y_test_vicost)


K-fold cross-validated grid search (RandomForest, GradientBoosting, XGBoost)

In [11]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Estimator classes referenced in `models` below; imported in this cell so it
# also runs on a fresh kernel (Restart Kernel -> Run All).
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Shuffled 5-fold splitter shared by every grid search.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate regressors and the hyper-parameter grid searched for each.
models = {
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [5, 10, None],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    },
    'XGBoost': {
        'model': xgb.XGBRegressor(random_state=42, verbosity=0),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    }
}


def tune_and_score(models, X_train, y_train, X_test, y_test, cv, prefix):
    """Grid-search every configured model for one target and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a model name to ``{'model': estimator, 'params': param_grid}``.
    X_train, y_train
        Data used by ``GridSearchCV`` (cross-validated with ``cv``).
    X_test, y_test
        Hold-out data used to score the best estimator of each search.
    cv
        Cross-validation splitter passed to ``GridSearchCV``.
    prefix : str
        Prefix for the result keys (``<prefix>_best_params``, ``<prefix>_mse``,
        ``<prefix>_r2``).

    Returns
    -------
    tuple of (dict, dict)
        ``results`` maps model name to its best params and test MSE / R²;
        ``best_estimators`` maps model name to the best estimator found.
    """
    results = {}
    best_estimators = {}
    for model_name, config in models.items():
        print(f"Training {model_name}...")
        grid = GridSearchCV(
            config['model'],
            config['params'],
            cv=cv,
            scoring='neg_mean_squared_error',
            verbose=1,
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)
        results[model_name] = {
            f'{prefix}_best_params': grid.best_params_,
            f'{prefix}_mse': mean_squared_error(y_test, y_pred),
            f'{prefix}_r2': r2_score(y_test, y_pred),
        }
        # Keep the tuned estimator so it can be reused later (e.g. passed to
        # evaluate_models) instead of being overwritten on the next iteration.
        best_estimators[model_name] = best_model
    return results, best_estimators


# Same search run once per target; only the target vectors and key prefix differ.
results_visit, best_models_visit = tune_and_score(
    models, X_train, y_train_visitors, X_test, y_test_visitors, cv=kf, prefix='visit')
results_vicost, best_models_vicost = tune_and_score(
    models, X_train, y_train_vicost, X_test, y_test_vicost, cv=kf, prefix='vicost')

# Transposing a frame that mixes dicts and floats leaves every column as
# `object`; infer_objects() turns the metric columns back into floats.
results_visit_df = pd.DataFrame(results_visit).T.infer_objects()
results_vicost_df = pd.DataFrame(results_vicost).T.infer_objects()


Training RandomForest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training GradientBoosting...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training RandomForest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training GradientBoosting...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [12]:
results_visit_df

Unnamed: 0,visit_best_params,visit_mse,visit_r2
RandomForest,"{'max_depth': 5, 'min_samples_split': 2, 'n_es...",5971700691.462646,0.364216
GradientBoosting,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",7090259684.716521,0.245128
XGBoost,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",7057343221.421196,0.248632


In [13]:
results_vicost_df

Unnamed: 0,vicost_best_params,vicost_mse,vicost_r2
RandomForest,"{'max_depth': 5, 'min_samples_split': 2, 'n_es...",0.0,0.66938
GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.0,0.646882
XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.0,0.275401
