# Column에 대한 설명
'festive_period' : 축제 기간  
'cost' : 총비용  
'target_a' : a 연령대가 많이 방문하는 축제 {'old' : 장년층, 'family' : 가족 연령대(아이 + 부모), 'youth' : 청년 }  
'non_festival_conc' : 비축제 기간 축제 개최 '행정동' 일 평균 외부 방문자 수 / 축제 개최 '시군구' 일 평균 외부 방문자 수
'non_local' : 이동통신 데이터 기반 축제 개최 행정동 일 평균 현지인 방문자 수  
'non_foreigner' : 이동통신 데이터 기반 축제 개최 행정동 일 평균 외부 방문자 수(외지인+외국인)
'month_a' : a 월
'~' : 해당 행정 도 or 특별시

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
data = pd.read_csv('../data/df_charac.csv')

data.drop(columns='index_y' ,inplace= True)

#이상치 처리 함수 정의
def iqr(df, columns):
    df_clipped = df.copy()
    
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        
        df_clipped[column] = df[column].clip(lower=lower, upper=upper)
        
    return df_clipped

# 한글 column명 영어로 변경
data.rename(columns={'축제기간(일)' : "festive_period"}, inplace=True)

# 축제 기간 nan 값 평균치로 채워넣기.
data['festive_period'].fillna(data['festive_period'].mean(), inplace=True)

#iqr
data = iqr(data, ['visitors'])

# str 데이터 one_hot_encoding으로 분리, month는 category에 속해서 굳이 안해도 되는데 1~12월이 숫자가 늘어난다고 방문자 수가 늘어나는 구조는 아니라서 one-hot으로 변경. 약간의 R2 상승을 봤음.
data = pd.get_dummies(data, columns=['target'], drop_first=False)
data = pd.get_dummies(data, columns=['month'], drop_first=False)
data = pd.get_dummies(data, columns=['도'], drop_first=False)

#column 명 정리
data.columns = ['Unnamed: 0', 'Festival', 'name_year', 'year', 'visitors', 'cost',
       'date', 'visit/cost', 'Fe_festival_conc', 'Fe_foreigner', 'Fe_local',
       'Fe_navi', 'Fe_tour_fee', 'non_festival_conc', 'non_foreigner',
       'non_local', 'non_navi', 'non_tour_fee', 'festive_period',
       'target_family', 'target_old', 'target_youth', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'month_12', 'Gangwon','Gyeonggi',
                'Gyeongnam', 
                'Gyeongbuk', 
                'Gwangju', 
                'Daegu',
                'Daejeon', 
                'Busan',
                'Seoul', 
                'Sejong', 
                'Ulsan', 
                'Incheon', 
                'Jeonnam', 
                'Jeonbuk', 
                'Jeju', 
                'Chungnam', 
                'Chungbuk']

# train_test_split : target 은 visitors.
X = data[['festive_period',
          'cost', 
          'target_family', 
          'target_old', 
          'target_youth',
          'non_festival_conc',
          'non_local',
          'non_foreigner',
          'month_1', 
          'month_2',
          'month_3', 
          'month_4', 
          'month_5', 
          'month_6', 
          'month_7', 
          'month_8',
          'month_9', 
          'month_10', 
          'month_11', 
          'month_12',
          'Gangwon',
          'Gyeonggi',                
          'Gyeongnam',                 
          'Gyeongbuk',                 
          'Gwangju',                 
          'Daegu',                
          'Daejeon',                
          'Busan',                
          'Seoul',                 
          'Sejong',                 
          'Ulsan',                 
          'Incheon',                 
          'Jeonnam',                 
          'Jeonbuk',                 
          'Jeju',                 
          'Chungnam',                 
          'Chungbuk'
          ]]
Y = data['visitors']
X_train, X_test, Y_train, Y_test = train_test_split(
X, Y, test_size=0.2, random_state=5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['festive_period'].fillna(data['festive_period'].mean(), inplace=True)


# 모델 평가 함수 evaluate_model

In [3]:
# 모델 평가 함수 제작. mse 와 R2score 사용. 모델마다 평가할때 evaluate_model로 평가.

from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, X_test, Y_test):
    # Visitors 모델 평가
    y_pred = best_visitors.predict(X_test)
    mse_visitors = mean_squared_error(Y_test, y_pred)
    r2_visitors = r2_score(Y_test, y_pred)


    # 결과 출력
    print(f"Visitors Model Performance:")
    print(f"  MSE: {mse_visitors}")
    print(f"  R²: {r2_visitors}")


# RandomForestRegressor

In [4]:
# train_test 결과값.

from sklearn.ensemble import RandomForestRegressor

def train_models(X_train, Y_train):
    rf_visitors = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_visitors.fit(X_train, Y_train)

    return rf_visitors
rf_visitors = train_models(X_train, Y_train)

evaluate_models(rf_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2353736549.5034213
  R²: 0.6307875684537696


In [5]:
# 그리드 서치를 통한 하이퍼 파라미터 조정
from sklearn.model_selection import GridSearchCV

def tune_rf(X_train, Y_train):
    param_grid_rf = {
        'n_estimators': np.arange(40, 60, 10),
        'max_depth': [16, 17 ,18],
        'min_samples_split':  [3,4,5]}

    rf_visitors = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid_search_rf_visitors = GridSearchCV(estimator=rf_visitors, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')
    grid_search_rf_visitors.fit(X_train, Y_train)
    
    print(f"Best visitors RF : {grid_search_rf_visitors.best_params_}")

    return grid_search_rf_visitors.best_estimator_


best_rf_visitors = tune_rf(X_train, Y_train)


Best visitors RF : {'max_depth': 18, 'min_samples_split': 3, 'n_estimators': 50}


In [6]:
# RandomForest 모델 평가
evaluate_models(best_rf_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2456103963.7405896
  R²: 0.6147299846389713


In [7]:
# K Fold와 cv score 추가로 도입.
# grid search에서 이미 cv = 5를 적용하였지만, 각 fold별 변동폭이 커서, 전체 데이터 셋으로 체크.

from sklearn.model_selection import KFold, cross_val_score


kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(best_rf_visitors, X, Y, cv=kf)

print("Fold별 R2 score:", cv_scores)
print("평균 R2 score:", np.mean(cv_scores))


Fold별 R2 score: [0.45855016 0.71387008 0.59560164 0.44997592 0.67518227 0.50506883
 0.70481614 0.63456602 0.39513638 0.58708823]
평균 R2 score: 0.5719855659706026


# Xgboost

In [8]:
# train test 기본 결과값.
import xgboost as xgb

def train_xgboost(X_train, Y_train):
    
    xgb_visitors = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_visitors.fit(X_train, Y_train)

    xgb_vicost = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_vicost.fit(X_train, Y_train)
    
    return xgb_visitors

xgb_visitors = train_xgboost(X_train, Y_train)

evaluate_models(xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2227631920.3158746
  R²: 0.6505686254209959


In [9]:
# 그리드 서치로 하이퍼 파라미터 조정
from sklearn.model_selection import GridSearchCV
def tune_xgb(X_train, Y_train):
    param_grid_xgb = {
        'n_estimators': np.arange(90,110,10),
        'max_depth': [7,8, 9,10],
        'learning_rate': [0.03],
        'subsample': [0.75,0.8,0.85],
        'colsample_bytree': np.arange(0.5, 0.7, 0.1)}

    xgb_visitors = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    grid_search_xgb_visitors = GridSearchCV(estimator=xgb_visitors, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
    grid_search_xgb_visitors.fit(X_train, Y_train)
    
    print(f"Best xgb visitors: {grid_search_xgb_visitors.best_params_}")

    return grid_search_xgb_visitors.best_estimator_

best_xgb_visitors = tune_xgb(X_train, Y_train)

Best xgb visitors: {'colsample_bytree': 0.6, 'learning_rate': 0.03, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.8}


In [10]:
# 모델 평가 함수로 조정된 sgboost 결과 확인.
evaluate_models(best_xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2409206009.0600166
  R²: 0.6220865037387034


In [11]:
# K Fold와 cv score 추가로 도입.
# grid search에서 이미 cv = 5를 적용하였지만, 각 fold별 변동폭이 커서, 전체 데이터 셋으로 체크.

from sklearn.model_selection import KFold, cross_val_score


kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(best_xgb_visitors, X, Y, cv=kf)

print("Fold별 R2 score:", cv_scores)
print("평균 R2 score:", np.mean(cv_scores))


Fold별 R2 score: [0.52314623 0.71136974 0.66568977 0.48362636 0.65164548 0.52888739
 0.68887691 0.64130707 0.47136475 0.67200936]
평균 R2 score: 0.6037923061766147


# ====================

# GradientBoostingRegressor -- 최종 선정

In [12]:
#기본 GradientBoosting

from sklearn.ensemble import GradientBoostingRegressor

def train_gradient_boosting(X_train, Y_train):
    gb_visitors = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_visitors.fit(X_train, Y_train)

    return gb_visitors

gb_visitors = train_gradient_boosting(X_train, Y_train)

evaluate_models(gb_visitors, X_test, Y_test)


Visitors Model Performance:
  MSE: 2328691116.999703
  R²: 0.634716251566489


In [13]:
# 그리드 서치로 하이퍼 파라미터 검정
from sklearn.model_selection import GridSearchCV
def tune_gb(X_train, Y_train):
    param_grid_gb = {
        'n_estimators': [190,200,210],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.04, 0.02, 0.03],
        'subsample': [0.7, 0.75, 0.8]}
    
    gb_visitors = GradientBoostingRegressor(random_state=42)
    grid_search_gb_visitors = GridSearchCV(estimator=gb_visitors, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error')
    grid_search_gb_visitors.fit(X_train, Y_train)

    print(f"Best gb visitors: {grid_search_gb_visitors.best_params_}")

    return grid_search_gb_visitors.best_estimator_

best_gb_visitors = tune_gb(X_train, Y_train)


Best gb visitors: {'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 210, 'subsample': 0.7}


In [14]:
# 평가함수로 모델 체크.
evaluate_models(best_gb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2286369554.6735315
  R²: 0.6413549074248


In [15]:
# K Fold와 cv score 추가로 도입.
# grid search에서 이미 cv = 5를 적용하였지만, 각 fold별 변동폭이 커서, 전체 데이터 셋으로 한번 더 체크.

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(best_gb_visitors, X, Y, cv=kf)

print("Fold별 R2 score:", cv_scores)
print("평균 R2 score:", np.mean(cv_scores))


Fold별 R2 score: [0.53076437 0.71929564 0.7116982  0.46441509 0.67231274 0.56519664
 0.78881469 0.72028834 0.48794763 0.66145688]
평균 R2 score: 0.6322190225724836


# train test split만
Visitors Model Performance:  
  MSE: 2679345855.5847173  
  R²: 0.5797117572472353  
  
# gridsearch 이후.
Visitors Model Performance:  
  MSE: 2066437694.2304952  
  R²: 0.6758539158145839  
  
# K Fold로 다르게 쪼갰을 때도 같은지 검정.
Fold별 R2 score: [0.57802775 0.67556454 0.70368518 0.56395319 0.55637453 0.57743816  
 0.76224563 0.70915654 0.5779924  0.66033483]  
평균 R2 score: 0.6364772749648543  

# ====================
# RANSACRegressor -- 망함.

In [16]:
# 기본 train test 값
from sklearn.linear_model import RANSACRegressor

def train_ransac(X_train, Y_train):
    ransac_visitors = RANSACRegressor(random_state=42)
    ransac_visitors.fit(X_train, Y_train)
    
    return ransac_visitors

ransac_visitors = train_ransac(X_train, Y_train)

evaluate_models(ransac_visitors, X_test, Y_test)
    

Visitors Model Performance:
  MSE: 6843246960.712327
  R²: -0.0734471751177932


In [17]:
# 그리드 서치로 튜닝

def tune_rs(X_train, Y_train):
    param_grid_ransac = {
        'min_samples': [0.5, 0.7, 0.9],
        'residual_threshold': [5, 10, 20],
        'max_trials': [50, 100, 150],
    }
    ransac_visitors = RANSACRegressor(random_state=42)
    grid_search_ransac_visitors = GridSearchCV(estimator=ransac_visitors, param_grid=param_grid_ransac, cv=5, scoring='neg_mean_squared_error')
    grid_search_ransac_visitors.fit(X_train, Y_train)

    print(f"Best rs visitors: {grid_search_ransac_visitors.best_params_}")

    return grid_search_ransac_visitors.best_estimator_

best_rs_visitors = tune_rs(X_train, Y_train)



Best rs visitors: {'max_trials': 50, 'min_samples': 0.7, 'residual_threshold': 20}


In [18]:
#평가 함수로 평가
evaluate_models(best_rs_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 22596239560.557247
  R²: -2.5444971756565486
