In [76]:
import pandas as pd
import numpy as np

## data

In [77]:
df_Nvisitors_Date = pd.read_csv("Number_of_visitors_date.csv")

In [78]:
df_Nvisitors_Date

Unnamed: 0,_id,industry_major_cate,std_year_month,std_year,std_month,consumption_amount,region
0,65e932a1bb908e26c804492e,전체,202012,2020,12,59888524,강원
1,65e932a1bb908e26c804492f,전체,202001,2020,1,98147222,강원
2,65e932a1bb908e26c8044930,전체,202002,2020,2,69547139,강원
3,65e932a1bb908e26c8044931,전체,202003,2020,3,62071229,강원
4,65e932a1bb908e26c8044932,전체,202004,2020,4,75022238,강원
...,...,...,...,...,...,...,...
5688,65e932a2bb908e26c8045f66,여가서비스업,202305,2023,5,19760050,충북
5689,65e932a2bb908e26c8045f67,쇼핑업,202305,2023,5,21217701,충북
5690,65e932a2bb908e26c8045f68,여행업,202305,2023,5,10149,충북
5691,65e932a2bb908e26c8045f69,숙박업,202305,2023,5,3267158,충북


- 목표변수(target) : consumption_amount 소비량
- 설명변수(features) : industry_major_cate 카테고리, region 지역, std_year 년도, std_month 월

- 선형 회귀 (Linear Regression)
- 결정 트리 회귀 (Decision Tree Regression)
- 랜덤 포레스트 회귀 (Random Forest Regression)
- 서포트 벡터 머신 회귀 (Support Vector Machine Regression)
- 그라디언트 부스팅 회귀 (Gradient Boosting Regression) 

In [79]:
df_Nvisitors_Date = df_Nvisitors_Date[['consumption_amount','industry_major_cate','region','std_year','std_month']]

## Target Encoding
- 범주형 변수를 목표 변수의 평균값으로 인코딩하는 방법
- industry_major_cate 카테고리, region 지역, std_year 년도, std_month 월

In [80]:
df_Nvisitors_Date['industry_major_cate'].unique()  # array(['전체', '운송업', '여행업', '숙박업', '식음료업', '여가서비스업', '쇼핑업']
df_Nvisitors_Date['region'].unique()               # array(['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산', '인천', '전남', '전북', '제주', '충남', '충북']
df_Nvisitors_Date['std_year'].unique()             # array([2020, 2021, 2022, 2023])
df_Nvisitors_Date['std_month'].unique()            # array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]

array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [93]:
# Target Encoding 함수 정의
def target_encode(df_Nvisitors_Date, cat_feature, target_feature):
    target_means = df_Nvisitors_Date.groupby(cat_feature)[target_feature].mean()
    df_copy = df_Nvisitors_Date.copy()
    df_copy[cat_feature + '_encoded'] = df_copy[cat_feature].map(target_means)
    return df_copy

# 각 범주형 변수에 대해 Target Encoding 적용
df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'industry_major_cate', 'consumption_amount')
df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'region', 'consumption_amount')
df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'std_year', 'consumption_amount')
df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'std_month', 'consumption_amount')

In [96]:
df_Nvisitors_Date

Unnamed: 0,consumption_amount,industry_major_cate,region,std_year,std_month,industry_major_cate_encoded,region_encoded,std_year_encoded,std_month_encoded
0,59888524,전체,강원,2020,12,1.809826e+08,2.921488e+07,4.563079e+07,5.059245e+07
1,98147222,전체,강원,2020,1,1.809826e+08,2.921488e+07,4.563079e+07,4.697124e+07
2,69547139,전체,강원,2020,2,1.809826e+08,2.921488e+07,4.563079e+07,4.288800e+07
3,62071229,전체,강원,2020,3,1.809826e+08,2.921488e+07,4.563079e+07,4.740749e+07
4,75022238,전체,강원,2020,4,1.809826e+08,2.921488e+07,4.563079e+07,5.224922e+07
...,...,...,...,...,...,...,...,...,...
5688,19760050,여가서비스업,충북,2023,5,1.336394e+07,2.312608e+07,5.885084e+07,5.794335e+07
5689,21217701,쇼핑업,충북,2023,5,5.971330e+07,2.312608e+07,5.885084e+07,5.794335e+07
5690,10149,여행업,충북,2023,5,2.986090e+05,2.312608e+07,5.885084e+07,5.794335e+07
5691,3267158,숙박업,충북,2023,5,7.024514e+06,2.312608e+07,5.885084e+07,5.794335e+07


In [97]:
oneHotEncoder_cate.get_feature_names_out()

array(['industry_major_cate_쇼핑업', 'industry_major_cate_숙박업',
       'industry_major_cate_식음료업', 'industry_major_cate_여가서비스업',
       'industry_major_cate_여행업', 'industry_major_cate_운송업',
       'industry_major_cate_전체'], dtype=object)

In [84]:
oneHotEncoder_region.get_feature_names_out()

array(['region_강원', 'region_경기', 'region_경남', 'region_경북', 'region_광주',
       'region_대구', 'region_대전', 'region_부산', 'region_서울', 'region_세종',
       'region_울산', 'region_인천', 'region_전남', 'region_전북', 'region_제주',
       'region_충남', 'region_충북'], dtype=object)

In [85]:
oneHotEncoder_year.get_feature_names_out()

array(['std_year_2020', 'std_year_2021', 'std_year_2022', 'std_year_2023'],
      dtype=object)

In [86]:
oneHotEncoder_month.get_feature_names_out()

array(['std_month_1', 'std_month_2', 'std_month_3', 'std_month_4',
       'std_month_5', 'std_month_6', 'std_month_7', 'std_month_8',
       'std_month_9', 'std_month_10', 'std_month_11', 'std_month_12'],
      dtype=object)

In [87]:
encoder_cate = oneHotEncoder_cate.transform(df_Nvisitors_Date[['industry_major_cate']]).toarray()
encoder_cate.shape

(5693, 7)

In [88]:
encoder_region = oneHotEncoder_region.transform(df_Nvisitors_Date[['region']]).toarray()
encoder_region.shape

(5693, 17)

In [89]:
encoder_year = oneHotEncoder_year.transform(df_Nvisitors_Date[['std_year']]).toarray()
encoder_year.shape

(5693, 4)

In [90]:
encoder_month = oneHotEncoder_month.transform(df_Nvisitors_Date[['std_month']]).toarray()
encoder_month.shape

(5693, 12)

In [91]:
df_encoder_cate = pd.DataFrame(data=encoder_cate, columns=oneHotEncoder_cate.get_feature_names_out())
df_encoder_cate[:2]

Unnamed: 0,industry_major_cate_쇼핑업,industry_major_cate_숙박업,industry_major_cate_식음료업,industry_major_cate_여가서비스업,industry_major_cate_여행업,industry_major_cate_운송업,industry_major_cate_전체
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [92]:
df_encoder_region = pd.DataFrame(data=encoder_region, columns=oneHotEncoder_region.get_feature_names_out())
df_encoder_region[:2]

Unnamed: 0,region_강원,region_경기,region_경남,region_경북,region_광주,region_대구,region_대전,region_부산,region_서울,region_세종,region_울산,region_인천,region_전남,region_전북,region_제주,region_충남,region_충북
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
df_encoder_year = pd.DataFrame(data=encoder_year, columns=oneHotEncoder_year.get_feature_names_out())
df_encoder_year[:2]

Unnamed: 0,std_year_2020,std_year_2021,std_year_2022,std_year_2023
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0


In [56]:
df_encoder_month = pd.DataFrame(data=encoder_month, columns=oneHotEncoder_month.get_feature_names_out())
df_encoder_month[:2]

Unnamed: 0,std_month_1,std_month_2,std_month_3,std_month_4,std_month_5,std_month_6,std_month_7,std_month_8,std_month_9,std_month_10,std_month_11,std_month_12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
df_Nvisitors_Date = pd.concat((df_Nvisitors_Date.reset_index(drop=True), df_encoder_cate.reset_index(drop=True)), axis=1)
df_Nvisitors_Date = pd.concat((df_Nvisitors_Date.reset_index(drop=True), df_encoder_region.reset_index(drop=True)), axis=1)
df_Nvisitors_Date = pd.concat((df_Nvisitors_Date.reset_index(drop=True), df_encoder_year.reset_index(drop=True)), axis=1)
df_Nvisitors_Date = pd.concat((df_Nvisitors_Date.reset_index(drop=True), df_encoder_month.reset_index(drop=True)), axis=1)
df_Nvisitors_Date[:2]

Unnamed: 0,consumption_amount,industry_major_cate,region,std_year,std_month,industry_major_cate_쇼핑업,industry_major_cate_숙박업,industry_major_cate_식음료업,industry_major_cate_여가서비스업,industry_major_cate_여행업,...,std_month_3,std_month_4,std_month_5,std_month_6,std_month_7,std_month_8,std_month_9,std_month_10,std_month_11,std_month_12
0,59888524,전체,강원,2020,12,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,98147222,전체,강원,2020,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 정형화 단계 - target과 feature 분리

In [98]:
target_train = df_Nvisitors_Date['consumption_amount']  # 목표변수
features_train = df_Nvisitors_Date.drop(columns=['consumption_amount', 'industry_major_cate', 'region', 'std_year', 'std_month'])  # 설명변수
target_train.shape, features_train.shape

((5693,), (5693, 4))

## 모델

In [99]:
# 선형 회귀(Linear Regression)
from sklearn.linear_model import LinearRegression
model_LinearRegression = LinearRegression()
model_LinearRegression

In [100]:
# 결정 트리 회귀 (Decision Tree Regression)
from sklearn.tree import DecisionTreeRegressor
model_DecisionTreeRegressor = DecisionTreeRegressor()
model_DecisionTreeRegressor

In [101]:
# 랜덤 포레스트 회귀 (Random Forest Regression)
from sklearn.ensemble import RandomForestRegressor
model_RandomForestRegressor = RandomForestRegressor()
model_RandomForestRegressor

In [102]:
# 서포트 벡터 머신 회귀 (Support Vector Machine Regression)
from sklearn.svm import SVR
model_SVR = SVR()
model_SVR

In [103]:
# 그라디언트 부스팅 회귀 (Gradient Boosting Regression)
from sklearn.ensemble import GradientBoostingRegressor
model_GradientBoostingRegressor = GradientBoostingRegressor()
model_GradientBoostingRegressor

In [104]:
model_LinearRegression.fit(features_train, target_train)

In [105]:
model_DecisionTreeRegressor.fit(features_train, target_train)

In [106]:
model_RandomForestRegressor.fit(features_train, target_train)

In [107]:
model_SVR.fit(features_train, target_train)

In [108]:
model_GradientBoostingRegressor.fit(features_train, target_train)

## 평가

In [121]:
df_Nvisitors_Date[30:35]

Unnamed: 0,consumption_amount,industry_major_cate,region,std_year,std_month,industry_major_cate_encoded,region_encoded,std_year_encoded,std_month_encoded
30,10151006,숙박업,강원,2020,11,7024514.0,29214880.0,45630790.0,53135880.0
31,87861,운송업,강원,2020,11,6903884.0,29214880.0,45630790.0,53135880.0
32,13477240,쇼핑업,강원,2020,11,59713300.0,29214880.0,45630790.0,53135880.0
33,14471307,여가서비스업,강원,2020,11,13363940.0,29214880.0,45630790.0,53135880.0
34,45772254,식음료업,강원,2020,11,93685300.0,29214880.0,45630790.0,53135880.0


In [146]:
model_DecisionTreeRegressor.predict(features_train[30:35])
# 실제값 : 10151006, 87861, 13477240, 14471307, 45772254
# 예측값 : 10330215.74, 91402.88, 13647203.87, 15064529.54, 47389272.86

array([10151006.,    87861., 13477240., 14471307., 45772254.])

### 평가 수치

In [147]:
target_train_predict = model_DecisionTreeRegressor.predict(features_train)
target_train_predict.shape

(5693,)

In [148]:
# 정확도
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(target_train, target_train_predict)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 0.0


In [149]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(target_train, target_train_predict)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 0.0
