In [1]:
import pandas as pd
import numpy as np

In [2]:
np.set_printoptions(formatter={'all': '{:f}'.format})

## data

In [3]:
df_Nvisitors_Date = pd.read_csv("Number_of_visitors_date.csv")

In [4]:
df_Nvisitors_Date

Unnamed: 0,_id,industry_major_cate,std_year_month,std_year,std_month,consumption_amount,region
0,65e932a1bb908e26c804492e,전체,202012,2020,12,59888524,강원
1,65e932a1bb908e26c804492f,전체,202001,2020,1,98147222,강원
2,65e932a1bb908e26c8044930,전체,202002,2020,2,69547139,강원
3,65e932a1bb908e26c8044931,전체,202003,2020,3,62071229,강원
4,65e932a1bb908e26c8044932,전체,202004,2020,4,75022238,강원
...,...,...,...,...,...,...,...
5688,65e932a2bb908e26c8045f66,여가서비스업,202305,2023,5,19760050,충북
5689,65e932a2bb908e26c8045f67,쇼핑업,202305,2023,5,21217701,충북
5690,65e932a2bb908e26c8045f68,여행업,202305,2023,5,10149,충북
5691,65e932a2bb908e26c8045f69,숙박업,202305,2023,5,3267158,충북


- 목표변수(target) : consumption_amount 소비량
- 설명변수(features) : industry_major_cate 카테고리, region 지역, std_year 년도, std_month 월

- 선형 회귀 (Linear Regression)
- 결정 트리 회귀 (Decision Tree Regression)
- 랜덤 포레스트 회귀 (Random Forest Regression)
- 서포트 벡터 머신 회귀 (Support Vector Machine Regression)
- 그라디언트 부스팅 회귀 (Gradient Boosting Regression) 

In [5]:
df_Nvisitors_Date = df_Nvisitors_Date[['consumption_amount','industry_major_cate','region','std_year','std_month']]

## Label Encoding
- 각 범주형 값에 연속적인 정수를 부여하는 방법
- 연도나 월 같이 순서 또는 크기가 있고, 숫자 간의 관계를 고려해야 할 때 적합
- std_year 년도, std_month 월

In [6]:
df_Nvisitors_Date['industry_major_cate'].unique()  # array(['전체', '운송업', '여행업', '숙박업', '식음료업', '여가서비스업', '쇼핑업']
df_Nvisitors_Date['region'].unique()               # array(['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산', '인천', '전남', '전북', '제주', '충남', '충북']
df_Nvisitors_Date['std_year'].unique()             # array([2020, 2021, 2022, 2023])
df_Nvisitors_Date['std_month'].unique()            # array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]

array([2020.000000, 2021.000000, 2022.000000, 2023.000000])

In [7]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# 'std_year' 및 'std_month' 컬럼에 대해 라벨 인코딩 적용
df_Nvisitors_Date['std_year_encoded'] = label_encoder.fit_transform(df_Nvisitors_Date['std_year'])
df_Nvisitors_Date['std_month_encoded'] = label_encoder.fit_transform(df_Nvisitors_Date['std_month'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Nvisitors_Date['std_year_encoded'] = label_encoder.fit_transform(df_Nvisitors_Date['std_year'])


## Feature Importance Encoding
- 각 범주형 변수가 목표 변수에 미치는 영향력을 고려하여 인코딩하는 방법
- industry_major_cate 카테고리, region 지역

In [8]:
# Feature Importance를 측정하고 중요도에 따른 순서 정의
ordering_industry = df_Nvisitors_Date.groupby('industry_major_cate')['consumption_amount'].mean().sort_values().index
ordering_region = df_Nvisitors_Date.groupby('region')['consumption_amount'].mean().sort_values().index

# 중요도에 따라 범주를 재정렬하고 라벨 인코딩
df_Nvisitors_Date['industry_major_cate_encoded'] = df_Nvisitors_Date['industry_major_cate'].map({k: i for i, k in enumerate(ordering_industry)})
df_Nvisitors_Date['region_encoded'] = df_Nvisitors_Date['region'].map({k: i for i, k in enumerate(ordering_region)})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Nvisitors_Date['industry_major_cate_encoded'] = df_Nvisitors_Date['industry_major_cate'].map({k: i for i, k in enumerate(ordering_industry)})


In [9]:
# # Target Encoding
# def target_encode(df_Nvisitors_Date, cat_feature, target_feature):
#     target_means = df_Nvisitors_Date.groupby(cat_feature)[target_feature].mean()
#     df_copy = df_Nvisitors_Date.copy()
#     df_copy[cat_feature + '_encoded'] = df_copy[cat_feature].map(target_means)
#     return df_copy

# # 각 범주형 변수에 대해 Target Encoding 적용
# df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'industry_major_cate', 'consumption_amount')
# df_Nvisitors_Date = target_encode(df_Nvisitors_Date, 'region', 'consumption_amount')

In [27]:
df_Nvisitors_Date

Unnamed: 0,consumption_amount,industry_major_cate,region,std_year,std_year_encoded,industry_major_cate_encoded,region_encoded
0,59888524,전체,강원,2020,0,6,9
1,98147222,전체,강원,2020,0,6,9
2,69547139,전체,강원,2020,0,6,9
3,62071229,전체,강원,2020,0,6,9
4,75022238,전체,강원,2020,0,6,9
...,...,...,...,...,...,...,...
5688,19760050,여가서비스업,충북,2023,3,3,5
5689,21217701,쇼핑업,충북,2023,3,4,5
5690,10149,여행업,충북,2023,3,0,5
5691,3267158,숙박업,충북,2023,3,2,5


## 정형화 단계 - target과 feature 분리

In [30]:
from sklearn.model_selection import train_test_split

y = df_Nvisitors_Date['consumption_amount']  # 목표변수
X = df_Nvisitors_Date[['industry_major_cate_encoded', 'region_encoded', 'std_year_encoded']]  # 설명변수

# train, test 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4554, 3), (1139, 3), (4554,), (1139,))

## 모델

In [31]:
# 선형 회귀(Linear Regression)
from sklearn.linear_model import LinearRegression
model_LinearRegression = LinearRegression()
model_LinearRegression

In [32]:
# 결정 트리 회귀 (Decision Tree Regression)
from sklearn.tree import DecisionTreeRegressor
model_DecisionTreeRegressor = DecisionTreeRegressor()
model_DecisionTreeRegressor

In [33]:
# 랜덤 포레스트 회귀 (Random Forest Regression)
from sklearn.ensemble import RandomForestRegressor
model_RandomForestRegressor = RandomForestRegressor()
model_RandomForestRegressor

In [34]:
# 서포트 벡터 머신 회귀 (Support Vector Machine Regression)
from sklearn.svm import SVR
model_SVR = SVR()
model_SVR

In [35]:
# 그라디언트 부스팅 회귀 (Gradient Boosting Regression)
from sklearn.ensemble import GradientBoostingRegressor
model_GradientBoostingRegressor = GradientBoostingRegressor()
model_GradientBoostingRegressor

In [36]:
model_LinearRegression.fit(X_train, y_train)

In [37]:
model_DecisionTreeRegressor.fit(X_train, y_train)

In [38]:
model_RandomForestRegressor.fit(X_train, y_train)

In [39]:
model_SVR.fit(X_train, y_train)

In [40]:
model_GradientBoostingRegressor.fit(X_train, y_train)

## 평가

In [41]:
df_Nvisitors_Date[30:35]

Unnamed: 0,consumption_amount,industry_major_cate,region,std_year,std_year_encoded,industry_major_cate_encoded,region_encoded
30,10151006,숙박업,강원,2020,0,2,9
31,87861,운송업,강원,2020,0,1,9
32,13477240,쇼핑업,강원,2020,0,4,9
33,14471307,여가서비스업,강원,2020,0,3,9
34,45772254,식음료업,강원,2020,0,5,9


In [42]:
model_DecisionTreeRegressor.predict(X_test[30:35])
# 실제값 : 10151006, 87861, 13477240, 14471307, 45772254
# 예측값 : 

array([1407002.363636, 125233.888889, 73634111.111111, 395427951.142857,
       1460026.857143])

### 평가 수치

In [43]:
target_test_predict = model_DecisionTreeRegressor.predict(X_test)
target_test_predict.shape

(1139,)

In [44]:
# MSE - 평균 제곱 오차 / 모델이 예측한 값과 실제 값 사이의 차이를 제곱한 후 평균을 구한 값
# 0과 가까울 수록 모델의 예측이 정확함
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, target_test_predict)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 287561541361968.1


In [45]:
# MAE - 평균 절대 오차 / 모델이 예측한 값과 실제 값 사이의 절대값 차이를 평균화한 값
# 0과 가까울 수록 모델의 예측이 정확함
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, target_test_predict)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 6109769.494281331


### 평가 결과 : 안좋음