# **Title Name :  제주특산물가격예측**
------------------------------------------------------------
<p style="font-weight:bolder; font-size : 21px">
    Step : 2
</p>
<p style="font-weight:bolder; font-size : 21px">
   RegDate : 2024.03.12
</p>



## 데이터셋 설명
---------------------------------

### train.csv
- item: 품목 코드
    - TG : 감귤
    - BC : 브로콜리
    - RD : 무
    - CR : 당근
    - CB : 양배추  
- corporation : 유통 법인 코드
    - 법인 A부터 F 존재 
- location : 지역 코드
    - J : 제주도 제주시
    - S : 제주도 서귀포시
- supply(kg) : 유통된 물량, kg 단위
- price(원/kg) : 유통된 품목들의 kg 마다의 가격, 원 단위

### international_trade.csv
- 관련 품목 수출입 정보
    - 중량 단위 kg
    - 금액 단위 천 달러

# 1. 환경설정
-------------------------------

In [28]:
#===============================================================================
# ▶ 모듈 불러오기
#===============================================================================

# 시스템 
import os 
import sys
import random

# 데이터분석
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 모델
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# 훈련
from sklearn.model_selection import train_test_split

# 최적화
from sklearn.model_selection import GridSearchCV

#전처리
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import holidays



# 기타
import warnings
warnings.filterwarnings(action='ignore') 


In [29]:
#===============================================================================
# ▶ Runtime environment detection
#===============================================================================
# Colab is recognised by its already-imported package, Kaggle by an environment
# variable; anything else is treated as a local run.
IS_GOOGLE = 'google.colab' in sys.modules
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
IS_LOCAL = not (IS_GOOGLE or IS_KAGGLE)

In [30]:
#===============================================================================
# ▶ Seed configuration
#===============================================================================

# Global seed shared by the randomised steps of this notebook.
SEED = 2024


def set_seed(SEED):
    """Seed the global random number generators used by this notebook.

    Args:
        SEED: Integer seed applied to ``random`` and ``numpy.random`` and
            exported as ``PYTHONHASHSEED``.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    # torch.manual_seed(SEED)
    # torch.cuda.manual_seed(SEED)
    # NOTE: hash randomisation of the running interpreter is fixed at start-up,
    # so this value only takes effect in processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(SEED)

# # deterministic setting (only relevant once torch is used)
# deterministic = True
# if deterministic:
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark     = False

# Apply the seed: defining SEED and set_seed without calling it leaves the
# global generators unseeded.
set_seed(SEED)

In [31]:
#===============================================================================
# ▶ Data path configuration
#===============================================================================

# Candidate data directories in priority order: Colab, Kaggle, local.
# The first environment flag that is set decides `base_path`.
for is_active, data_dir in (
    (IS_GOOGLE, '/content/drive/MyDrive/프로젝트/제주특산물가격예측/data/'),
    (IS_KAGGLE, '/kaggle/input/__프로젝트명__/'),
    (IS_LOCAL, './data/'),
):
    if is_active:
        base_path = data_dir
        break

# Train & test file paths
train_file, test_file = (base_path + file_name for file_name in ('train.csv', 'test.csv'))

# Auxiliary data paths
international_trade, sample_submission = (
    base_path + file_name
    for file_name in ('international_trade.csv', 'sample_submission.csv')
)

# Output directory for results
result_path = './res/'


# 2. 전처리
-------------------------------

In [32]:
#===============================================================================
# ▶ Load the data
#===============================================================================

# Read the train and test tables (in that order).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# NOTE(review): `international_trade` holds the CSV path before this line and the
# loaded DataFrame after it, so this cell cannot be re-run without re-running the
# path-configuration cell first.
international_trade = pd.read_csv(international_trade)

# Submission template
submission = pd.read_csv(sample_submission)

In [33]:
#===============================================================================
# ▶ Inspect the tables
#===============================================================================
# Each entry is (heading, table, renderer). The submission table is shown with
# plain print(); the others use the notebook's rich display().
for heading, table, render in (
    ('train', train_df, display),
    ('test_df', test_df, display),
    ('international_trade', international_trade, display),
    ('submission', submission, print),
):
    print(heading)
    print('=' * 150)
    render(table)
    print()

train


Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0



test_df


Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J



international_trade


Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
...,...,...,...,...,...,...,...
1269,2023-02,포포(papaw)[파파야(papaya)],0,0,23830,71,-71
1270,2023-02,사과,135165,351,0,0,351
1271,2023-02,배,2206012,5411,1,0,5411
1272,2023-02,신 체리[프루너스 체라서스(Prunus cerasus)],5,0,0,0,0



submission
                   ID  answer
0     TG_A_J_20230304       0
1     TG_A_J_20230305       0
2     TG_A_J_20230306       0
3     TG_A_J_20230307       0
4     TG_A_J_20230308       0
...               ...     ...
1087  RD_F_J_20230327       0
1088  RD_F_J_20230328       0
1089  RD_F_J_20230329       0
1090  RD_F_J_20230330       0
1091  RD_F_J_20230331       0

[1092 rows x 2 columns]



## 각 테이블 확인결과
------------------------

#### 01. train_df와 test_df 컬럼일치시키기
- train_df의 supply(kg)와 price(원/kg)를 제거하면 test_df와 같아짐

#### 02. 불필요한 컬럼 제거
- ID는 불필요함
- timestamp는 연,월,일,요일 등으로 파생피쳐 생성후 제거
- item,corporation,location 컬럼은 원핫인코딩 처리

#### train_X, train_y 분할
- X에 불필요한 컬럼제거(DataFrame)
- y에 타겟피쳐인 price만 추가(Series)

#### test_X
- 불필요한 컬럼제거

#### 훈련(학습) -> 예측(추론)
- train_X, train_y로 학습
- test_X로 추론

In [34]:
#===============================================================================
# 01. Date feature processing
#===============================================================================

# South Korean public-holiday calendar, created once and shared by every lookup
# so that it is not rebuilt for each row the function is mapped over.
_KR_HOLIDAYS = holidays.KR()


def make_holiday(x):
    """Label a date as a South Korean public holiday.

    Args:
        x: A date-like value accepted by the ``holidays`` membership test
            (the notebook passes pandas timestamps).

    Returns:
        1 if ``x`` is found in the South Korean holiday calendar, otherwise 0.
    """
    return 1 if x in _KR_HOLIDAYS else 0

# Parse the date strings and derive calendar features, modifying both frames in place.
for frame in (train_df, test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%Y-%m-%d')
    stamp = frame['timestamp'].dt

    # These datetime attributes map one-to-one onto same-named columns.
    for part in ('year', 'month', 'day', 'weekday', 'day_of_year'):
        frame[part] = getattr(stamp, part)
    frame['week_of_year'] = stamp.isocalendar().week
    frame["holiday"] = frame["timestamp"].map(make_holiday)

    # Shorten the unit-suffixed column names; columns a frame lacks are left alone.
    frame.rename(columns={'supply(kg)':'supply', 'price(원/kg)':'price'}, inplace=True)


# Show the training frame (swap in test_df to inspect the test frame instead).
train_df
# test_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,day,weekday,day_of_year,week_of_year,holiday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2,2,1,0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3,3,1,0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4,4,1,0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,0,58,9,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,1,59,9,0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,2,60,9,1
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,3,61,9,0


In [35]:
#===============================================================================
# 02. Add the mean-supply derived feature
#===============================================================================
# Mean supply per (item, corporation, ISO week), computed from the training data
# and attached to both frames as `supply_week_of_year_mean`.
group_keys = ['item', 'corporation', 'week_of_year']
supply_mean = train_df.groupby(group_keys)['supply'].mean().reset_index()

# train_df already has a `supply` column, so the merge yields supply_x / supply_y.
# test_df has none, so the merged column arrives simply as `supply`.
# NOTE(review): dropna() would silently remove any test row whose group is missing
# from the training data, leaving fewer rows than the submission template — confirm
# that no test rows are dropped.
train_df = train_df.merge(supply_mean, on=group_keys, how='left').dropna()
test_df = test_df.merge(supply_mean, on=group_keys, how='left').dropna()

train_df = train_df.rename(columns={'supply_x': 'supply', 'supply_y': 'supply_week_of_year_mean', 'price(원/kg)': 'price'})
test_df = test_df.rename(columns={'supply': 'supply_week_of_year_mean'})

train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,day,weekday,day_of_year,week_of_year,holiday,supply_week_of_year_mean
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,1,1,52808.920588
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2,2,1,0,52808.920588
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3,3,1,0,52808.920588
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4,4,1,0,52808.920588
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5,5,1,0,52808.920588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,0,58,9,0,376770.303030
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,1,59,9,0,376770.303030
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,2,60,9,1,376770.303030
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,3,61,9,0,376770.303030


In [36]:
#===============================================================================
# 03. Categorical feature handling
#===============================================================================
# handle_unknown='ignore' encodes a category that was not present at fit time as an
# all-zero row instead of raising, so predicting on data containing such a value
# (e.g. a later year or an unseen day) does not abort. Known categories are encoded
# exactly as before.
one_hot = OneHotEncoder(handle_unknown='ignore')

# Every calendar field is treated as categorical alongside the three code columns.
categorical_features = ['item', 'corporation', 'location', 'year', 'month', 'day', 'weekday',  'day_of_year','week_of_year','holiday']

# Columns not listed above are passed through to the model unchanged.
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

In [37]:
# Build the modelling pipeline: preprocessing followed by a single regressor.
# Pipeline step names must be unique, and every step except the last must be a
# transformer, so a second ('model', RandomForestRegressor()) entry cannot share
# this pipeline. XGBoost is kept because the `model__*` search grid and the
# submission file name in this notebook both refer to it.
model_pipeline = Pipeline([
    ('trans', transformer),
    ('model', XGBRegressor(objective='reg:squarederror')),
])

In [38]:
# Hyper-parameter search grid for the XGBoost step.
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__max_depth=[3, 5, 7],
    model__colsample_bytree=[0.7, 0.8],
    model__subsample=[0.7, 0.8],
)

In [39]:
# Split the data.
# The target, the raw supply, the timestamp and the row ID are excluded from the features.
non_feature_columns = ['price', 'supply', 'timestamp', 'ID']
X = train_df.drop(columns=non_feature_columns)
y = train_df['price']

# Hold out 20% of the rows for validation, seeded for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [40]:
# #XGBoost hyper-parameter search (disabled)
# grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
# grid_search.fit(X_train, y_train)

# #Best hyper-parameters found
# print("Best parameters:", grid_search.best_params_)

# #Reconfigure the pipeline with the best hyper-parameters
# model_pipeline.set_params(**grid_search.best_params_)


# 3. 훈련 및 검증
------------------

In [43]:
# 평가지표생성
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [42]:
# Train the model: fits every pipeline step on the training split
model_pipeline.fit(X_train, y_train)


In [44]:
# Evaluate on the validation split.
valid_predictions = model_pipeline.predict(X_valid)
# Negative predicted values are replaced with 0 before scoring.
valid_predictions[valid_predictions < 0] = 0

mae = mean_absolute_error(y_valid, valid_predictions)
mse = mean_squared_error(y_valid, valid_predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"RMSE: {rmse}")


Mean Squared Error: 846582.4167761093
Mean Absolute Error: 400.21705032795643
RMSE: 920.0991342111507


# 4. 예측(inference)
-------------------------------

In [45]:
# Inference on the competition test set (not the validation split): the
# submission needs one prediction per test row.
# Select exactly the feature columns the pipeline was fitted on, in the same order.
X_test = test_df[X_train.columns]
preds = model_pipeline.predict(X_test)
# Replace negative predictions with 0, matching the validation step.
preds = np.where(preds < 0, 0, preds)


# 5. 제출(submission)
-------------------------------

In [None]:
# Store the predictions in the submission template's `answer` column and show it.
# Assumes `preds` holds exactly one value per submission row, in the same row order.
submission['answer'] = preds
submission

In [None]:
# Write the submission file. The output directory is created first because
# to_csv does not create missing parent directories.
os.makedirs(result_path, exist_ok=True)
submission.to_csv(result_path+'XGB_pip'+'baseline_submission.csv', index=False)