# 1. 환경설정
-------------------------------

In [3]:
#===============================================================================
# ▶ 모듈 불러오기
#===============================================================================

# 시스템 
import os 
import sys
import random

# 데이터분석
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 머신러닝
import sklearn
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import XGBRegressor



# 기타
import warnings
warnings.filterwarnings(action='ignore') 


In [4]:
#===============================================================================
# ▶ 작업환경
#===============================================================================
IS_GOOGLE = True if 'google.colab'                 in sys.modules   else False
IS_KAGGLE = True if 'KAGGLE_KERNEL_RUN_TYPE'       in os.environ    else False
IS_LOCAL  = True if  not (IS_GOOGLE or IS_KAGGLE)                   else False

In [2]:
#===============================================================================
# ▶ 시드설정
#===============================================================================

# 시드설정
SEED = 2024
def set_seed(SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    # torch.manual_seed(SEED)
    # torch.cuda.manual_seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)

# # deterministic 설정
# deterministic = True
# if deterministic:
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark     = False

# set_seed(SEED)

In [21]:
#===============================================================================
# ▶ 데이터 패스설정
#===============================================================================

# 코랩용
if IS_GOOGLE:
    base_path = '/content/drive/MyDrive/프로젝트/__프로젝트폴더명__/data/'

# 캐글용
elif IS_KAGGLE :
    base_path = '/kaggle/input/dogs-vs-cats-redux-kernels-edition/'

# 로컬용
elif IS_LOCAL :
    base_path = './data/'

# 트레인&테스트 경로
train_file = base_path+'train.csv'
test_file  = base_path+'test.csv'

# 기타 데이터 경로
international_trade  = base_path+'international_trade.csv'
sample_submission    = base_path+'sample_submission.csv'

# 결과 저장 경로
result_path = './res/'


# 2. 전처리
-------------------------------

In [22]:
#===============================================================================
# ▶ 데이터 불러오기
#===============================================================================

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
international_trade = pd.read_csv(international_trade)
submission = pd.read_csv(sample_submission)

In [23]:
#===============================================================================
# ▶ 테이블 확인
#===============================================================================
print('train')
display(train_df)
print('='*150)
print()

print('test_df')
display(test_df)
print('='*150)
print()

print('international_trade')
display(international_trade)
print('='*150)
print()

print('sample_submission')
display(sample_submission)
print('='*150)
print()

train


Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0



test_df


Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J



international_trade


Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
...,...,...,...,...,...,...,...
1269,2023-02,포포(papaw)[파파야(papaya)],0,0,23830,71,-71
1270,2023-02,사과,135165,351,0,0,351
1271,2023-02,배,2206012,5411,1,0,5411
1272,2023-02,신 체리[프루너스 체라서스(Prunus cerasus)],5,0,0,0,0



sample_submission


'./data/sample_submission.csv'




In [33]:
# 시계열 처리
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

display(train_df)
display(test_df)

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5
...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2


Unnamed: 0,ID,timestamp,item,corporation,location,year,month,day
0,TG_A_J_20230304,2023-03-04,TG,A,J,2023,3,4
1,TG_A_J_20230305,2023-03-05,TG,A,J,2023,3,5
2,TG_A_J_20230306,2023-03-06,TG,A,J,2023,3,6
3,TG_A_J_20230307,2023-03-07,TG,A,J,2023,3,7
4,TG_A_J_20230308,2023-03-08,TG,A,J,2023,3,8
...,...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J,2023,3,27
1088,RD_F_J_20230328,2023-03-28,RD,F,J,2023,3,28
1089,RD_F_J_20230329,2023-03-29,RD,F,J,2023,3,29
1090,RD_F_J_20230330,2023-03-30,RD,F,J,2023,3,30


In [25]:
# 학습에 불필요한 컬럼 제거
train_x = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
train_y = train_df['price(원/kg)']

test_x = test_df.drop(columns=['ID', 'timestamp'])


In [26]:
# 질적 변수들을 수치화
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    test_x[i]=le.transform(test_x[i]) 

print('Done.')

Done.


In [34]:
# 테이블 확인
display(train_x )
display(train_y )
display(test_x)

Unnamed: 0,item,corporation,location,year,month,day
0,4,0,0,2019,1,1
1,4,0,0,2019,1,2
2,4,0,0,2019,1,3
3,4,0,0,2019,1,4
4,4,0,0,2019,1,5
...,...,...,...,...,...,...
59392,3,5,0,2023,2,27
59393,3,5,0,2023,2,28
59394,3,5,0,2023,3,1
59395,3,5,0,2023,3,2


0           0.0
1           0.0
2        1728.0
3        1408.0
4        1250.0
          ...  
59392     468.0
59393     531.0
59394     574.0
59395     523.0
59396     529.0
Name: price(원/kg), Length: 59397, dtype: float64

Unnamed: 0,item,corporation,location,year,month,day
0,4,0,0,2023,3,4
1,4,0,0,2023,3,5
2,4,0,0,2023,3,6
3,4,0,0,2023,3,7
4,4,0,0,2023,3,8
...,...,...,...,...,...,...
1087,3,5,0,2023,3,27
1088,3,5,0,2023,3,28
1089,3,5,0,2023,3,29
1090,3,5,0,2023,3,30


# 3. 훈련
-------------------------------

In [27]:
# 훈련 시작
model = XGBRegressor()
model.fit(train_x, train_y)


# 4. 예측(inference)
-------------------------------

In [28]:
# 추론
preds = model.predict(test_x)


# 5. 제출(inference)
-------------------------------

In [29]:
submission['answer'] = preds
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3327.997803
1,TG_A_J_20230305,2981.728516
2,TG_A_J_20230306,2168.133301
3,TG_A_J_20230307,3015.339600
4,TG_A_J_20230308,3012.901611
...,...,...
1087,RD_F_J_20230327,-282.096405
1088,RD_F_J_20230328,499.539368
1089,RD_F_J_20230329,-135.349609
1090,RD_F_J_20230330,662.404846


In [30]:
submission.to_csv(result_path+'baseline_submission.csv', index=False)