# Library Import & Settings

In [11]:
import pandas as pd
import numpy as np
import lightgbm
from tqdm import tqdm
import warnings

In [12]:
# 경고 끄기
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

# 전처리
### lag_frame 추가 및 기타 전처리

In [18]:
def preprocessing(temp_df, pum, len_lag):
    for lag in range(1, len_lag+1):
        temp_df[f'p_lag_{lag}'] = -1
        temp_df[f'q_lag_{lag}'] = -1
        for index in range(lag, len(temp_df)):
            temp_df.loc[index, f'p_lag{lag}'] = temp_df[f'{pum}_가격(원/kg)'][index-lag] # 1일전, 2일전, ... 가격을 feature로 추가
            temp_df.loc[index, f'q_lag{lag}'] = temp_df[f'{pum}_거래량(kg)'][index-lag] # 1일전, 2일전, ... 거래량을 feature로 추가
        
    # month 추가
    temp_df['date'] = pd.to_datetime(temp_df['date'])
    temp_df['month'] = temp_df['date'].dt.month
    
    for week in ['1_week', '2_week', '4_week']:
        temp_df[week] = 0
        n_week = int(week[0])
        for index in range(len(temp_df)):
            try: temp_df[week][index] = temp_df[f'{pum}_가격(원/kg)'][index+7*n_week]
            except: continue
    
    # 불필요한 column 제거
    temp_df = temp_df.drop(['date', f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)'], axis=1)
    
    return temp_df          

In [15]:
train = pd.read_csv('data/public_data/train.csv')
train.head(2)

Unnamed: 0,date,요일,배추_거래량(kg),배추_가격(원/kg),무_거래량(kg),무_가격(원/kg),양파_거래량(kg),양파_가격(원/kg),건고추_거래량(kg),건고추_가격(원/kg),...,청상추_거래량(kg),청상추_가격(원/kg),백다다기_거래량(kg),백다다기_가격(원/kg),애호박_거래량(kg),애호박_가격(원/kg),캠벨얼리_거래량(kg),캠벨얼리_가격(원/kg),샤인마스캇_거래량(kg),샤인마스캇_가격(원/kg)
0,2016-01-01,금요일,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2016-01-02,토요일,80860.0,329.0,80272.0,360.0,122787.5,1281.0,3.0,11000.0,...,5125.0,9235.0,434.0,2109.0,19159.0,2414.0,880.0,2014.0,0.0,0.0


In [51]:
# preprocessing 함수 예시
pum = '배추'
temp_df = train[['date', f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
preprocessing(temp_df, z, len_lag=28)

Unnamed: 0,p_lag_1,q_lag_1,p_lag1,q_lag1,p_lag_2,q_lag_2,p_lag2,q_lag2,p_lag_3,q_lag_3,...,p_lag27,q_lag27,p_lag_28,q_lag_28,p_lag28,q_lag28,month,1_week,2_week,4_week
0,-1,-1,,,-1,-1,,,-1,-1,...,,,-1,-1,,,1,420,449,625
1,-1,-1,0.0,0.0,-1,-1,,,-1,-1,...,,,-1,-1,,,1,389,454,733
2,-1,-1,329.0,80860.0,-1,-1,0.0,0.0,-1,-1,...,,,-1,-1,,,1,0,0,1048
3,-1,-1,0.0,0.0,-1,-1,329.0,80860.0,-1,-1,...,,,-1,-1,,,1,398,475,638
4,-1,-1,478.0,1422742.5,-1,-1,0.0,0.0,-1,-1,...,,,-1,-1,,,1,431,511,597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1728,-1,-1,1807.0,2007471.3,-1,-1,1838.0,1757465.6,-1,-1,...,1564.0,763266.0,-1,-1,1561.0,1020033.2,9,0,0,0
1729,-1,-1,1839.0,1856965.0,-1,-1,1807.0,2007471.3,-1,-1,...,1476.0,760499.0,-1,-1,1564.0,763266.0,9,0,0,0
1730,-1,-1,1789.0,1880095.5,-1,-1,1839.0,1856965.0,-1,-1,...,0.0,0.0,-1,-1,1476.0,760499.0,9,0,0,0
1731,-1,-1,1760.0,1661090.9,-1,-1,1789.0,1880095.5,-1,-1,...,1133.0,1441152.8,-1,-1,0.0,0.0,9,0,0,0


# 학습

### metric 정의

In [38]:
def nmae(week_answer, week_submission):
    answer = week_answer.to_numpy()
    target_idx = np.where(answer!=0)
    true = answer[target_idx]
    pred = week_submission[target_idx]
    score = np.mean(np.abs(true-pred)/true)
    
def at_nmae(pred, dataset):
    y_true = dataset.get_label()
    week_1_answer = y_true[0::3]
    week_2_answer = y_true[1::3]
    week_4_answer = y_true[2::3]
    
    week_1_submission = pred[0::3]
    week_2_submission = pred[2::3]
    week_4_submission = pred[3::3]
    
    score1 = nmae(week_1_answer, week_1_submission)
    score2 = nmae(week_2_answer, week_2_submission)
    score4 = nmae(week_4_answer, week_4_submission)
    
    score = (score1 + score2 + score3)/3
    
    return 'score', score, False

### 학습 정의

In [22]:
def model_train(x_train, y_train, x_valid, y_valid):
    params = {'learning_rate':0.01,
              'max_depth':6,
              'boosting':'gbdt',
              'objective':'regression',
              'is_training_metric':True,
              'num_leaves':100,
              'feature_fraction':0.8,
              'bagging_fraction':0.8,
              'bagging_freq':5,
              'seed':42,
              'num_threads':8
    }
    
    model = lightgbm.train(params,
                           train_set = lightgbm.Dataset(data = x_train, label = y_train),
                           num_boost_round = 10000,
                           valid_sets = lightgbm.Dataset(data = x_valid, label = y_valid),
                           init_model = None,
                           early_stopping_rounds = 100,
                           feval = at_nmae,
                           verbose_eval = False
                          )
    
    return model

## 품목 및 품종별 모델 학습

In [24]:
unique_pum = [
    '배추', '무', '양파', '건고추','마늘',
    '대파', '얼갈이배추', '양배추', '깻잎',
    '시금치', '미나리', '당근',
    '파프리카', '새송이', '팽이버섯', '토마토',
]

unique_kind = [
    '청상추', '백다다기', '애호박', '캠벨얼리', '샤인마스캇'
]

In [25]:
unique_pum+unique_kind

['배추',
 '무',
 '양파',
 '건고추',
 '마늘',
 '대파',
 '얼갈이배추',
 '양배추',
 '깻잎',
 '시금치',
 '미나리',
 '당근',
 '파프리카',
 '새송이',
 '팽이버섯',
 '토마토',
 '청상추',
 '백다다기',
 '애호박',
 '캠벨얼리',
 '샤인마스캇']

In [39]:
model_dict = {}
split = 28 #validation

for pum in tqdm(unique_pum + unique_kind):
    temp_df = train[['date', f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
    temp_df = preprocessing(temp_df, pum, len_lag=28)
    
    for week_num in [1,2,4]:
        x = temp_df[temp_df[f'{week_num}_week']>0].iloc[:,:-3]
        y = temp_df[temp_df[f'{week_num}_week']>0][f'{week_num}_week']
        
        #train, test split
        x_train = x[:-split]
        y_train = y[:-split]
        x_valid = x[-split:]
        y_valid = y[-split:]
        
        model_dict[f'{pum}_model_{week_num}'] = model_train(x_train, y_train, x_valid, y_valid)

  0%|                                                                                                                                                                                                                                                                                                                                                                                                                | 0/21 [00:20<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14293
[LightGBM] [Info] Number of data points in the train set: 1447, number of used features: 57
[LightGBM] [Info] Start training from score 679.538355





AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

## 추론

In [48]:
submission = pd.read_csv('data/sample_submission.csv')
public_date_list = submission[submission['예측대상일자'].str.contains('2020')]['예측대상일자'].str.split('+').str[0].unique()

for date in (tqdm(public_date_list)):
    test = pd.read_csv(f'data/public_data/test_files/test_{date}.csv')
    print(test)

 21%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                                                                                                                                                     | 8/38 [00:00<00:00, 77.93it/s]

Empty DataFrame
Columns: [date, 요일, 배추_거래량(kg), 배추_가격(원/kg), 무_거래량(kg), 무_가격(원/kg), 양파_거래량(kg), 양파_가격(원/kg), 건고추_거래량(kg), 건고추_가격(원/kg), 마늘_거래량(kg), 마늘_가격(원/kg), 대파_거래량(kg), 대파_가격(원/kg), 얼갈이배추_거래량(kg), 얼갈이배추_가격(원/kg), 양배추_거래량(kg), 양배추_가격(원/kg), 깻잎_거래량(kg), 깻잎_가격(원/kg), 시금치_거래량(kg), 시금치_가격(원/kg), 미나리_거래량(kg), 미나리_가격(원/kg), 당근_거래량(kg), 당근_가격(원/kg), 파프리카_거래량(kg), 파프리카_가격(원/kg), 새송이_거래량(kg), 새송이_가격(원/kg), 팽이버섯_거래량(kg), 팽이버섯_가격(원/kg), 토마토_거래량(kg), 토마토_가격(원/kg), 청상추_거래량(kg), 청상추_가격(원/kg), 백다다기_거래량(kg), 백다다기_가격(원/kg), 애호박_거래량(kg), 애호박_가격(원/kg), 캠벨얼리_거래량(kg), 캠벨얼리_가격(원/kg), 샤인마스캇_거래량(kg), 샤인마스캇_가격(원/kg)]
Index: []

[0 rows x 44 columns]
         date   요일  배추_거래량(kg)  배추_가격(원/kg)  무_거래량(kg)  무_가격(원/kg)  \
0  2020-09-29  화요일   2064183.3       1787.0  2424383.2      1190.0   

   양파_거래량(kg)  양파_가격(원/kg)  건고추_거래량(kg)  건고추_가격(원/kg)  ...  청상추_거래량(kg)  \
0   1209647.0        966.0       2842.8       25873.0  ...      76702.8   

   청상추_가격(원/kg)  백다다기_거래량(kg)  백다다기_가격(원/kg)  애호박_거래량(kg)  애호박_가격(원/kg) 

 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                | 24/38 [00:00<00:00, 63.26it/s]

          date   요일  배추_거래량(kg)  배추_가격(원/kg)  무_거래량(kg)  무_가격(원/kg)  \
0   2020-09-29  화요일   2064183.3       1787.0  2424383.2      1190.0   
1   2020-09-30  수요일    259065.3       1551.0  1044460.0       984.0   
2   2020-10-01  목요일         0.0          0.0        0.0         0.0   
3   2020-10-02  금요일         0.0          0.0        0.0         0.0   
4   2020-10-03  토요일         0.0          0.0     3100.0       835.0   
5   2020-10-04  일요일         0.0          0.0        0.0         0.0   
6   2020-10-05  월요일   1965239.2       1483.0  2041917.7      1043.0   
7   2020-10-06  화요일   1784379.1       1359.0  2064575.9       996.0   
8   2020-10-07  수요일   1672867.3       1316.0  2125164.2       901.0   
9   2020-10-08  목요일   1588132.2       1159.0  1780584.9       879.0   
10  2020-10-09  금요일   1591811.6       1013.0  1797501.3       749.0   
11  2020-10-10  토요일   1054162.8        909.0  1320399.7       694.0   
12  2020-10-11  일요일         0.0          0.0        0.0         0.0   
13  20

 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 31/38 [00:00<00:00, 56.81it/s]

          date   요일  배추_거래량(kg)  배추_가격(원/kg)  무_거래량(kg)  무_가격(원/kg)  \
0   2020-09-29  화요일   2064183.3       1787.0  2424383.2      1190.0   
1   2020-09-30  수요일    259065.3       1551.0  1044460.0       984.0   
2   2020-10-01  목요일         0.0          0.0        0.0         0.0   
3   2020-10-02  금요일         0.0          0.0        0.0         0.0   
4   2020-10-03  토요일         0.0          0.0     3100.0       835.0   
5   2020-10-04  일요일         0.0          0.0        0.0         0.0   
6   2020-10-05  월요일   1965239.2       1483.0  2041917.7      1043.0   
7   2020-10-06  화요일   1784379.1       1359.0  2064575.9       996.0   
8   2020-10-07  수요일   1672867.3       1316.0  2125164.2       901.0   
9   2020-10-08  목요일   1588132.2       1159.0  1780584.9       879.0   
10  2020-10-09  금요일   1591811.6       1013.0  1797501.3       749.0   
11  2020-10-10  토요일   1054162.8        909.0  1320399.7       694.0   
12  2020-10-11  일요일         0.0          0.0        0.0         0.0   
13  20

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:00<00:00, 56.43it/s]

          date   요일  배추_거래량(kg)  배추_가격(원/kg)  무_거래량(kg)  무_가격(원/kg)  \
0   2020-09-29  화요일   2064183.3       1787.0  2424383.2      1190.0   
1   2020-09-30  수요일    259065.3       1551.0  1044460.0       984.0   
2   2020-10-01  목요일         0.0          0.0        0.0         0.0   
3   2020-10-02  금요일         0.0          0.0        0.0         0.0   
4   2020-10-03  토요일         0.0          0.0     3100.0       835.0   
5   2020-10-04  일요일         0.0          0.0        0.0         0.0   
6   2020-10-05  월요일   1965239.2       1483.0  2041917.7      1043.0   
7   2020-10-06  화요일   1784379.1       1359.0  2064575.9       996.0   
8   2020-10-07  수요일   1672867.3       1316.0  2125164.2       901.0   
9   2020-10-08  목요일   1588132.2       1159.0  1780584.9       879.0   
10  2020-10-09  금요일   1591811.6       1013.0  1797501.3       749.0   
11  2020-10-10  토요일   1054162.8        909.0  1320399.7       694.0   
12  2020-10-11  일요일         0.0          0.0        0.0         0.0   
13  20


