## 필터링 로직 추가

- 새로운 데이터의 상품코드가 기 학습된 인코더 안에 있는지 확인
    - 없을 경우 마더코드 검색
    - 동일 마더코드 안에서 동일 상품군 검색
    - 동일 상품군 안에서 단가 비교를 통한 대체 데이터 반환
- 일자와 날씨 등의 다른 데이터는 그대로 넣어준다

In [1]:
import numpy as np
import pandas as pd
import os

In [3]:
def drop_unnamed(data):
    names = data.columns
    for name in names:
        if "Unnamed" in name:
            data.pop(name)
    return data

In [4]:
data = drop_unnamed(pd.read_csv("prep/data/final_performance_v4.csv"))
data.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03
1,2019-01-01 06:00:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,4371000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03
2,2019-01-01 06:20:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,3262000.0,2019-01-01 00:00:00,1,...,0,1,0,0,0,0,0,0,0,104.03
3,2019-01-01 06:20:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,6955000.0,2019-01-01 00:00:00,1,...,0,1,0,0,0,0,0,0,0,104.03
4,2019-01-01 06:40:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,6672000.0,2019-01-01 00:00:00,1,...,0,0,1,0,0,0,0,0,0,104.03


In [5]:
data.columns

Index(['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액', 'date',
       'week', 'time', 'hour', 'prime', 'real_date', 'IsHoliday', '지속휴일수',
       'TEMP', 'HUM', 'weekofyear', 'stage', 'm_0', 'm_1', 'm_2', 'm_3', 'm_4',
       'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'p_0', 'p_1', 'p_2', 'p_3', 'p_4',
       'p_5', 'p_6', 'p_7', 'p_8', 'p_9', 'p_10', 'g_0', 'g_1', 'g_2', 'g_3',
       's_0', 's_1', 'w_0', 'w_1', 'w_2', 'w_3', 'w_4', 'w_5', 'cpi'],
      dtype='object')

#### Binary encoded columns

- m_* : 마더코드
- p_* : 상품코드
- g_* : 상품군 
- s_* : stage
- w_* : weekofyear

In [6]:
# import fitted Encoders
m_classes = np.load('reference/마더코드_classes.npy')
p_classes = np.load('reference/상품코드_classes.npy')
s_classes = np.load('reference/stage_classes.npy')
g_classes = np.load('reference/상품군_classes.npy')

In [7]:
p_classes

array(['200000', '200001', '200002', ..., '202510', '202512', '202513'],
      dtype=object)

In [8]:
np.where(g_classes == '의류')[0][0]

6

In [71]:
X = data.drop(['방송일시', '상품명', '취급액', 'date', 'time', 'real_date'], axis = 1)
Y = data[['취급액']]

In [74]:
from sklearn.model_selection import train_test_split

# 기존 Decimal values를 포함한 데이터(for filtering작업)
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.1)
print(x_train.shape)
print(x_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(31841, 48)
(3538, 48)
(31841, 1)
(3538, 1)


In [75]:
x_train.values[0]

array([2.00000000e+01, 9.60000000e+01, 2.42000000e+02, 2.00000000e+00,
       7.70000000e+04, 3.00000000e+00, 1.60000000e+01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.26487264e+01, 2.59152893e+01,
       9.00000000e+00, 2.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.05290000e+02])

In [76]:
x_train.head()

Unnamed: 0,노출(분),마더코드,상품코드,상품군,판매단가,week,hour,prime,IsHoliday,지속휴일수,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
11801,20.0,96,242,2,77000,3,16,0,0.0,0.0,...,0,1,0,0,0,1,0,0,1,105.29
31959,20.0,677,1990,9,218000,0,13,0,0.0,0.0,...,1,1,0,1,0,1,0,1,0,105.37
29919,20.0,122,375,1,1499000,5,21,1,1.0,2.0,...,1,1,0,1,0,0,1,1,0,105.37
1367,20.0,70,177,9,60900,2,8,0,0.0,0.0,...,1,0,0,0,1,0,1,1,0,104.03
14574,20.0,149,494,8,48900,1,8,0,0.0,0.0,...,0,0,0,0,0,1,1,1,0,105.29


In [77]:
train_p = x_train['상품코드'].unique()
train_m = x_train['마더코드'].unique()
train_g = x_train['상품군'].unique()

In [78]:
x_valid.head()

Unnamed: 0,노출(분),마더코드,상품코드,상품군,판매단가,week,hour,prime,IsHoliday,지속휴일수,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
16885,30.0,260,818,5,89000,4,9,0,0.0,0.0,...,1,0,0,0,1,0,0,0,1,105.19
10641,20.0,212,695,6,69900,5,23,0,1.0,2.0,...,0,1,0,0,0,0,1,1,1,104.81
27420,20.0,122,375,1,1499000,6,22,1,1.0,2.0,...,1,0,1,1,0,0,0,1,0,105.96
273,20.0,618,1799,9,238000,4,9,0,0.0,0.0,...,1,0,1,0,0,0,0,0,0,104.03
795,30.0,260,805,5,109000,3,9,0,0.0,0.0,...,1,0,1,0,0,1,0,1,1,104.03


In [79]:
x_train.loc[x_train['마더코드'].values == np.where(train_m == values[i][1] for i in range(len(values)))[0]]

Unnamed: 0,노출(분),마더코드,상품코드,상품군,판매단가,week,hour,prime,IsHoliday,지속휴일수,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
23315,20.0,0,0,7,79900,6,0,0,0.0,0.0,...,1,1,0,0,1,1,0,1,1,104.73
23654,15.0,0,0,7,79900,3,1,0,0.0,0.0,...,1,0,1,0,1,1,1,0,0,104.73
23314,20.0,0,0,7,79900,6,0,0,0.0,0.0,...,1,0,0,0,1,1,0,1,1,104.73
23652,20.0,0,0,7,79900,3,1,0,0.0,0.0,...,1,0,0,0,1,1,1,0,0,104.73
23316,20.0,0,0,7,79900,6,1,0,0.0,0.0,...,1,0,1,0,1,1,0,1,1,104.73
23653,20.0,0,0,7,79900,3,1,0,0.0,0.0,...,1,1,0,0,1,1,1,0,0,104.73


In [80]:
## Data filtering

final = []
revised = 0

values = x_valid.values 
for i in range(len(values)):
    # 상품코드가 x_train의 인코더 속에 존재하는가.(데이터 존재 여부 확인)
    if values[i][2] in train_p:
        final.append(values[i])
    else:
        # 마더코드가 x_train안에 있는가
        if values[i][1] in train_m:
            temp = x_train.loc[x_train['마더코드'].values == np.where(train_m == values[i][1])[0]]
            # 판매단가 차가 최소인 row 추가
            temp['sub'] = abs(temp['판매단가'] - values[i][4])
            final.append(x_train.iloc[temp['sub'].idxmin()].values)
            revised+=1
            print("revision_count : ", revised)
        else:
            # 상품군 확인
            if values[i][3] in train_g:
                temp = x_train.loc[x_train['상품군'].values == np.where(train_g == values[i][3])[0]]
                # 판매단가 차가 최소인 row 추가
                temp['sub'] = abs(temp['판매단가'] - values[i][4])
                final.append(x_train.iloc[temp['sub'].idxmin()].values)
                revised+=1
                print("revision_count : ", revised)

revision_count :  1
revision_count :  2
revision_count :  3
revision_count :  4
revision_count :  5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [81]:
pd.DataFrame(final, columns = x_valid.columns)

Unnamed: 0,노출(분),마더코드,상품코드,상품군,판매단가,week,hour,prime,IsHoliday,지속휴일수,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
0,30.0,260.0,818.0,5.0,89000.0,4.0,9.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,105.19
1,20.0,212.0,695.0,6.0,69900.0,5.0,23.0,0.0,1.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,104.81
2,20.0,122.0,375.0,1.0,1499000.0,6.0,22.0,1.0,1.0,2.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,105.96
3,20.0,618.0,1799.0,9.0,238000.0,4.0,9.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,104.03
4,30.0,260.0,805.0,5.0,109000.0,3.0,9.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,104.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3533,20.0,155.0,528.0,5.0,49900.0,6.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,105.57
3534,20.0,100.0,249.0,6.0,99000.0,6.0,8.0,1.0,1.0,2.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,105.51
3535,20.0,272.0,836.0,3.0,59900.0,4.0,17.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,104.73
3536,20.0,285.0,865.0,6.0,59000.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,104.81


In [82]:
x_bin = x_train.drop(['마더코드', '상품코드', '상품군'], axis = 1)
x_bin_valid = pd.DataFrame(final, columns = x_valid.columns).drop(['마더코드', '상품코드', '상품군'], axis = 1)

print(x_bin.shape)
print(x_bin_valid.shape)

(31841, 45)
(3538, 45)


In [62]:
y_train = y_train.values

In [64]:
y_valid = y_valid.values

In [65]:
print(y_train.shape)
print(y_valid.shape)

(31841, 1)
(3538, 1)


## RandomForest with binary encoded data(with filtering)

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [66]:
rf = RandomForestRegressor(max_depth = 20,
                          random_state = 0,
                          verbose = 1)
rf.fit(x_bin, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(max_depth=10, random_state=0)

In [67]:
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 
#     y_true, y_pred = check_array(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return ('MAPE', np.mean(np.abs((y_true - y_pred) / y_true))*100)

In [68]:
rf.score(x_train, y_train)

0.7322769423540453

In [69]:
y_pred = rf.predict(x_bin_valid)
y_pred

array([31312948.00358587, 23489780.66388956,  6198938.29659366, ...,
       10127351.05078098,  8448804.56036899, 41194940.54663106])

In [70]:
y_pred = y_pred.reshape(-1, 1)
mean_absolute_percentage_error(y_valid, y_pred)

('MAPE', 84.53115044351969)

## RandomForest with category type data

In [83]:
x_train.columns

Index(['노출(분)', '마더코드', '상품코드', '상품군', '판매단가', 'week', 'hour', 'prime',
       'IsHoliday', '지속휴일수', 'TEMP', 'HUM', 'weekofyear', 'stage', 'm_0',
       'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'p_0',
       'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6', 'p_7', 'p_8', 'p_9', 'p_10',
       'g_0', 'g_1', 'g_2', 'g_3', 's_0', 's_1', 'w_0', 'w_1', 'w_2', 'w_3',
       'w_4', 'w_5', 'cpi'],
      dtype='object')

In [84]:
x_cat = x_train[['노출(분)', '마더코드', '상품코드', '상품군', '판매단가', 'week', 'hour',
                'prime', 'IsHoliday', '지속휴일수', 'TEMP', 'HUM', 'weekofyear', 'stage', 'cpi']]
x_cat_valid = pd.DataFrame(final, columns = x_valid.columns)[['노출(분)', '마더코드', '상품코드', '상품군', '판매단가', 
                                                            'week', 'hour', 'prime', 'IsHoliday', '지속휴일수', 'TEMP',
                                                            'HUM', 'weekofyear', 'stage', 'cpi']]

In [85]:
to_cat = ['상품코드', '상품군', 'week', 'hour', 'weekofyear', 'stage']

for var in to_cat:
    x_cat[var] = x_cat[var].astype('category')
    x_cat_valid[var] = x_cat_valid[var].astype('category')    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [86]:
rf_cat = RandomForestRegressor(max_depth = 20,
                               n_jobs = -1,
                               n_estimators = 100,
                               random_state = 0,
                               verbose = 1)
rf_cat.fit(x_cat, y_train)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


RandomForestRegressor(max_depth=20, n_jobs=-1, random_state=0, verbose=1)

In [88]:
rf_cat.score(x_cat, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished


0.952252199710437

In [91]:
y_pred_cat = rf_cat.predict(x_cat_valid).reshape(-1, 1)
y_pred_cat

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


array([[29277770.        ],
       [21736963.31993829],
       [52547575.81144831],
       ...,
       [42932511.36284532],
       [13126605.42780812],
       [33402223.17698753]])

In [92]:
mean_absolute_percentage_error(y_valid, y_pred_cat)

('MAPE',
 취급액    60.759769
 dtype: float64)

## LightGBM

In [93]:
import lightgbm as lgb

train_ds = lgb.Dataset(x_cat, label = y_train)
valid_ds = lgb.Dataset(x_cat_valid, label = y_valid)

In [94]:
params = {'learning_rate': 0.1, 
          'max_depth': -1, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mape', 
          'is_training_metric': True, 
          'num_leaves': 144, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':123457}

In [95]:
# 훈련
model = lgb.train(params, train_ds, 100000, valid_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2886
[LightGBM] [Info] Number of data points in the train set: 31841, number of used features: 15
[LightGBM] [Info] Start training from score 23149447.819855
Training until validation scores don't improve for 100 rounds




[100]	valid_0's mape: 0.499967
[200]	valid_0's mape: 0.476573
[300]	valid_0's mape: 0.466167
[400]	valid_0's mape: 0.460855
[500]	valid_0's mape: 0.460817
Early stopping, best iteration is:
[435]	valid_0's mape: 0.45912


In [96]:
predict_train = model.predict(x_cat).reshape(-1, 1)
predict_test = model.predict(x_cat_valid).reshape(-1, 1)

In [97]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [98]:
train_mape = MAPE(y_train, predict_train)
test_mape = MAPE(y_valid, predict_test)

print('train data MAPE: ', train_mape)
print('test data MAPE: ', test_mape)

train data MAPE:  24.407314032639395
test data MAPE:  45.91196680066441
