## 필터링 로직 추가

- 새로운 데이터의 상품코드가 기 학습된 인코더 안에 있는지 확인
    - 없을 경우 마더코드 검색
    - 동일 마더코드 안에서 동일 상품군 검색
    - 동일 상품군 안에서 단가 비교를 통한 대체 데이터 반환
- 일자와 날씨 등의 다른 데이터는 그대로 넣어준다

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def drop_unnamed(data):
    names = data.columns
    for name in names:
        if "Unnamed" in name:
            data.pop(name)
    return data

#### Binary encoded columns

- m_* : 마더코드
- p_* : 상품코드
- g_* : 상품군 
- s_* : stage
- w_* : weekofyear

In [3]:
stat_data = drop_unnamed(pd.read_csv("prep/data/final_performance_with_stat.csv"))
stat_data.head()

Unnamed: 0,마더코드,상품코드,상품군,week,hour,weekofyear,stage,prime,IsHoliday,노출(분),판매단가,취급액,지속휴일수,TEMP,HUM,cpi,profit/m,avgp,minp,maxp
0,296,891,6,1,6,0,0,0,1.0,20.0,39900,2099000.0,1.0,-6.576974,63.524958,104.03,104950.0,298430.0,104950.0,524050.0
1,296,891,6,1,6,0,2,0,1.0,20.0,39900,3262000.0,1.0,-6.576974,63.524958,104.03,163100.0,298430.0,104950.0,524050.0
2,296,891,6,1,6,0,1,0,1.0,20.0,39900,6672000.0,1.0,-6.576974,63.524958,104.03,333600.0,298430.0,104950.0,524050.0
3,296,891,6,2,0,0,0,0,0.0,20.0,39900,7329000.0,0.0,-5.089473,54.841105,104.03,366450.0,298430.0,104950.0,524050.0
4,296,891,6,2,0,0,1,0,0.0,20.0,39900,10481000.0,0.0,-5.089473,54.841105,104.03,524050.0,298430.0,104950.0,524050.0


In [4]:
stat_data.isna().sum()

마더코드          0
상품코드          0
상품군           0
week          0
hour          0
weekofyear    0
stage         0
prime         0
IsHoliday     0
노출(분)         0
판매단가          0
취급액           0
지속휴일수         0
TEMP          0
HUM           0
cpi           0
profit/m      0
avgp          0
minp          0
maxp          0
dtype: int64

In [5]:
# import fitted Encoders
m_classes = np.load('reference/마더코드_classes.npy')
p_classes = np.load('reference/상품코드_classes.npy')
s_classes = np.load('reference/stage_classes.npy')
g_classes = np.load('reference/상품군_classes.npy')

In [6]:
g_classes

array(['가구', '가전', '건강기능', '농수축', '생활용품', '속옷', '의류', '이미용', '잡화', '주방',
       '침구'], dtype=object)

In [7]:
# categorical variables to binary encoding
# bin(max(_data['상품명'])).replace("0b", '')

def binary_encoding(name, x):
    # x : column
    if name == '마더코드':
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(10-len(value))+value))
    if name == '상품코드':
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(11-len(value))+value))
    if name == 'week':
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(3-len(value))+value))
    if name == 'hour':
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(5-len(value))+value))
    if name == 'weekofyear':
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(6-len(value))+value))
    else: # 상품군
        value = bin(x).replace("0b", "")
        return list(map(int, '0'*(4-len(value))+value))
    

In [8]:
# 마더코드
final = stat_data.copy()
final[['m_'+str(i) for i in range(10)]] = final['마더코드'].apply(lambda x: binary_encoding('마더코드', x)).to_list()

# 상품코드
final[['p_'+str(i) for i in range(11)]] = final['상품코드'].apply(lambda x: binary_encoding('상품코드', x)).to_list()

# 상품군
final[['g_'+str(i) for i in range(4)]] = final['상품군'].apply(lambda x: binary_encoding('상품군', x)).to_list()

# week
final[['wd_'+str(i) for i in range(3)]] = final['week'].apply(lambda x: binary_encoding('week', x)).to_list()

# weekofyear
final[['wy_'+str(i) for i in range(6)]] = final['weekofyear'].apply(lambda x: binary_encoding('weekofyear', x)).to_list()

# hour
final[['h_'+str(i) for i in range(5)]] = final['hour'].apply(lambda x: binary_encoding('hour', x)).to_list()

print(final.columns)
final.head(2)

Index(['마더코드', '상품코드', '상품군', 'week', 'hour', 'weekofyear', 'stage', 'prime',
       'IsHoliday', '노출(분)', '판매단가', '취급액', '지속휴일수', 'TEMP', 'HUM', 'cpi',
       'profit/m', 'avgp', 'minp', 'maxp', 'm_0', 'm_1', 'm_2', 'm_3', 'm_4',
       'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'p_0', 'p_1', 'p_2', 'p_3', 'p_4',
       'p_5', 'p_6', 'p_7', 'p_8', 'p_9', 'p_10', 'g_0', 'g_1', 'g_2', 'g_3',
       'wd_0', 'wd_1', 'wd_2', 'wy_0', 'wy_1', 'wy_2', 'wy_3', 'wy_4', 'wy_5',
       'h_0', 'h_1', 'h_2', 'h_3', 'h_4'],
      dtype='object')


Unnamed: 0,마더코드,상품코드,상품군,week,hour,weekofyear,stage,prime,IsHoliday,노출(분),...,wy_1,wy_2,wy_3,wy_4,wy_5,h_0,h_1,h_2,h_3,h_4
0,296,891,6,1,6,0,0,0,1.0,20.0,...,0,0,0,0,0,0,0,1,1,0
1,296,891,6,1,6,0,2,0,1.0,20.0,...,0,0,0,0,0,0,0,1,1,0


In [9]:
from sklearn.model_selection import train_test_split

In [95]:
X_stat = final.drop(['취급액', 'profit/m'], axis = 1)
Y_stat = final['취급액']

x_stat_train, x_stat_valid, y_stat_train, y_stat_valid = train_test_split(X_stat, Y_stat,
                                                                         test_size = 0.1,
                                                                         random_state = 123457)
print(x_stat_train.shape)
print(y_stat_train.shape)
print(x_stat_valid.shape)
print(y_stat_valid.shape)

(31841, 57)
(31841,)
(3538, 57)
(3538,)


In [96]:
## Data filtering
def filtering(train, test):
    final = []
    revised = 0
    
    # 바꾸지 않을 컬럼 명
    rem_cols = ['week', 'hour', 'weekofyear', 'TEMP', 'HUM', 'cpi']
    idxs = []
    test = test.reset_index(drop=True)
    
    # 학습데이터 카테고리 필터링
    train_p = train['상품코드'].unique()
    train_m = train['마더코드'].unique()
    train_g = train['상품군'].unique()
    
    for i in tqdm(range(len(test))):
        # 상품코드가 x_train의 인코더 속에 존재하는가.(데이터 존재 여부 확인)
        if test['상품코드'].iloc[i] in train_p:
            final.append(test.iloc[i].values)
        else:
            # 마더코드가 x_train안에 있는가
            if test['마더코드'].iloc[i] in train_m:
                temp = train.loc[train['마더코드'].values == np.where(train_m == test['마더코드'].iloc[i])[0]]
                # 판매단가 차가 최소인 row 추가
                temp['sub'] = abs(temp['판매단가'] - test['판매단가'].iloc[i])
                final.append(train.loc[temp['sub'].idxmin()].values)
                idxs.append(i)
                revised+=1
                print("revision_count : ", revised)
            else:
                # 상품군 확인
                if test['상품군'].iloc[i] in train_g:
                    temp = train.loc[train['상품군'].values == np.where(train_g == test['상품군'].iloc[i])[0]]
                    # 판매단가 차가 최소인 row 추가
                    temp['sub'] = abs(temp['판매단가'] - test['판매단가'].iloc[i])
                    final.append(train.loc[temp['sub'].idxmin()].values)
                    idxs.append(i)
                    revised+=1
                    print("revision_count : ", revised)
    out = pd.DataFrame(final, columns = test.columns, index = test.index)
    print(out.index)
    print(idxs)
    out[rem_cols].loc[idxs] = out[rem_cols].update(test[rem_cols].loc[idxs])
    return out

In [97]:
filtered = filtering(x_stat_train, x_stat_valid)

HBox(children=(FloatProgress(value=0.0, max=3538.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


revision_count :  1
revision_count :  2
revision_count :  3
revision_count :  4
revision_count :  5
revision_count :  6

RangeIndex(start=0, stop=3538, step=1)
[1121, 1125, 2248, 2419, 2747, 3180]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [85]:
filtered.head()

Unnamed: 0,마더코드,상품코드,상품군,week,hour,weekofyear,stage,prime,IsHoliday,노출(분),...,wy_1,wy_2,wy_3,wy_4,wy_5,h_0,h_1,h_2,h_3,h_4
0,250.0,786.0,8.0,5.0,23.0,22.0,1.0,0.0,1.0,20.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
1,137.0,464.0,3.0,3.0,17.0,28.0,1.0,0.0,0.0,20.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,293.0,882.0,6.0,3.0,1.0,18.0,1.0,0.0,0.0,13.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,233.0,744.0,5.0,1.0,6.0,24.0,0.0,0.0,0.0,20.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,30.0,73.0,8.0,5.0,8.0,9.0,2.0,1.0,1.0,20.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [98]:
x_stat_train = x_stat_train.drop(['마더코드', '상품코드', '상품군', 'week', 'hour',
                                  'stage', 'prime', 'IsHoliday'], axis = 1)
filtered = filtered.drop(['마더코드', '상품코드', '상품군', 'week', 'hour',
                          'stage', 'prime', 'IsHoliday'], axis = 1)

In [87]:
x_stat_train = x_stat_train.values.reshape(-1, x_stat_train.shape[1], 1)
filtered = filtered.values.reshape(-1, filtered.shape[1], 1)

In [88]:
print(y_stat_train.shape)
print(y_stat_valid.shape)

(31841,)
(3538,)


## Deep Regression

In [89]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPool1D, Flatten
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
tf.__version__

'2.0.0-alpha0'

In [90]:
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', 
                                            patience=3, 
                                            verbose=1,
                                            factor=0.1, 
                                            min_lr=0.000001)

In [91]:
tf.keras.backend.clear_session()
dnn = Sequential()
dnn.add(Dense(200, input_shape = (x_stat_train.shape[1], ), kernel_initializer = 'normal',
              kernel_regularizer = l2(0.001), activation = 'relu'))
dnn.add(Dense(49, kernel_initializer = 'normal', 
              kernel_regularizer = l2(0.001), activation = 'relu'))
dnn.add(Dense(20, kernel_initializer = 'normal', 
              kernel_regularizer = l2(0.001), activation = 'relu'))
dnn.add(Dense(1, activation = 'linear'))

dnn.compile(optimizer = RMSprop(lr=0.1),
           loss = 'mean_absolute_percentage_error')
dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               10000     
_________________________________________________________________
dense_1 (Dense)              (None, 49)                9849      
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1000      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 20,870
Trainable params: 20,870
Non-trainable params: 0
_________________________________________________________________


In [92]:
filtered.shape

(3538, 49, 1)

In [None]:
tf.keras.backend.clear_session()
cnn = Sequential()
cnn.add(Conv1D(filters = 128, input_shape = (x_stat_train.shape[1], 1),
               kernel_size = 2, activation = 'relu',
               kernel_initializer = 'normal', kernel_regularizer = l2(0.001)))
cnn.add(MaxPool1D(pool_size = 1))
cnn.add(Dense(49, activation = 'relu'))
cnn.add(Flatten())
cnn.add(Dense(1))

cnn.compile(optimizer = 'adam',
           loss = 'mean_absolute_percentage_error')
cnn.summary()

In [99]:
hist = dnn.fit(x_stat_train, y_stat_train,
               validation_data = (filtered, y_stat_valid),
              epochs = 200, verbose = 1,
              callbacks = [learning_rate_reduction])

Train on 31841 samples, validate on 3538 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.010000000149011612.
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 00022: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 23/200
Epoch 24/200

KeyboardInterrupt: 

## LightGBM

In [100]:
import lightgbm as lgb

In [101]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## stat data(categorical values)

In [102]:
X_stat = stat_data.drop(['취급액', 'profit/m'], axis = 1)
Y_stat = stat_data['취급액']

x_stat_train, x_stat_valid, y_stat_train, y_stat_valid = train_test_split(X_stat, Y_stat,
                                                                         test_size = 0.1,
                                                                         random_state = 123457)
print(x_stat_train.shape)
print(y_stat_train.shape)
print(x_stat_valid.shape)
print(y_stat_valid.shape)

(31841, 18)
(31841,)
(3538, 18)
(3538,)


In [103]:
# fiter validation
filtered_valid = filtering(x_stat_train, x_stat_valid)

HBox(children=(FloatProgress(value=0.0, max=3538.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


revision_count :  1
revision_count :  2
revision_count :  3
revision_count :  4
revision_count :  5
revision_count :  6

RangeIndex(start=0, stop=3538, step=1)
[1121, 1125, 2248, 2419, 2747, 3180]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [104]:
x_stat_train = x_stat_train.drop(['마더코드', '상품코드'], axis = 1)
filtered_valid = filtered_valid.drop(['마더코드', '상품코드'], axis = 1)

In [105]:
filtered_valid.head()

Unnamed: 0,상품군,week,hour,weekofyear,stage,prime,IsHoliday,노출(분),판매단가,지속휴일수,TEMP,HUM,cpi,avgp,minp,maxp
0,8.0,5.0,23.0,22.0,1.0,0.0,1.0,20.0,139000.0,2.0,3.204077,70.246698,104.03,463636.2,64800.0,1413800.0
1,3.0,3.0,17.0,28.0,1.0,0.0,0.0,20.0,60900.0,0.0,24.053788,75.523431,104.73,2676412.0,832800.0,5878000.0
2,6.0,3.0,1.0,18.0,1.0,0.0,0.0,13.0,39900.0,0.0,22.176936,84.396548,105.19,879560.3,704000.0,1214231.0
3,5.0,1.0,6.0,24.0,0.0,0.0,0.0,20.0,49900.0,0.0,25.293952,86.402449,104.4,2031819.0,521700.0,4287600.0
4,8.0,5.0,8.0,9.0,2.0,1.0,1.0,20.0,790000.0,3.0,14.156622,62.723329,105.29,817854.2,86350.0,2007400.0


In [106]:
to_cat = ['상품군', 'week', 'hour', 'weekofyear', 'stage', 'prime', 'IsHoliday']

for var in to_cat:
    x_stat_train[var] = x_stat_train[var].astype('category')
    filtered_valid[var] = filtered_valid[var].astype('category')   

In [109]:
stat_train_ds = lgb.Dataset(x_stat_train, label = y_stat_train)
stat_valid_ds = lgb.Dataset(filtered_valid, label = y_stat_valid)

params = {'learning_rate': 0.05, 
          'max_depth': -1, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mape', 
          'is_training_metric': True, 
          'num_leaves': 144, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':123457}

In [110]:
# 훈련
model_1 = lgb.train(params, stat_train_ds, 4000, stat_valid_ds,
                    verbose_eval=100, early_stopping_rounds = 200)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 31841, number of used features: 16
[LightGBM] [Info] Start training from score 23054854.151314
Training until validation scores don't improve for 200 rounds
[100]	valid_0's mape: 0.36804
[200]	valid_0's mape: 0.336559
[300]	valid_0's mape: 0.330492
[400]	valid_0's mape: 0.328147
[500]	valid_0's mape: 0.326835
[600]	valid_0's mape: 0.326875
[700]	valid_0's mape: 0.327863
Early stopping, best iteration is:
[582]	valid_0's mape: 0.326157


In [None]:
predict_train = model_1.predict(x_stat_train).reshape(-1, 1)
predict_test = model_1.predict(filtered_valid).reshape(-1, 1)

train_mape = MAPE(y_stat_train, predict_train)
test_mape = MAPE(y_stat_valid, predict_test)

print('train data MAPE: ', train_mape)
print('test data MAPE: ', test_mape)