## 필터링 로직 추가

- 새로운 데이터의 상품코드가 기 학습된 인코더 안에 있는지 확인
    - 없을 경우 마더코드 검색
    - 동일 마더코드 안에서 동일 상품군 검색
    - 동일 상품군 안에서 단가 비교를 통한 대체 데이터 반환
- 일자와 날씨 등의 다른 데이터는 그대로 넣어준다

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
def drop_unnamed(data):
    """Remove every column whose name contains ``"Unnamed"`` from *data*.

    pandas typically produces such columns (``Unnamed: 0`` ...) when a CSV
    that was saved together with its index is read back.

    Args:
        data: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, without the matching columns.
    """
    # Only string labels can match; evaluating ``"Unnamed" in name`` on an
    # integer label would raise TypeError.
    unnamed = [
        name for name in data.columns
        if isinstance(name, str) and "Unnamed" in name
    ]
    data.drop(columns=unnamed, inplace=True)
    return data

In [4]:
# Read the v4 performance table, then strip the "Unnamed" columns from it.
raw_performance = pd.read_csv("prep/data/final_performance_v4.csv")
data = drop_unnamed(raw_performance)
data.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03
1,2019-01-01 06:00:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,4371000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03
2,2019-01-01 06:20:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,3262000.0,2019-01-01 00:00:00,1,...,0,1,0,0,0,0,0,0,0,104.03
3,2019-01-01 06:20:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,6955000.0,2019-01-01 00:00:00,1,...,0,1,0,0,0,0,0,0,0,104.03
4,2019-01-01 06:40:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,6672000.0,2019-01-01 00:00:00,1,...,0,0,1,0,0,0,0,0,0,104.03


In [5]:
data.columns

Index(['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액', 'date',
       'week', 'time', 'hour', 'prime', 'real_date', 'IsHoliday', '지속휴일수',
       'TEMP', 'HUM', 'weekofyear', 'stage', 'm_0', 'm_1', 'm_2', 'm_3', 'm_4',
       'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'p_0', 'p_1', 'p_2', 'p_3', 'p_4',
       'p_5', 'p_6', 'p_7', 'p_8', 'p_9', 'p_10', 'g_0', 'g_1', 'g_2', 'g_3',
       's_0', 's_1', 'w_0', 'w_1', 'w_2', 'w_3', 'w_4', 'w_5', 'cpi'],
      dtype='object')

#### Binary encoded columns

- m_* : 마더코드
- p_* : 상품코드
- g_* : 상품군 
- s_* : stage
- w_* : weekofyear

In [1]:
# Read the performance table that carries the extra statistic columns.
# NOTE: drop_unnamed must already be defined when this cell runs; executing
# it first in a fresh kernel fails with NameError.
raw_stat = pd.read_csv("prep/data/final_performance_with_stat.csv")
stat_data = drop_unnamed(raw_stat)
stat_data.head()

NameError: name 'drop_unnamed' is not defined

In [None]:
# Load the class arrays saved from the fitted encoders, in the order
# mother code, product code, stage, product group.
m_classes, p_classes, s_classes, g_classes = (
    np.load(path)
    for path in (
        'reference/마더코드_classes.npy',
        'reference/상품코드_classes.npy',
        'reference/stage_classes.npy',
        'reference/상품군_classes.npy',
    )
)

In [None]:
g_classes

In [None]:
np.where(g_classes == '의류')[0][0]

In [None]:
# Features: every column except the ones listed here (the target '취급액'
# is among them).
excluded_columns = ['방송일시', '상품명', '취급액', 'date', 'time', 'real_date']
X = data.drop(columns=excluded_columns)
# Target, kept as a one-column DataFrame.
Y = data.loc[:, ['취급액']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The raw (non-encoded) code
# columns are still part of X here because the filtering step needs them.
# A fixed random_state keeps the split, and every score derived from it,
# reproducible across kernel restarts; the seed is the one used for the
# stat-data split further down.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, Y, test_size=0.1, random_state=123457
)
for part in (x_train, x_valid, y_train, y_valid):
    print(part.shape)

In [None]:
x_train.values[0]

In [None]:
x_train.head()

In [None]:
np.where(np.array(x_valid.columns) == '상품코드')[0][0]

In [None]:
# Distinct product codes, mother codes and product groups present in x_train.
train_p, train_m, train_g = (
    x_train[column].unique() for column in ('상품코드', '마더코드', '상품군')
)

In [None]:
x_valid.head()

In [None]:
## Data filtering
# A validation row whose product code never occurs in x_train is replaced by
# the x_train row with the closest unit price ('판매단가'), searched first
# among rows with the same mother code and otherwise among rows with the same
# product group.

final = []
revised = 0

values = x_valid.values

# Resolve the column positions by name instead of hard-coding them, so the
# cell keeps working if the column order of X changes.
valid_columns = list(x_valid.columns)
p_idx = valid_columns.index('상품코드')
m_idx = valid_columns.index('마더코드')
g_idx = valid_columns.index('상품군')
price_idx = valid_columns.index('판매단가')

for row in values:
    # Product code is present in x_train: keep the row untouched.
    if row[p_idx] in train_p:
        final.append(row)
        continue

    # Choose the pool of candidate rows by comparing the column with the code
    # itself (not with the code's position inside the unique() array).
    if row[m_idx] in train_m:
        pool = x_train['마더코드'] == row[m_idx]
    elif row[g_idx] in train_g:
        pool = x_train['상품군'] == row[g_idx]
    else:
        # Nothing comparable in x_train: keep the row as it is so that
        # `final` keeps one entry per validation row and stays aligned
        # with y_valid.
        final.append(row)
        continue

    # Substitute the training row whose unit price is closest.
    price_gap = (x_train.loc[pool, '판매단가'] - row[price_idx]).abs()
    final.append(x_train.loc[price_gap.idxmin()].values)
    revised += 1
    print("revision_count : ", revised)

In [None]:
np.where(np.array(x_stat_valid.columns) == '판매단가')[0][0]

In [None]:
## Data filtering
def filtering(train, test, verbose=True):
    """Substitute test rows whose product code is unknown to *train*.

    Rows whose '상품코드' (product code) occurs in *train* are passed through
    unchanged. Every other row is replaced by the complete *train* row whose
    '판매단가' (unit price) is closest to the test row's price, searched
    first among training rows with the same '마더코드' (mother code) and, if
    that yields nothing, among rows with the same '상품군' (product group).
    A row without a match at either level is passed through unchanged, so
    the result always has exactly one row per test row and stays aligned
    with the test labels.

    NOTE(review): a substituted row carries every column of the training
    row, not only the product-related ones -- confirm that this is intended.

    Args:
        train: Training features. Must have the same columns, in the same
            order, as *test*.
        test: Features to filter.
        verbose: When True (default), print a running count of the
            substituted rows.

    Returns:
        DataFrame with the columns of *test*, ``len(test)`` rows and a fresh
        0..n-1 index.
    """
    # Positional labels make idxmin() identify exactly one training row even
    # if the caller's index contains duplicate labels.
    train = train.reset_index(drop=True)
    train_p = train['상품코드'].unique()

    # Column positions inside the raw value rows, looked up by name.
    columns = list(test.columns)
    p_idx = columns.index('상품코드')
    m_idx = columns.index('마더코드')
    g_idx = columns.index('상품군')
    price_idx = columns.index('판매단가')

    final = []
    revised = 0
    for row in test.values:
        substitute = None
        if row[p_idx] not in train_p:
            # Unknown product: same mother code first, then same product group.
            substitute = _closest_price_row(
                train, '마더코드', row[m_idx], row[price_idx])
            if substitute is None:
                substitute = _closest_price_row(
                    train, '상품군', row[g_idx], row[price_idx])

        if substitute is None:
            # Known product, or nothing comparable in train: keep the row.
            final.append(row)
            continue

        final.append(substitute)
        revised += 1
        if verbose:
            print("revision_count : ", revised)
    return pd.DataFrame(final, columns=test.columns)


def _closest_price_row(train, column, code, price):
    """Return the values of the *train* row with ``train[column] == code``
    whose unit price is nearest to *price*, or None if no such row exists."""
    # Compare against the code itself; rows with a missing price are ignored.
    pool = train[column] == code
    price_gap = (train.loc[pool, '판매단가'] - price).abs().dropna()
    if price_gap.empty:
        return None
    return train.loc[price_gap.idxmin()].values

In [None]:
filtering(x_train, x_valid)

In [None]:
# Inputs for the binary-encoded model: drop the raw code columns from the
# training frame and from the filtered validation rows.
raw_code_columns = ['마더코드', '상품코드', '상품군']
x_bin = x_train.drop(columns=raw_code_columns)
filtered_valid_frame = pd.DataFrame(final, columns=x_valid.columns)
x_bin_valid = filtered_valid_frame.drop(columns=raw_code_columns)

for part in (x_bin, x_bin_valid):
    print(part.shape)

In [None]:
# Convert the target to a NumPy array. np.asarray gives the same array as
# `.values` on a DataFrame but also accepts an ndarray, so re-running this
# cell no longer fails with AttributeError.
y_train = np.asarray(y_train)

In [None]:
# Convert the target to a NumPy array. np.asarray gives the same array as
# `.values` on a DataFrame but also accepts an ndarray, so re-running this
# cell no longer fails with AttributeError.
y_valid = np.asarray(y_valid)

In [None]:
print(y_train.shape)
print(y_valid.shape)

## RandomForest with binary encoded data(with filtering)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest (depth limit 20, fixed seed) fitted on the binary-encoded
# features.
rf_settings = dict(max_depth=20, random_state=0, verbose=1)
rf = RandomForestRegressor(**rf_settings)
rf.fit(x_bin, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred):
    """Return ``('MAPE', value)``: mean absolute percentage error in percent.

    Both inputs are flattened first, so a column vector of shape ``(n, 1)``
    and a flat vector of shape ``(n,)`` are compared element by element
    instead of being broadcast against each other into an ``(n, n)`` matrix.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like) with the same number of
            elements as *y_true*.

    Returns:
        Tuple ``('MAPE', mape)`` where *mape* is
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Note:
        A zero in *y_true* divides by zero and makes the result inf/nan.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    return ('MAPE', np.mean(np.abs((actual - predicted) / actual)) * 100)

In [None]:
# Training-set score. The forest was fitted on x_bin (raw code columns
# removed), so it has to be scored on that same feature set; x_train still
# contains the three raw code columns.
rf.score(x_bin, y_train)

In [None]:
y_pred = rf.predict(x_bin_valid)
y_pred

In [None]:
y_pred = y_pred.reshape(-1, 1)
mean_absolute_percentage_error(y_valid, y_pred)

## RandomForest with category type data

In [None]:
x_train.columns

In [None]:
# Columns for the category-typed model: the raw codes are used instead of
# their binary encodings. One shared list keeps both frames in sync.
cat_features = ['노출(분)', '마더코드', '상품코드', '상품군', '판매단가', 'week', 'hour',
                'prime', 'IsHoliday', '지속휴일수', 'TEMP', 'HUM', 'weekofyear', 'stage', 'cpi']

# .copy() makes these independent frames, so assigning converted columns to
# them later does not raise SettingWithCopyWarning against x_train.
x_cat = x_train[cat_features].copy()
x_cat_valid = pd.DataFrame(final, columns=x_valid.columns)[cat_features].copy()

In [None]:
# Columns that are cast to pandas' category dtype in both frames.
to_cat = ['상품코드', '상품군', 'week', 'hour', 'weekofyear', 'stage']

for frame in (x_cat, x_cat_valid):
    for var in to_cat:
        frame[var] = frame[var].astype('category')

In [None]:
# Random forest on the category-typed features: 100 trees, depth limit 20,
# fixed seed, n_jobs=-1.
rf_cat_settings = dict(
    max_depth=20,
    n_jobs=-1,
    n_estimators=100,
    random_state=0,
    verbose=1,
)
rf_cat = RandomForestRegressor(**rf_cat_settings)
rf_cat.fit(x_cat, y_train)

In [None]:
rf_cat.score(x_cat, y_train)

In [None]:
y_pred_cat = rf_cat.predict(x_cat_valid).reshape(-1, 1)
y_pred_cat

In [None]:
mean_absolute_percentage_error(y_valid, y_pred_cat)

## LightGBM

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
# train_ds = lgb.Dataset(x_cat, label = y_train)
# valid_ds = lgb.Dataset(x_cat_valid, label = y_valid)

In [None]:
# LightGBM settings for the category-typed model (DART boosting, MAPE metric).
# NOTE(review): LightGBM does not apply early stopping in dart mode -- confirm
# that the early_stopping_rounds passed at training time is still wanted.
params = dict(
    learning_rate=0.1,
    max_depth=-1,
    boosting='dart',
    objective='regression',
    metric='mape',
    is_training_metric=True,
    num_leaves=144,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=5,
    seed=123457,
)

In [None]:
# Training
# lgb.train needs Dataset objects; build them here from the category-typed
# frames so the cell does not depend on names that are never assigned.
# The labels are flattened because y_train / y_valid are column vectors.
train_ds = lgb.Dataset(x_cat, label=np.ravel(y_train))
valid_ds = lgb.Dataset(x_cat_valid, label=np.ravel(y_valid), reference=train_ds)

# NOTE(review): verbose_eval / early_stopping_rounds were removed from
# lgb.train in LightGBM 4.0 in favour of callbacks -- confirm the installed
# version before upgrading.
model = lgb.train(params, train_ds, 4000, valid_ds, verbose_eval=100, early_stopping_rounds=100)

In [None]:
predict_train = model.predict(x_cat).reshape(-1, 1)
predict_test = model.predict(x_cat_valid).reshape(-1, 1)

In [None]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are flattened first, so a flat ``(n,)`` target (for example a
    pandas Series) and an ``(n, 1)`` prediction column are compared element
    by element instead of being broadcast into an ``(n, n)`` matrix.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like) with the same number of
            elements as *y_true*.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.

    Note:
        A zero in *y_true* divides by zero and makes the result inf/nan.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
train_mape = MAPE(y_train, predict_train)
test_mape = MAPE(y_valid, predict_test)

print('train data MAPE: ', train_mape)
print('test data MAPE: ', test_mape)

## LightGBM with stat data

In [None]:
# Features: every column except '취급액' and 'profit/m'.
X_stat = stat_data.drop(columns=['취급액', 'profit/m'])
# The model is trained on the natural log of '취급액'.
Y_stat = np.log(stat_data['취급액'])
# NOTE(review): np.log gives NaN for negative and -inf for zero input --
# confirm that TEMP, HUM and 취급액 are strictly positive.
for col in ('TEMP', 'HUM'):
    X_stat[col] = np.log(X_stat[col])

# Seeded 90/10 split.
x_stat_train, x_stat_valid, y_stat_train, y_stat_valid = train_test_split(
    X_stat,
    Y_stat,
    test_size=0.1,
    random_state=123457,
)
for part in (x_stat_train, y_stat_train, x_stat_valid, y_stat_valid):
    print(part.shape)

In [None]:
# filter validation
filtered_valid = filtering(x_stat_train, x_stat_valid)

In [None]:
x_stat_train = x_stat_train.drop(['마더코드', '상품코드'], axis = 1)
filtered_valid = filtered_valid.drop(['마더코드', '상품코드'], axis = 1)

In [None]:
filtered_valid.head()

In [None]:
to_cat = ['상품군', 'week', 'hour', 'weekofyear', 'stage']

for var in to_cat:
    x_stat_train[var] = x_stat_train[var].astype('category')
    filtered_valid[var] = filtered_valid[var].astype('category')   

In [None]:
# LightGBM datasets for the stat features; the validation set uses the
# filtered validation rows.
stat_train_ds = lgb.Dataset(x_stat_train, label=y_stat_train)
stat_valid_ds = lgb.Dataset(filtered_valid, label=y_stat_valid)

# Settings for the stat model: plain GBDT boosting with learning rate 0.02.
params = dict(
    learning_rate=0.02,
    max_depth=-1,
    boosting='gbdt',
    objective='regression',
    metric='mape',
    is_training_metric=True,
    num_leaves=144,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=5,
    seed=123457,
)

In [None]:
# Training: up to 4000 rounds, evaluated on the filtered validation set,
# logging every 100 rounds, early stopping after 100 rounds without gain.
model_1 = lgb.train(
    params,
    stat_train_ds,
    num_boost_round=4000,
    valid_sets=stat_valid_ds,
    verbose_eval=100,
    early_stopping_rounds=100,
)

In [None]:
# Keep the predictions in the shape predict() returns. The targets are flat
# (n,) Series, and reshaping the predictions to (n, 1) would make the
# subtraction inside MAPE broadcast to an (n, n) matrix and report a
# meaningless value.
predict_train = model_1.predict(x_stat_train)
predict_test = model_1.predict(filtered_valid)

train_mape = MAPE(y_stat_train, predict_train)
test_mape = MAPE(y_stat_valid, predict_test)

# NOTE(review): the targets are log-transformed, so these are MAPEs on the
# log scale; apply np.exp to both sides to report them on the original scale.
print('train data MAPE: ', train_mape)
print('test data MAPE: ', test_mape)