## 딥러닝과 통계모델을 이용한 T-커머스 매출 예측

### Process
1. Sales record(실적 데이터)
2. Weighted average of hourly product sales
3. Sparsity control by Gaussian smoothing along time
4. Sparsity control by SVD
5. Decay by last sales day


In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def drop_unnamed(data):
    """Remove every column whose label contains "Unnamed", in place.

    Args:
        data: pandas DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame object, without the "Unnamed" columns.
    """
    # Only string labels can contain the marker; the isinstance guard keeps
    # non-string labels (e.g. integers) from raising TypeError on the `in` test.
    unnamed = [
        name for name in data.columns
        if isinstance(name, str) and "Unnamed" in name
    ]
    if unnamed:
        data.drop(columns=unnamed, inplace=True)
    return data

### Load Performance data

Dtypes check

In [3]:
# Load the broadcast performance table, dropping any "Unnamed" columns on the way in.
pf = drop_unnamed(
    pd.read_csv("/home/yeeunlee/bigcon2020_parrot/prep/data/final_performance_v2.csv")
)
pf.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,time,hour,prime,real_date,설명,IsHoliday,지속휴일수,TEMP,HUM,weekofyear
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958,1
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958,1


In [4]:
pf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35379 entries, 0 to 35378
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   방송일시        35379 non-null  object 
 1   노출(분)       35379 non-null  float64
 2   마더코드        35379 non-null  int64  
 3   상품코드        35379 non-null  int64  
 4   상품명         35379 non-null  object 
 5   상품군         35379 non-null  object 
 6   판매단가        35379 non-null  int64  
 7   취급액         35379 non-null  float64
 8   date        35379 non-null  object 
 9   week        35379 non-null  int64  
 10  time        35379 non-null  object 
 11  hour        35379 non-null  int64  
 12  prime       35379 non-null  int64  
 13  real_date   35379 non-null  object 
 14  설명          1672 non-null   object 
 15  IsHoliday   35379 non-null  float64
 16  지속휴일수       35379 non-null  float64
 17  TEMP        35379 non-null  float64
 18  HUM         35379 non-null  float64
 19  weekofyear  35379 non-nul

In [5]:
# Load the stage table; its 'stage' column is merged onto pf in the next cell.
stage = pd.read_csv("/home/yeeunlee/bigcon2020_parrot/prep/stage_1.csv")
stage.head()

Unnamed: 0,방송일시,date,상품코드,노출(분),stage
0,2019-01-01 06:00:00,2019-01-01 00:00:00,201072,20.0,early
1,2019-01-01 06:20:00,2019-01-01 00:00:00,201072,20.0,mid
2,2019-01-01 06:40:00,2019-01-01 00:00:00,201072,20.0,late
3,2019-01-01 06:00:00,2019-01-01 00:00:00,201079,20.0,early
4,2019-01-01 06:20:00,2019-01-01 00:00:00,201079,20.0,mid


In [6]:
# Attach the 'stage' label to each performance row, matching on broadcast
# timestamp and product code (pandas' default inner join).
pf = pd.merge(
    pf,
    stage.loc[:, ['방송일시', '상품코드', 'stage']],
    on=['방송일시', '상품코드'],
)
pf.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,hour,prime,real_date,설명,IsHoliday,지속휴일수,TEMP,HUM,weekofyear,stage
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019-01-01 00:00:00,1,...,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958,1,early
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019-01-01 00:00:00,1,...,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958,1,early


### Encoding

timestamp 컬럼은 object type으로 표시되지만 label encoding 대상이 아니므로, encoder를 구성할 때 인코딩할 컬럼 목록(`obj`)을 미리 지정해 둔다.

In [7]:
import sklearn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import gc

In [8]:
# Columns treated as categorical from here on; casting them to str up front
# makes them object dtype, which the label-encoding cell below checks for.
obj = ['마더코드', '상품코드', '상품군', 'stage', 'weekofyear']
pf = pf.astype({col: 'U' for col in obj})

# Drop the '설명' column (mostly null according to the info() output above).
pf = pf.drop(columns=['설명'])

In [9]:
pf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35379 entries, 0 to 35378
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   방송일시        35379 non-null  object 
 1   노출(분)       35379 non-null  float64
 2   마더코드        35379 non-null  object 
 3   상품코드        35379 non-null  object 
 4   상품명         35379 non-null  object 
 5   상품군         35379 non-null  object 
 6   판매단가        35379 non-null  int64  
 7   취급액         35379 non-null  float64
 8   date        35379 non-null  object 
 9   week        35379 non-null  int64  
 10  time        35379 non-null  object 
 11  hour        35379 non-null  int64  
 12  prime       35379 non-null  int64  
 13  real_date   35379 non-null  object 
 14  IsHoliday   35379 non-null  float64
 15  지속휴일수       35379 non-null  float64
 16  TEMP        35379 non-null  float64
 17  HUM         35379 non-null  float64
 18  weekofyear  35379 non-null  object 
 19  stage       35379 non-nul

In [10]:
# Label-encode the categorical columns into integer codes.
dtypes = pf.dtypes

# Fit one LabelEncoder per column listed in obj (only those of object dtype).
encoders = {
    column: LabelEncoder().fit(pf[column])
    for column in obj
    if str(dtypes[column]) == 'object'
}

# Work on a copy so pf keeps the original string labels.
_data = pf.copy()
for column, encoder in encoders.items():
    _data[column] = encoder.transform(pf[column])

In [11]:
_data.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,time,hour,prime,real_date,IsHoliday,지속휴일수,TEMP,HUM,weekofyear,stage
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,1.0,1.0,-6.576974,63.524958,0,0
1,2019-01-01 06:00:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,4371000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,1.0,1.0,-6.576974,63.524958,0,0


In [12]:
_data['상품코드'].max()

2030

In [13]:
len(_data)

35379

In [14]:
_data.describe()

Unnamed: 0,노출(분),마더코드,상품코드,상품군,판매단가,취급액,week,hour,prime,IsHoliday,지속휴일수,TEMP,HUM,weekofyear,stage
count,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0
mean,20.33936,322.54244,989.195427,5.072642,387859.0,23102410.0,3.002911,14.110574,0.351932,0.322479,0.708499,13.929747,62.541314,25.048814,0.973911
std,3.094151,204.420435,599.916027,3.05361,626408.8,20057900.0,2.010633,6.568274,0.47758,0.467432,1.115861,9.628727,18.167043,15.030482,0.832669
min,2.466667,0.0,0.0,0.0,12800.0,103000.0,0.0,0.0,0.0,0.0,0.0,-8.57764,16.928262,0.0,0.0
25%,20.0,122.0,447.0,3.0,59000.0,7998500.0,1.0,9.0,0.0,0.0,0.0,5.601739,49.20781,12.0,0.0
50%,20.0,301.0,951.0,5.0,99000.0,17326000.0,3.0,15.0,0.0,0.0,0.0,14.790834,63.574864,25.0,1.0
75%,20.0,490.0,1510.0,8.0,399000.0,32763000.0,5.0,20.0,1.0,1.0,2.0,22.279067,77.504486,38.0,2.0
max,40.0,686.0,2030.0,10.0,7930000.0,322009000.0,6.0,23.0,1.0,1.0,5.0,33.703554,96.144207,51.0,3.0


binary encoding을 진행하기 전에 최대값을 이진수로 바꾸었을 때 제일 긴 값의 길이가 몇인지 확인한다.

In [15]:
# For each categorical column, count the binary digits of its largest label code.
_data.describe()[obj].loc['max'].apply(lambda x: len(bin(int(x)).replace("0b", "")))

마더코드          10
상품코드          11
상품군            4
stage          2
weekofyear     6
Name: max, dtype: int64

In [16]:
# categorical variables to binary encoding
# bin(max(_data['상품명'])).replace("0b", '')

def binary_encoding(name, x, width=None):
    """Return the fixed-width binary digits of a label code, most significant first.

    Args:
        name: Column the code comes from. '마더코드', '상품코드', 'stage' and
            'weekofyear' use 10, 11, 2 and 6 digits; any other name
            (e.g. '상품군') uses 4.
        x: Non-negative integer label code.
        width: Optional digit count that overrides the per-column default.

    Returns:
        List of 0/1 ints, left-padded with zeros to the chosen width. A code
        that needs more digits than the width is returned unpadded, i.e. as
        a longer list.
    """
    # Digit counts per column; names not listed fall back to 4 digits.
    default_widths = {'마더코드': 10, '상품코드': 11, 'stage': 2, 'weekofyear': 6}
    if width is None:
        width = default_widths.get(name, 4)
    digits = bin(x).replace("0b", "").rjust(width, "0")
    return [int(digit) for digit in digits]
    

In [17]:
# Mother code (마더코드): expand the label code into 10 binary columns m_0..m_9
# on a copy of _data.
final = _data.copy()
final[['m_'+str(i) for i in range(10)]] = _data['마더코드'].apply(lambda x: binary_encoding('마더코드', x)).to_list()
# final.head(2)

In [18]:
# Product code (상품코드): expand the label code into 11 binary columns p_0..p_10.
final[['p_'+str(i) for i in range(11)]] = _data['상품코드'].apply(lambda x: binary_encoding('상품코드', x)).to_list()
# final.head(2)

In [19]:
# Product group (상품군): expand the label code into 4 binary columns g_0..g_3.
final[['g_'+str(i) for i in range(4)]] = _data['상품군'].apply(lambda x: binary_encoding('상품군', x)).to_list()
# final.head(2)

In [20]:
# stage: expand the label code into 2 binary columns s_0..s_1.
final[['s_'+str(i) for i in range(2)]] = _data['stage'].apply(lambda x: binary_encoding('stage', x)).to_list()


In [21]:
# weekofyear: expand the label code into 6 binary columns w_0..w_5.
final[['w_'+str(i) for i in range(6)]] = _data['weekofyear'].apply(lambda x: binary_encoding('weekofyear', x)).to_list()
final.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,g_2,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,...,1,0,0,0,0,0,0,0,0,0
1,2019-01-01 06:00:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,4371000.0,2019-01-01 00:00:00,1,...,1,0,0,0,0,0,0,0,0,0


In [22]:
# Persist the encoded table (original columns plus the binary digit columns), without the index.
final.to_csv("encoded_data.csv", index = False)

In [22]:
# Drop the raw code and product-name columns; the three codes are represented
# by the binary digit columns added above.
train = final.drop(columns=['마더코드', '상품코드', '상품명', '상품군'])
display(train.head(2))
train.describe()

Unnamed: 0,방송일시,노출(분),판매단가,취급액,date,week,time,hour,prime,real_date,...,g_2,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5
0,2019-01-01 06:00:00,20.0,39900,2099000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,...,1,0,0,0,0,0,0,0,0,0
1,2019-01-01 06:00:00,20.0,39900,4371000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,...,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,노출(분),판매단가,취급액,week,hour,prime,IsHoliday,지속휴일수,TEMP,HUM,...,g_2,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5
count,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,...,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0,35379.0
mean,20.33936,387859.0,23102410.0,3.002911,14.110574,0.351932,0.322479,0.708499,13.929747,62.541314,...,0.310071,0.588852,0.304446,0.365019,0.370446,0.384635,0.458775,0.465643,0.503265,0.501088
std,3.094151,626408.8,20057900.0,2.010633,6.568274,0.47758,0.467432,1.115861,9.628727,18.167043,...,0.462529,0.492049,0.460179,0.481442,0.482931,0.486516,0.498305,0.498825,0.499996,0.500006
min,2.466667,12800.0,103000.0,0.0,0.0,0.0,0.0,0.0,-8.57764,16.928262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,59000.0,7998500.0,1.0,9.0,0.0,0.0,0.0,5.601739,49.20781,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,20.0,99000.0,17326000.0,3.0,15.0,0.0,0.0,0.0,14.790834,63.574864,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,20.0,399000.0,32763000.0,5.0,20.0,1.0,1.0,2.0,22.279067,77.504486,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,40.0,7930000.0,322009000.0,6.0,23.0,1.0,1.0,5.0,33.703554,96.144207,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Remove the timestamp/date string columns so they are not part of the model inputs.
train = train.drop(columns=['방송일시', 'date', 'time', 'real_date'])
train.head(2)

Unnamed: 0,노출(분),판매단가,취급액,week,hour,prime,IsHoliday,지속휴일수,TEMP,HUM,...,g_2,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5
0,20.0,39900,2099000.0,1,6,0,1.0,1.0,-6.576974,63.524958,...,1,0,0,0,0,0,0,0,0,0
1,20.0,39900,4371000.0,1,6,0,1.0,1.0,-6.576974,63.524958,...,1,0,0,0,0,0,0,0,0,0


### Scale (Min-Max Scaler)

In [24]:
# Min-max scale the TEMP and HUM columns in place (MinMaxScaler's default range is [0, 1]).
# NOTE(review): the scaler is fit on all rows, before the train/validation
# split performed further down — confirm this is intended.
scaler = MinMaxScaler()
cols = ['TEMP', 'HUM']
train[cols] = scaler.fit_transform(train[cols])

In [25]:
train.shape

(35379, 45)

## Modeling

In [26]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.layers import Embedding, LSTM, concatenate
from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
import tensorflow.keras.backend as K
print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2.0.0-alpha0


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Dataset

In [27]:
# The target is the 취급액 column (kept as a one-column frame); every other
# column of train is a feature.
Y = train[['취급액']]
X = train.drop(columns=['취급액'])
print("data shape : ", X.shape)
print("label shape : " , Y.shape)

data shape :  (35379, 44)
label shape :  (35379, 1)


In [28]:
# Hold out 10% of the rows for validation; random_state makes the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.1, random_state = 123457)
print("x_train shape : ", x_train.shape)
print("y_train shape : ", y_train.shape)
print("x_valid shape : ", x_valid.shape)
print("y_valid shape : ", y_valid.shape)

x_train shape :  (31841, 44)
y_train shape :  (31841, 1)
x_valid shape :  (3538, 44)
y_valid shape :  (3538, 1)


## Filtering
- test data에서 기존 dataset에 학습된 기록이 있는지 검사하기(상품코드) - encoder에서 검사할 수 있을듯???
    - 그러면 애초에 train, valid나눈 상황에서 돌아가는지 확인해야 할듯, 지금은 encoding을 하고 나서 train, valid나누는 방식
    - 없을 때 : 1)마더코드 조회, 2)binary encoded 부분을 전부 0으로 채우고 진행한다.

In [None]:
# Encoder 조회


In [34]:
encoders['마더코드']

LabelEncoder()

## XGBoost

In [29]:
import xgboost as xgb
import seaborn as sns

In [30]:
xgb.__version__

'1.2.0'

In [31]:
# Gradient-boosted regressor: 100 trees, maximum depth 20, learning rate 0.1.
model1 = xgb.XGBRegressor(learning_rate = 0.1,
                          max_depth = 20,
                          n_estimators = 100)

In [32]:
# Train model1 on the training split (y_train is still the one-column DataFrame here).
model1.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=20,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# y_train is left as a DataFrame; only the validation target is flattened to 1-D.
y_valid = y_valid.values.ravel()

In [None]:
# Actual vs. predicted on the training split. seaborn 0.12+ accepts x/y only as
# keywords, and the target is flattened so a one-column frame plots as a 1-D series.
sns.scatterplot(x=np.ravel(y_train), y=model1.predict(x_train))

In [None]:
# Actual vs. predicted on the validation split; x/y are passed as keywords
# because seaborn 0.12+ no longer accepts them positionally.
y_pred = model1.predict(x_valid)
sns.scatterplot(x=np.ravel(y_valid), y=y_pred)

In [None]:
model1.score(x_valid, y_valid)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred):
    """Compute the mean absolute percentage error (MAPE) as a fraction.

    Args:
        y_true: Actual values; any array-like (list, Series, one-column frame).
        y_pred: Predicted values with the same number of elements.

    Returns:
        Tuple ``('MAPE', value)`` where value is
        ``mean(abs((y_true - y_pred) / y_true))``. Zeros in ``y_true`` produce
        inf/nan, following NumPy division semantics.

    Raises:
        ValueError: If the inputs hold a different number of elements.
    """
    # Flatten both inputs: subtracting an (n,) vector from an (n, 1) column
    # would broadcast to an (n, n) matrix and silently distort the mean.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, "
            "got %d and %d" % (actual.size, predicted.size)
        )
    return ('MAPE', np.mean(np.abs((actual - predicted) / actual)))

In [None]:
mean_absolute_percentage_error(y_valid, y_pred)

In [None]:
# mean_absolute_percentage_error returns a ('MAPE', value) tuple, not the
# per-sample (gradient, hessian) pair XGBoost expects from a custom objective,
# so it cannot be passed as `objective`. Train with the default regression
# objective and compute MAPE on the predictions afterwards.
model2 = xgb.XGBRegressor(learning_rate = 0.1,
                          max_depth = 20,
                          n_estimators = 100)

In [None]:
# Turn y_train into an (n, 1) NumPy column whether it is still a DataFrame or
# already an array; `[:, np.newaxis]` indexing is not valid on a DataFrame.
y_train = np.asarray(y_train).reshape(-1, 1)
y_train.shape

In [None]:
# Fit model2, evaluating on both the training and the validation split via eval_set.
model2.fit(x_train, y_train,
          eval_set = [(x_train, y_train), (x_valid, y_valid)])

In [None]:
# Save model1 (not model2) to a JSON file.
model1.save_model("xgb20.json")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with max_depth 100. The target is flattened to 1-D because
# scikit-learn emits a DataConversionWarning when y is passed as an (n, 1) column.
rf = RandomForestRegressor(max_depth = 100,
                           random_state = 0)
rf.fit(x_train, np.ravel(y_train))

In [None]:
# sns.scatterplot(_data['취급액'], rf.predict(X))

In [None]:
# Validation predictions of the first forest (rf) and their MAPE.
rf_pred = rf.predict(x_valid)
mean_absolute_percentage_error(y_valid, rf_pred)

In [None]:
# Actual vs. predicted for the first forest; x/y are passed as keywords because
# seaborn 0.12+ no longer accepts them positionally.
sns.scatterplot(x=np.ravel(y_valid), y=rf_pred)

In [None]:
rf.score(x_train, y_train)

In [None]:
rf.score(x_valid, y_valid)

## Average the predictions of 3 RF models

In [None]:
# Second forest: max_depth 30, different seed. The target is flattened to 1-D to
# avoid scikit-learn's DataConversionWarning for a column-vector y.
rf_1 = RandomForestRegressor(max_depth = 30,
                           random_state = 1)
rf_1.fit(x_train, np.ravel(y_train))

In [None]:
# Third forest: max_depth 20. The target is flattened to 1-D to avoid
# scikit-learn's DataConversionWarning for a column-vector y.
rf_2 = RandomForestRegressor(max_depth = 20,
                           random_state = 0)
rf_2.fit(x_train, np.ravel(y_train))

In [None]:
# Validation predictions of the two shallower forests (max_depth 30 and 20).
rf1_pred = rf_1.predict(x_valid)
rf2_pred = rf_2.predict(x_valid)

In [None]:
# Unweighted mean of the three forests' validation predictions.
# Note that rf_3 holds the averaged predictions, not a fitted model.
rf_3 = (rf_pred+rf1_pred+rf2_pred) / 3
rf_3

In [None]:
mean_absolute_percentage_error(y_valid, rf_3)

In [None]:
# Actual vs. averaged prediction; x/y are passed as keywords because
# seaborn 0.12+ no longer accepts them positionally.
sns.scatterplot(x=np.ravel(y_valid), y=rf_3)

## LightGBM