## 딥러닝과 통계모델을 이용한 T-커머스 매출 예측

### Process
1. Sales record(실적 데이터)
2. weighted avverage of hourly product sales
3. sparsity control by Gaussian smoothing along time
4. Sparsity control by SVD
5. Decay by last sales day


In [1]:
import numpy as np
import pandas as pd
import os

In [4]:
def drop_unnamed(data):
    names = data.columns
    for name in names:
        if "Unnamed" in name:
            data.pop(name)
    return data

In [5]:
pf = pd.read_csv("/home/yeeunlee/bigcon2020_parrot/prep/data/final_performance.csv")
pf = drop_unnamed(pf)
pf.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,time,hour,prime,real_date,설명,IsHoliday,지속휴일수,TEMP,HUM
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019-01-01 00:00:00,1,06:00:00,6,0,2019-01-01 00:00:00,새해,1.0,1.0,-6.576974,63.524958


In [6]:
pf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35284 entries, 0 to 35283
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   방송일시       35284 non-null  object 
 1   노출(분)      35284 non-null  float64
 2   마더코드       35284 non-null  int64  
 3   상품코드       35284 non-null  int64  
 4   상품명        35284 non-null  object 
 5   상품군        35284 non-null  object 
 6   판매단가       35284 non-null  int64  
 7   취급액        35284 non-null  float64
 8   date       35284 non-null  object 
 9   week       35284 non-null  int64  
 10  time       35284 non-null  object 
 11  hour       35284 non-null  int64  
 12  prime      35284 non-null  int64  
 13  real_date  35284 non-null  object 
 14  설명         1668 non-null   object 
 15  IsHoliday  35284 non-null  float64
 16  지속휴일수      35284 non-null  float64
 17  TEMP       35284 non-null  float64
 18  HUM        35284 non-null  float64
dtypes: float64(6), int64(6), object(7)
memory usag

In [32]:
import sklearn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm.auto import tqdm
import gc

### Encoding

In [16]:
# data type set
obj = ['마더코드', '상품코드', 'week', 'hour', 'prime', 'IsHoliday']

for col in obj:
    pf[col] = pf[col].astype('U')

In [20]:
# Encoding
dtypes = pf.dtypes
encoders = {}
for column in pf.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(pf[column])
        encoders[column] = encoder

_data = pf.copy()
for column in encoders.keys():
    encoder = encoders[column]
    _data[column] = encoder.transform(_data[column])

In [None]:
# scale dataset
scaler = MinMaxScaler()
cols = ['TEMP', 'HUM']
_data[cols] = scal

In [21]:
_data.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,time,hour,prime,real_date,설명,IsHoliday,지속휴일수,TEMP,HUM
0,0,65,296,891,1512,6,236,6664,0,1,14,17,0,0,6,1,1,599,3593
1,0,65,296,898,1518,6,236,15883,0,1,14,17,0,0,6,1,1,599,3593


In [22]:
_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35284 entries, 0 to 35283
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   방송일시       35284 non-null  int64
 1   노출(분)      35284 non-null  int64
 2   마더코드       35284 non-null  int64
 3   상품코드       35284 non-null  int64
 4   상품명        35284 non-null  int64
 5   상품군        35284 non-null  int64
 6   판매단가       35284 non-null  int64
 7   취급액        35284 non-null  int64
 8   date       35284 non-null  int64
 9   week       35284 non-null  int64
 10  time       35284 non-null  int64
 11  hour       35284 non-null  int64
 12  prime      35284 non-null  int64
 13  real_date  35284 non-null  int64
 14  설명         35284 non-null  int64
 15  IsHoliday  35284 non-null  int64
 16  지속휴일수      35284 non-null  int64
 17  TEMP       35284 non-null  int64
 18  HUM        35284 non-null  int64
dtypes: int64(19)
memory usage: 5.1 MB


## DNN Regression

In [25]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Embedding, LSTM, concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
import tensorflow.keras.backend as K
print(tf.__version__)

2.0.0-alpha0


In [24]:
X = _data.drop(['취급액'], axis = 1)
Y = _data[['취급액']]
print("data shape : ", X.shape)
print("label shape : " , Y.shape)

data shape :  (35284, 18)
label shape :  (35284, 1)


In [29]:
K.clear_session()
model = Sequential()

model.add(Dense(128, input_shape = (18, ), activation = 'relu'))
model.add(Dense(64, activation = 'relu'))

model.add(Dropout(0.1))
model.add(Dense(1))

model.compile(optimizer = RMSprop(),
             loss = 'mse',
             metrics = ['mse'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               2432      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 10,753
Trainable params: 10,753
Non-trainable params: 0
_________________________________________________________________


In [31]:
# fit model
hist = model.fit(X, Y, epochs = 10, verbose = 1, validation_split = 0.1)

Train on 31755 samples, validate on 3529 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
