# 서강패럿(챔피언스리그, NS Shop+)

### 머신러닝 기법을 활용한 편성 실적 예측
- 시간 변수 전처리
- 데이터 인코딩 및 스케일링
- 메타 데이터 추가(날씨, CPI)
- 상품 필터링 로직

In [1]:
import numpy as np
import pandas as pd
import os
print("numpy version : ", np.__version__, ", pandas version : ", pd.__version__)

numpy version :  1.16.0 , pandas version :  1.1.1


## 전처리된 데이터 로드

1. 실적데이터(제공데이터)
    - 요일(week), 주차(weekofyear), 시간(hour)
    - 프라임타임(실적 기반, prime)
    - 공휴일 여부(IsHoliday), 지속휴일수
2. 날씨 데이터(날씨 API)
    - 기온(TEMP), 습도(HUM)  
3. 경제지표
    - CPI(소비자 물가지수)
4. 십진수로 Ordinal Encoded
    - obj = ['마더코드', '상품코드', '상품군', 'week', 'hour', 'weekofyear']
5. Binary encoded

In [2]:
# 데이터 저장과정에서 생길 수 있는 unnamed column 제거
def drop_unnamed(data):
    """Remove every column whose label contains "Unnamed" and return the frame.

    Intended for the stray "Unnamed: N" columns that can show up after a
    DataFrame has been saved and loaded again.

    Args:
        data: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, without the matching columns.
    """
    # Only string labels can match. Evaluating `"Unnamed" in name` on a
    # non-string label (e.g. an integer column name) raises TypeError, so
    # such labels are skipped and their columns are kept.
    unnamed = [
        name for name in data.columns
        if isinstance(name, str) and "Unnamed" in name
    ]
    if unnamed:
        data.drop(columns=unnamed, inplace=True)
    return data

In [3]:
# Load the preprocessed performance CSV and strip any "Unnamed" columns from it.
pf = drop_unnamed(pd.read_csv("~/bigcon2020_parrot/prep/data/final_performance_v4.csv"))
# Preview the first two rows.
pf.head(2)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03
1,2019-01-01 06:00:00,20.0,296,898,테이트 여성 셀린니트3종,6,39900,4371000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,0,0,0,104.03


In [4]:
# Number of rows in the loaded table.
len(pf)

35379

## 방송 초/중/후반 정보 삽입

In [5]:
# 1-based position of each row within its (date, 상품코드) group.
pf['cumcount'] = pf.groupby(['date', '상품코드'])['노출(분)'].cumcount()+1
# Attach, per (date, 상품코드) group, the number of non-null 노출(분) values as 'count'.
# NOTE(review): count() skips NaN while cumcount() numbers every row, so p could
# exceed 1 if 노출(분) has missing values — confirm the column is fully populated.
# NOTE(review): merge() builds a new frame; row order/index may differ from the input — verify if order matters downstream.
pf = pf.merge(pf.groupby(['date', '상품코드'])['노출(분)'].count().reset_index().rename({'노출(분)' : 'count'}, axis = 1),
              on = ['date', '상품코드'])
# Relative position of the row inside its group (cumcount divided by count).
pf['p'] = pf['cumcount']/pf['count']

In [6]:
def stage(x):
    """Map a relative position to a stage code.

    Args:
        x: relative position within a group (a number).

    Returns:
        0 (early) when x < 0.34, 1 (mid) when 0.34 <= x < 0.67,
        and 2 (late) for everything else.
    """
    early_upper, mid_upper = 0.34, 0.67
    if x < early_upper:
        label = 0  # early
    elif x < mid_upper:
        label = 1  # mid
    else:
        # Values >= 0.67 land here, as does anything that compares False
        # against both bounds (e.g. NaN).
        label = 2  # late
    return label

def stage_advanced(x):
    """Assign a stage code to a row, with special cases for small groups.

    Args:
        x: a mapping-like row exposing 'count' and 'p'.

    Returns:
        -1 (short) when count is 1; for count 2, 0 (early) if p <= 0.5
        and 2 (late) otherwise; for any other count, the result of
        stage(p).
    """
    n_slots = x['count']
    if n_slots == 1:
        return -1  # short
    if n_slots != 2:
        # Three or more slots: fall back to the threshold-based labelling.
        return stage(x['p'])
    # Exactly two slots: there is no "mid", only early or late.
    return 0 if x['p'] <= 0.5 else 2

In [7]:
# Label every row with its stage code by applying stage_advanced row-wise.
pf['stage'] = pf.apply(stage_advanced, axis=1)
# Preview the first row.
pf.head(1)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,date,week,...,w_0,w_1,w_2,w_3,w_4,w_5,cpi,cumcount,count,p
0,2019-01-01 06:00:00,20.0,296,891,테이트 남성 셀린니트3종,6,39900,2099000.0,2019-01-01 00:00:00,1,...,0,0,0,0,0,0,104.03,1,3,0.333333


In [8]:
# Remove the intermediate columns that were only needed to derive 'stage'.
del pf['cumcount']
del pf['count']
del pf['p']

## 데이터 인코딩(scikit learn)
1. Ordinal Encoding(sklearn.preprocessing.LabelEncoder)
2. Binary Encoding(User defined)

In [9]:
import sklearn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm.auto import tqdm

In [10]:
"""
object를 인코딩해주는데, timestamp와 같이 변환하면 안되는 컬럼들이 있으므로,
변환할 컬럼들을 지정해준다.
stage는 ordinal하게 만들게 되면, 알파벳 순으로 late가 mid보다 앞에 위치하므로
인코딩을 따로 진행해 준다.
"""
# data type set
# Columns handled as categorical codes; their maxima are used further down
# to work out how many bits each one needs for binary encoding.
obj = ['마더코드', '상품코드', '상품군', 'week', 'hour', 'weekofyear']

In [11]:
# # Encoding into DECIMAL values
# dtypes = pf.dtypes
# encoders = {}

# # 미리 지정한 obj 컬럼들에 대해서만 인코딩 진행
# for column in obj:
#     if str(dtypes[column]) == 'object':
#         encoder = LabelEncoder()
#         encoder.fit(pf[column])
#         encoders[column] = encoder

# _data = pf.copy()
# for column in encoders.keys():
#     encoder = encoders[column]
#     # 인코딩 정보 저장
#     np.save(column+'_classes.npy', encoder.classes_)
#     _data[column] = encoder.transform(pf[column])

In [12]:
# For each column in obj, report the number of binary digits needed to
# represent that column's maximum value.
print('### 바이너리 인코딩 시 최대 길이 ###')
pf.describe()[obj].loc['max'].apply(lambda x: len(bin(int(x)).replace("0b", "")))

### 바이너리 인코딩 시 최대 길이 ###


마더코드          10
상품코드          11
상품군            4
week           3
hour           5
weekofyear     6
Name: max, dtype: int64

In [13]:
# categorical variables to binary encoding
# bin(max(_data['상품명'])).replace("0b", '')

def binary_encoding(name, x):
    """Return the binary digits of x as a list of ints, left-padded with zeros.

    Args:
        name: column name selecting the target width: '마더코드' -> 10,
            '상품코드' -> 11, 'week' -> 3, 'hour' -> 5, 'weekofyear' -> 6.
            Any other name (intended for '상품군') uses width 4.
        x: integer value to encode.

    Returns:
        List of 0/1 ints, most significant bit first. If x needs more digits
        than the target width, no padding is added and the list is longer
        than the width.
    """
    widths = {
        '마더코드': 10,
        '상품코드': 11,
        'week': 3,
        'hour': 5,
        'weekofyear': 6,
    }
    width = widths.get(name, 4)  # fallback width is the 상품군 case
    digits = bin(x).replace("0b", "")
    return [int(digit) for digit in digits.rjust(width, '0')]

In [14]:
# apply on dataframe

def binary_execute(df):
    """Expand the categorical code columns of df into per-bit columns.

    For each source column, binary_encoding is applied to every value and
    the resulting digits are stored in new columns named
    <prefix><bit index>. The new columns are added to df itself; the
    returned frame is the result of dropping the columns that are no longer
    used.

    Args:
        df: DataFrame containing '마더코드', '상품코드', '상품군', 'week',
            'weekofyear', 'hour', plus the columns dropped at the end
            ('상품명', '방송일시', 'date', 'time', 'real_date').

    Returns:
        DataFrame with the bit columns added and the unused columns removed.
    """
    # (source column, prefix of the new columns, number of bits)
    layout = (
        ('마더코드', 'm_', 10),
        ('상품코드', 'p_', 11),
        ('상품군', 'g_', 4),
        ('week', 'wd_', 3),
        ('weekofyear', 'wy_', 6),
        ('hour', 'h_', 5),
    )
    for source, prefix, n_bits in layout:
        bit_columns = [prefix + str(i) for i in range(n_bits)]
        encoded = df[source].apply(lambda x: binary_encoding(source, x))
        df[bit_columns] = encoded.to_list()

    # Drop the columns that are not used afterwards.
    unused = ['마더코드', '상품코드', '상품명', '상품군',
              '방송일시', 'date', 'time', 'real_date']
    return df.drop(unused, axis = 1)

# final = binary_execute(_data)
# display(final.head(1))
# print("(bin)features : ", final.columns)
# print("(bin)num of features : ", len(final.columns))

In [15]:
from sklearn.model_selection import train_test_split
# Drop the columns that are not used.
pf = pf.drop(['마더코드', '상품코드', '상품명', '상품군',
              '방송일시', 'date', 'time', 'real_date'], axis = 1)
# Preview the first two rows.
pf.head(2)

Unnamed: 0,노출(분),판매단가,취급액,week,hour,prime,IsHoliday,지속휴일수,TEMP,HUM,...,g_3,s_0,s_1,w_0,w_1,w_2,w_3,w_4,w_5,cpi
0,20.0,39900,2099000.0,1,6,0,1.0,1.0,-6.576974,63.524958,...,0,0,0,0,0,0,0,0,0,104.03
1,20.0,39900,3262000.0,1,6,0,1.0,1.0,-6.576974,63.524958,...,0,1,0,0,0,0,0,0,0,104.03


In [16]:
# Features: every column except the target 취급액.
X = pf.drop(['취급액'], axis = 1)
# Target: 취급액, kept as a single-column DataFrame (double brackets).
Y = pf[['취급액']]

In [18]:
# without any scaling method
# Hold out 20% of the rows for validation; random_state pins the split.
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=321)


# Report the shape of each split.
print("x_train : ", x_train.shape)
print("y_train : ", y_train.shape)
print("x_valid : ", x_valid.shape)
print("y_valid : ", y_valid.shape)

x_train :  (28303, 45)
y_train :  (28303, 1)
x_valid :  (7076, 45)
y_valid :  (7076, 1)


## Data Scaling(not yet)

- Log transformation(np.log)
- Min-Max scaling\
...

In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# temp = _data[['weekofyear', 'TEMP', 'HUM']].groupby('weekofyear').sum().reset_index()
# plt.bar(temp['weekofyear'], temp['TEMP'])

In [22]:
# temp = _data[['weekofyear', 'TEMP', 'HUM']].groupby('weekofyear').sum().reset_index()
# plt.bar(temp['weekofyear'], np.log(temp['TEMP']))

In [24]:
# plt.bar(temp['weekofyear'], temp['HUM'], color = 'red')

In [25]:
# plt.bar(temp['weekofyear'], np.log(temp['HUM']), color = 'red')

In [26]:
# temp_2 = _data[['weekofyear', 'cpi']].groupby('weekofyear').sum().reset_index()
# plt.bar(temp_2['weekofyear'], temp_2['cpi'], color = 'green')