参考文献1：[Cheng Guo](https://arxiv.org/pdf/1604.06737.pdf)

# 训练entity embedding模型

## 0. 准备工作
### 0.1 调入库函数

In [1]:
#基本计算类
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

#可视化
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display 

#机器学习库函数
import sklearn
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Reshape
from keras.layers import Merge
from keras.layers.embeddings import Embedding
import xgboost as xgb


#时间类
import time
import datetime
from isoweek import Week

#文件类
import os

#其他
import itertools
import operator

#基本设定
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline
sns.set_style('whitegrid')

Using TensorFlow backend.


### 0.2 定义评估指标
本项目采用Kaggle比赛的评估指标：RMSPE（误差百分比的均方差），可表示为
$$
RMSPE= \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\frac{y_i-\hat{y_i}}{y_i})^2}
$$
其中，任何当天销售额为0的数据在评估时将被忽略； $y_i$ 表示某药店在某天的实际销售额，而$\hat{y_i}$ 表示该药店在对应这一天的预测销售额。

### 0.3 Setting seed

In [2]:
seed=42

### 0.4 将处理好的数据从本地硬盘读入

In [3]:
file_train_store_raw_df='train_store_raw_df.pickle'
file_test_store_raw_df='test_store_raw_df.pickle'
file_feature='feature_x_list.pickle'
path='Capstone_Project_Rossman_Sales_Prediction_1'

train_store_raw_df=pd.read_pickle(os.path.join(path, file_train_store_raw_df))
test_store_raw_df=pd.read_pickle(os.path.join(path, file_test_store_raw_df))
feature_x_list=pd.read_pickle(os.path.join(path, file_feature)).tolist()

### 0.5 每个特征所对应的unique数值
- dict_feature_range：每个feature所对应的范围
- dict_feature_offset：每个feature最小值移到0所对应的offset

In [4]:
len(feature_x_list)

18

In [5]:
dict_feature_range={}
dict_feature_offset={}

for ii in feature_x_list:
    unique_list=train_store_raw_df[ii].unique()
    max_v=max(unique_list)
    min_v=min(unique_list)
    dict_feature_range[ii]=(int)(max_v-min_v+1)
    dict_feature_offset[ii]=(int)(min_v)
    print '{0: <30}'.format(ii),':','unique=','{0: <5}'.format(len(unique_list)),\
    ',max=','{0: <5}'.format((max_v)),',min=','{0: <5}'.format((min_v)), \
    ',range=','{0: <5}'.format((dict_feature_range[ii])),',offset=','{0: <5}'.format((dict_feature_offset[ii]))
    


#     print ii,':','unique=',len(unique_list),',max=',max_v,',min=',min_v, \
#     ',range=',dict_feature_range[ii],',offset=',dict_feature_offset[ii]

Store                          : unique= 1115  ,max= 1115  ,min= 1     ,range= 1115  ,offset= 1    
DayOfWeek                      : unique= 7     ,max= 7     ,min= 1     ,range= 7     ,offset= 1    
Year                           : unique= 3     ,max= 2015  ,min= 2013  ,range= 3     ,offset= 2013 
Month                          : unique= 12    ,max= 12    ,min= 1     ,range= 12    ,offset= 1    
Day                            : unique= 31    ,max= 31    ,min= 1     ,range= 31    ,offset= 1    
DayOfYear                      : unique= 365   ,max= 365   ,min= 1     ,range= 365   ,offset= 1    
StoreType_cat                  : unique= 4     ,max= 3     ,min= 0     ,range= 4     ,offset= 0    
Assortment_cat                 : unique= 3     ,max= 2     ,min= 0     ,range= 3     ,offset= 0    
StateHoliday_cat               : unique= 4     ,max= 3     ,min= 0     ,range= 4     ,offset= 0    
SchoolHoliday                  : unique= 2     ,max= 1     ,min= 0     ,range= 2     ,offset= 0    


## 1. 准备数据
### 1.1 将数据对应feature_x_list每列添加offset，将数据的最小值设为0

In [6]:
modified_train_store_raw_df=train_store_raw_df.copy()
modified_test_store_raw_df=test_store_raw_df.copy()
for col in feature_x_list:
    modified_train_store_raw_df[col]=modified_train_store_raw_df[col]-dict_feature_offset[col]
    modified_test_store_raw_df[col]=modified_test_store_raw_df[col]-dict_feature_offset[col]
    
for ii in feature_x_list:
    unique_list=modified_train_store_raw_df[ii].unique()
    max_v=max(unique_list)
    min_v=min(unique_list)
    print '{0: <30}'.format(ii),':','unique=','{0: <5}'.format(len(unique_list)),\
    ',max=','{0: <5}'.format((max_v)),',min=','{0: <5}'.format((min_v)), \
    ',range=','{0: <5}'.format((int)(max_v-min_v+1)),',offset=','{0: <5}'.format((int)(min_v))    

Store                          : unique= 1115  ,max= 1114  ,min= 0     ,range= 1115  ,offset= 0    
DayOfWeek                      : unique= 7     ,max= 6     ,min= 0     ,range= 7     ,offset= 0    
Year                           : unique= 3     ,max= 2     ,min= 0     ,range= 3     ,offset= 0    
Month                          : unique= 12    ,max= 11    ,min= 0     ,range= 12    ,offset= 0    
Day                            : unique= 31    ,max= 30    ,min= 0     ,range= 31    ,offset= 0    
DayOfYear                      : unique= 365   ,max= 364   ,min= 0     ,range= 365   ,offset= 0    
StoreType_cat                  : unique= 4     ,max= 3     ,min= 0     ,range= 4     ,offset= 0    
Assortment_cat                 : unique= 3     ,max= 2     ,min= 0     ,range= 3     ,offset= 0    
StateHoliday_cat               : unique= 4     ,max= 3     ,min= 0     ,range= 4     ,offset= 0    
SchoolHoliday                  : unique= 2     ,max= 1     ,min= 0     ,range= 2     ,offset= 0    


### 1.2 构造test_df数据、train_df数据，valid_df数据

In [61]:
mask_train=(modified_train_store_raw_df['Date']<'2015-06-15') &(modified_train_store_raw_df['Outlier_3']==False) \
            & (modified_train_store_raw_df['Open']==1) & (modified_train_store_raw_df['Sales']>0)
mask_valid=(modified_train_store_raw_df['Date']>='2015-06-15') & (modified_train_store_raw_df['Open']==1)\
            & (modified_train_store_raw_df['Sales']>0)&(modified_train_store_raw_df['Outlier_3']==False)

df_train=modified_train_store_raw_df.loc[mask_train,feature_x_list]
df_valid=modified_train_store_raw_df.loc[mask_valid,feature_x_list]
y_train_data=np.array(modified_train_store_raw_df.loc[mask_train,'Sales'])
y_valid_data=np.array(modified_train_store_raw_df.loc[mask_valid,'Sales'])

df_test=modified_test_store_raw_df.loc[modified_test_store_raw_df['Open']==1]

### 1.3 验证test数据,valid数据的feature是否在train数据中

In [62]:
for ii in feature_x_list:
    set_unique_train=set(df_train[ii].unique())
    set_unique_valid=set(df_valid[ii].unique())
    set_unique_test=set(df_test[ii].unique())
    
    if set_unique_train.issuperset(set_unique_valid) & set_unique_train.issuperset(set_unique_test):
        print '{0: <30}'.format(ii),'= 1'
    else:
        print '{0: <30}'.format(ii),'= 0'

Store                          = 1
DayOfWeek                      = 1
Year                           = 1
Month                          = 1
Day                            = 1
DayOfYear                      = 1
StoreType_cat                  = 1
Assortment_cat                 = 1
StateHoliday_cat               = 1
SchoolHoliday                  = 1
Promo                          = 1
Promo2                         = 1
InPromo2Today                  = 1
DaysCountSinceCompetition_log  = 1
InCompetition                  = 1
InCompetitionToday             = 1
CompetitionDistance_log        = 1
DaysCountSincePromo2_log       = 1


## 2. 搭建Embedding模型，并预测结果
### 2.1 构造embedding模型所需要的train, valid数据

In [63]:
x_train_data=[]
x_valid_data=[]
for ii in feature_x_list:
    x_train_data.append(np.array(df_train[ii]))
    x_valid_data.append(np.array(df_valid[ii]))  
    


### 2.2 搭建embedding模型
- 构造list models，将每个feature对应的embedding装进去
    - 输入range: dict_feature_range
    - 对应的embedding数目:calc_embedding_space

In [71]:
def calc_embedding_space(x):
    dict_map={
        1115:50,
        365:30,
        31:10,
        16:10,
        15:10,
        12:8,
        7:6,
        4:3,        
        3:2
    }
    return dict_map[x]


models=[]
for ii in feature_x_list:
    input_range=dict_feature_range[ii]
    
    if input_range ==2:
        temp_name='Dense_'+ii
        model= Sequential(name=temp_name)
        model.add(Dense(1, input_dim=1))
        models.append(model)
    else:    
        embedding_space=calc_embedding_space(input_range)        
        temp_name='Embedding_'+ii
        model=Sequential(name=temp_name)
        model.add(Embedding(input_range,embedding_space,input_length=1))
        model.add(Reshape((embedding_space, ), input_shape=(1,embedding_space)))
        models.append(model)

In [83]:
def create_embedding_model():
    dropout_rate=0.1

    embedding_model=Sequential()
    embedding_model.add(Merge(models,mode='concat',concat_axis=-1))


    embedding_model.add(Dense(512,kernel_initializer='uniform'))
    embedding_model.add(Activation('relu'))
    embedding_model.add(Dense(128,kernel_initializer='uniform'))
    embedding_model.add(Activation('relu'))
    embedding_model.add(Dense(32,kernel_initializer='uniform'))
    embedding_model.add(Activation('relu'))
    embedding_model.add(Dropout(dropout_rate))
    embedding_model.add(Dense(1))
    embedding_model.add(Activation('sigmoid'))

    embedding_model.compile(loss='mean_absolute_error',optimizer='adam')
    return embedding_model

###  2.3 训练模型

In [84]:
max_log_y=np.max(np.log(y_train_data))

def _val_for_fit(val):
    val=np.log(val)
    return val/max_log_y
def _val_for_pred(val):
    return np.exp(val*max_log_y)

nb_epoch=10
for epoch_run in range(1,int(nb_epoch+1)):
    embedding_model=create_embedding_model()
    embedding_model.fit(x_train_data, _val_for_fit(y_train_data), 
                    validation_data=(x_valid_data,_val_for_fit(y_valid_data)),
#                     epochs=nb_epoch, 
                     epochs=epoch_run,
                        batch_size=256
                           # callbacks=[self.checkpointer],
                )

    
    modified_test_store_raw_df.loc[modified_test_store_raw_df.Open==0,'Sales']=0
    modified_test_store_raw_df['tuple_map']=modified_test_store_raw_df[feature_x_list].apply(tuple,axis=1)

    
    for index, row in modified_test_store_raw_df[modified_test_store_raw_df.Open==1].iterrows():
        input_value=(list(modified_test_store_raw_df.loc[index,'tuple_map']))
        t=[np.array([x]) for x in input_value]    
        modified_test_store_raw_df.loc[index,'Sales']=_val_for_pred(embedding_model.predict(t))[0][0]

    test_output=modified_test_store_raw_df[['Id','Sales']].copy()
    test_output.sort_values(by='Id',inplace=True)
    path='Capstone_Project_Rossman_Sales_Prediction_1'
    filename='{}epoch_{}drop_embedding_3Layers_run.csv'.format(epoch_run,int(dropout_rate*100))

    test_output.to_csv(os.path.join(path, filename),index=False)
    print filename+'------Done'

Train on 786180 samples, validate on 45177 samples
Epoch 1/1
1epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/2
Epoch 2/2
2epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
3epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
4epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
5epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
6epoch_10drop_embedding_3Layers_run.csv------Done
Train on 786180 samples, validate on 45177 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
7epoch_10drop_embedding_3Layers_run.csv------Done
Train on 7861

### 2.4 在valid数据上评估模型RMSPE结果

### 2.5 用训练好的模型预测结果

In [75]:
start_time = time.time()
modified_test_store_raw_df.loc[modified_test_store_raw_df.Open==0,'Sales']=0
modified_test_store_raw_df['tuple_map']=modified_test_store_raw_df[feature_x_list].apply(tuple,axis=1)

for index, row in modified_test_store_raw_df[modified_test_store_raw_df.Open==1].iterrows():
    input_value=(list(modified_test_store_raw_df.loc[index,'tuple_map']))
    t=[np.array([x]) for x in input_value]    
    modified_test_store_raw_df.loc[index,'Sales']=_val_for_pred(embedding_model.predict(t))[0][0]

test_output=modified_test_store_raw_df[['Id','Sales']].copy()
test_output.sort_values(by='Id',inplace=True)
path='Capstone_Project_Rossman_Sales_Prediction_1'
filename='{}epoch_{}drop_embedding_3Layers_result.csv'.format(nb_epoch,int(dropout_rate*100))

test_output.to_csv(os.path.join(path, filename),index=False)
print("--- %s seconds ---" % (time.time() - start_time))

--- 118.676782846 seconds ---



# 训练XGBoost模型
Refers to [XGBoost Feature Importance](https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2)

Based on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code

Public Score :  0.11389

Private Validation Score :  0.096959

## 3. 构建XGBoost模型，并预测结果
### 2.1 构造XGBoost模型所需要的train, valid数据

In [87]:
X_train,X_valid=df_train,df_valid
y_train=modified_train_store_raw_df.loc[mask_train,'SalesLog']
y_valid=modified_train_store_raw_df.loc[mask_valid,'SalesLog']

### 2.2 设置XGBoost模型的参数

In [92]:
print('training data processed')
params={
    'objective':'reg:linear',
    'booster':'gbtree',
    'eta':0.3,
    'max_depth':10,
    'subsample':0.9,
    'colsample_bytree':0.7,
    'silent':1,
    'seed':seed
}
num_boost_round=100

training data processed


### 2.3 训练模型

In [93]:
dtrain=xgb.DMatrix(X_train[feature_x_list],y_train)
dvalid=xgb.DMatrix(X_valid[feature_x_list],y_valid)

def rmspe(y,yhat):
    return np.sqrt(np.mean((yhat/y-1)**2))
def rmspe_xg(yhat,y):
    y=np.expm1(y.get_label())
    yhat=np.expm1(yhat)
    return 'rmspe',rmspe(y,yhat)

watchlist=[(dtrain,'train'),(dvalid,'eval')]
gbm=xgb.train(params,dtrain,num_boost_round,evals=watchlist,early_stopping_rounds=100,feval=rmspe_xg,verbose_eval=True)

Will train until eval error hasn't decreased in 100 rounds.
[0]	train-rmspe:0.996818	eval-rmspe:0.996931
[1]	train-rmspe:0.981394	eval-rmspe:0.982063
[2]	train-rmspe:0.937594	eval-rmspe:0.939122
[3]	train-rmspe:0.855673	eval-rmspe:0.857887
[4]	train-rmspe:0.742694	eval-rmspe:0.745082
[5]	train-rmspe:0.617960	eval-rmspe:0.618971
[6]	train-rmspe:0.503472	eval-rmspe:0.501832
[7]	train-rmspe:0.412915	eval-rmspe:0.408229
[8]	train-rmspe:0.348143	eval-rmspe:0.340473
[9]	train-rmspe:0.313813	eval-rmspe:0.303335
[10]	train-rmspe:0.290919	eval-rmspe:0.276755
[11]	train-rmspe:0.274935	eval-rmspe:0.258841
[12]	train-rmspe:0.273709	eval-rmspe:0.257537
[13]	train-rmspe:0.274886	eval-rmspe:0.257788
[14]	train-rmspe:0.275724	eval-rmspe:0.258048
[15]	train-rmspe:0.277641	eval-rmspe:0.259475
[16]	train-rmspe:0.278100	eval-rmspe:0.261810
[17]	train-rmspe:0.263861	eval-rmspe:0.247302
[18]	train-rmspe:0.259885	eval-rmspe:0.244582
[19]	train-rmspe:0.256477	eval-rmspe:0.242986
[20]	train-rmspe:0.255374	eval

### 2.4 利用训练好的模型进行预测

In [96]:
print('Validating')
yhat=gbm.predict(xgb.DMatrix(X_valid[feature_x_list]))
error=rmspe(y_valid,np.expm1(yhat))
print('RMSPE:{:.6f}'.format(error))


Validating
RMSPE:813.055656
