In [1]:
'''
Based on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code
Public Score :  0.11389
Private Validation Score :  0.096959
'''

'\nBased on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code\nPublic Score :  0.11389\nPrivate Validation Score :  0.096959\n'

In [1]:

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [2]:
def create_feature_map(features, path='xgb.fmap'):
    """Write a feature-map file with one line per entry of ``features``.

    Each line is ``<index><TAB><feature name><TAB>q`` followed by a newline,
    where ``<index>`` is the zero-based position of the name in ``features``.

    Parameters
    ----------
    features : iterable of str
        Feature names, in the order they are indexed.
    path : str, optional
        Destination file; defaults to ``'xgb.fmap'`` in the current working
        directory. An existing file is overwritten.
    """
    # The context manager closes the handle even if a write raises.
    with open(path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def rmspe(y, yhat):
    """Return the root mean squared percentage error of ``yhat`` against ``y``.

    Computed as ``sqrt(mean((yhat / y - 1) ** 2))``. ``y`` is used as the
    divisor, so zero targets are not handled here.
    """
    relative_error = yhat / y - 1
    return np.sqrt(np.mean(np.square(relative_error)))

def rmspe_xg(yhat, y):
    """Evaluation callback returning ``("rmspe", score)``.

    ``y`` must expose ``get_label()``; both the labels and the predictions
    ``yhat`` are passed through ``expm1`` before the RMSPE is computed,
    which undoes a ``log1p`` transform of the target.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score


In [213]:
def build_features(features, data):
    """Prepare model feature columns on ``data`` and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns prepared here, in
        this order: ``SchoolHoliday``, ``StoreType``, ``Assortment``,
        ``StateHoliday``, ``DayOfWeek``, ``month``, ``day``, ``year``,
        ``competiter``.
    data : pandas.DataFrame
        Modified in place. Must contain ``Open``, ``SchoolHoliday``,
        ``StoreType``, ``Assortment``, ``StateHoliday``, a datetime ``Date``
        column, ``CompetitionOpenSinceYear`` and ``CompetitionOpenSinceMonth``.

    Returns
    -------
    None
    """
    # A missing 'Open' flag means "assume the store is open". This must run
    # before the blanket fillna(0) below; afterwards no null is left to match
    # and the flag would silently become 0.
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Every other missing value becomes 0.
    data.fillna(0, inplace=True)

    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)

    # Encode the categorical letter codes as integers. The result is assigned
    # back explicitly: an in-place replace on `data.<column>` operates on a
    # temporary object and is not guaranteed to update the frame itself.
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].replace(mappings)

    # Calendar parts derived from Date. Note that DayOfWeek is overwritten
    # with pandas' numbering (Monday == 0).
    features.extend(['DayOfWeek', 'month', 'day', 'year'])
    data['year'] = data.Date.dt.year
    data['month'] = data.Date.dt.month
    data['day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek

    # competiter == 1 when the row's date is on or after the competitor's
    # opening month. Rows whose opening year was missing hold 0 after the
    # fillna above, so they always evaluate to 1.
    features.extend(['competiter'])
    opened_in_earlier_year = data['year'] > data['CompetitionOpenSinceYear']
    opened_earlier_this_year = (
        (data['year'] == data['CompetitionOpenSinceYear'])
        & (data['month'] >= data['CompetitionOpenSinceMonth'])
    )
    data['competiter'] = (opened_in_earlier_year | opened_earlier_this_year).astype('int')
        

In [314]:
print("Load the training, test and store data using pandas")
# Explicit column dtypes handed to both read_csv calls below.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(float),
    PromoInterval=np.dtype(str),
)
# parse_dates takes the zero-based position of the date column in each file.
# NOTE(review): position 3 for test.csv presumably reflects a leading Id
# column before Store/DayOfWeek/Date - confirm against the file.
train = pd.read_csv("data/train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("data/test.csv", dtype=types, parse_dates=[3])
store = pd.read_csv("data/store.csv")

Load the training, test and store data using pandas


In [315]:
# Compute the spend per customer for every row ("values" = Sales / Customers),
# then average every numeric column per store.
train['values'] = train['Sales'] / train['Customers']

# numeric_only=True keeps the aggregation to numeric columns; recent pandas
# raises TypeError when asked to average string/datetime columns such as
# StateHoliday or Date instead of silently dropping them.
store_mean = train.groupby('Store').mean(numeric_only=True)

In [316]:
# Duplicate the per-store averages under explicit "*_mean" names.
store_mean = store_mean.assign(
    Customers_mean=store_mean['Customers'],
    values_mean=store_mean['values'],
)

In [317]:
# Expose the group key as a regular 'Store' column so it can be used as a
# merge key. The index name is cleared because a frame whose index level and
# column are both called 'Store' makes `on='Store'` ambiguous, which modern
# pandas rejects with a ValueError.
store_mean['Store'] = store_mean.index
store_mean.index.name = None

In [318]:
# Attach the per-store averages to the store table (default inner join on Store).
store_averages = store_mean[['Store', 'Customers_mean', 'values_mean']]
store = store.merge(store_averages, on='Store')

In [319]:
# Preview only the first rows instead of rendering the whole training frame.
train.head(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,values
0,1,5,2015-07-31,5263,555,1,1,0,1,9.482883
1,2,5,2015-07-31,6064,625,1,1,0,1,9.702400
2,3,5,2015-07-31,8314,821,1,1,0,1,10.126675
3,4,5,2015-07-31,13995,1498,1,1,0,1,9.342457
4,5,5,2015-07-31,4822,559,1,1,0,1,8.626118
5,6,5,2015-07-31,5651,589,1,1,0,1,9.594228
6,7,5,2015-07-31,15344,1414,1,1,0,1,10.851485
7,8,5,2015-07-31,8492,833,1,1,0,1,10.194478
8,9,5,2015-07-31,8565,687,1,1,0,1,12.467249
9,10,5,2015-07-31,7185,681,1,1,0,1,10.550661


In [320]:
# Preview only the first rows instead of rendering the whole store frame.
store.head(10)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Customers_mean,values_mean
0,1,c,a,1270,9,2008,0,,,,467.646497,8.393038
1,2,a,a,570,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",486.045648,8.408443
2,3,a,a,14130,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",620.286624,9.117599
3,4,c,c,620,9,2009,0,,,,1100.057325,7.249827
4,5,a,a,29910,4,2015,0,,,,444.360934,8.611229
5,6,a,a,310,12,2013,0,,,,525.990446,8.634089
6,7,a,c,24000,4,2013,0,,,,791.474522,9.232635
7,8,a,a,7520,10,2014,0,,,,547.799363,8.290323
8,9,a,c,2030,8,2000,0,,,,479.487261,11.206413
9,10,a,a,3160,9,2009,0,,,,494.332272,9.331909


In [321]:
print("Assume store open, if not provided")
# Only the 'Open' flag is meant to default to 1. A frame-wide fillna(1) would
# also overwrite gaps in every other column with 1 (for example a NaN in
# train['values'] left by a 0/0 division), which is not the stated intent.
train['Open'] = train['Open'].fillna(1)
test['Open'] = test['Open'].fillna(1)

Assume store open, if not provided


In [322]:
print("Consider only open stores for training. Closed stores wont count into the score.")
# Drop rows flagged as closed (Open == 0).
is_open = train["Open"] != 0
train = train.loc[is_open]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
# Keep strictly positive sales so the RMSPE divisor is never zero.
has_sales = train["Sales"] > 0
train = train.loc[has_sales]

Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe


In [323]:
print("Join with store")
# Add the store-level columns to every train/test row (default inner join on Store).
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

Join with store


In [324]:
# Preview only the first rows of the joined training frame.
train.head(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,values,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Customers_mean,values_mean
0,1,5,2015-07-31,5263,555,1,1,0,1,9.482883,...,a,1270,9,2008,0,,,,467.646497,8.393038
1,1,4,2015-07-30,5020,546,1,1,0,1,9.194139,...,a,1270,9,2008,0,,,,467.646497,8.393038
2,1,3,2015-07-29,4782,523,1,1,0,1,9.143403,...,a,1270,9,2008,0,,,,467.646497,8.393038
3,1,2,2015-07-28,5011,560,1,1,0,1,8.948214,...,a,1270,9,2008,0,,,,467.646497,8.393038
4,1,1,2015-07-27,6102,612,1,1,0,1,9.970588,...,a,1270,9,2008,0,,,,467.646497,8.393038
5,1,6,2015-07-25,4364,500,1,0,0,0,8.728000,...,a,1270,9,2008,0,,,,467.646497,8.393038
6,1,5,2015-07-24,3706,459,1,0,0,0,8.074074,...,a,1270,9,2008,0,,,,467.646497,8.393038
7,1,4,2015-07-23,3769,503,1,0,0,0,7.493042,...,a,1270,9,2008,0,,,,467.646497,8.393038
8,1,3,2015-07-22,3464,463,1,0,0,0,7.481641,...,a,1270,9,2008,0,,,,467.646497,8.393038
9,1,2,2015-07-21,3558,469,1,0,0,0,7.586354,...,a,1270,9,2008,0,,,,467.646497,8.393038


In [325]:
# Columns used as-is; build_features appends the engineered ones to this list.
features = [
    'Store',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo',
    'Promo2',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'Customers_mean',
]

print("augment features")
build_features(features, train)
# The test frame gets the same columns; its feature-name list is discarded.
build_features([], test)
print(features)

augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Customers_mean', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year', 'competiter']


In [326]:
# Spot-check the 'competiter' flag for stores whose competitor opened in 2015.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
# Only the first rows are shown to keep the output small.
train[train['CompetitionOpenSinceYear'] == 2015].sort_values(['Store', 'Date']).head(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,values,...,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Customers_mean,values_mean,year,month,day,competiter
3906,5,2,2013-01-02,4253,577,1,0,0,1,7.370884,...,0,0,0,0,444.360934,8.611229,2013,1,2,0
3905,5,3,2013-01-03,3465,491,1,0,0,0,7.057026,...,0,0,0,0,444.360934,8.611229,2013,1,3,0
3904,5,4,2013-01-04,4456,533,1,0,0,0,8.360225,...,0,0,0,0,444.360934,8.611229,2013,1,4,0
3903,5,5,2013-01-05,1590,202,1,0,0,0,7.871287,...,0,0,0,0,444.360934,8.611229,2013,1,5,0
3902,5,0,2013-01-07,6978,717,1,1,0,0,9.732218,...,0,0,0,0,444.360934,8.611229,2013,1,7,0
3901,5,1,2013-01-08,5718,613,1,1,0,0,9.327896,...,0,0,0,0,444.360934,8.611229,2013,1,8,0
3900,5,2,2013-01-09,5974,697,1,1,0,0,8.571019,...,0,0,0,0,444.360934,8.611229,2013,1,9,0
3899,5,3,2013-01-10,4999,596,1,1,0,0,8.387584,...,0,0,0,0,444.360934,8.611229,2013,1,10,0
3898,5,4,2013-01-11,5159,562,1,1,0,0,9.179715,...,0,0,0,0,444.360934,8.611229,2013,1,11,0
3897,5,5,2013-01-12,1760,223,1,0,0,0,7.892377,...,0,0,0,0,444.360934,8.611229,2013,1,12,0


In [333]:
print('training data processed')

# Booster settings passed to xgb.train, and the maximum number of boosting rounds.
params = dict(
    objective="reg:linear",
    booster="gbtree",
    eta=0.09,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
    seed=1301,
)
num_boost_round = 500

training data processed


In [328]:
# Final feature set used for training and prediction. It replaces the list
# built earlier: the CompetitionOpenSince* columns are left out and
# 'values_mean' is included.
features = (
    ['Store', 'CompetitionDistance']
    + ['Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']
    + ['SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday']
    + ['DayOfWeek', 'month', 'day', 'year']
    + ['competiter', 'Customers_mean', 'values_mean']
)

In [329]:
print("Train a XGBoost model")
# Hold out 1.2% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
# The model is fitted on log1p(Sales); predictions are mapped back with expm1.
y_train = np.log1p(X_train['Sales'])
y_valid = np.log1p(X_valid['Sales'])
dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

Train a XGBoost model


In [334]:
# Report RMSPE on both sets each round; training stops once the last entry
# ('eval') has not improved for 100 rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=rmspe_xg,
    verbose_eval=True,
)

Will train until eval error hasn't decreased in 100 rounds.
[0]	train-rmspe:0.999985	eval-rmspe:0.999572
[1]	train-rmspe:0.999918	eval-rmspe:0.999015
[2]	train-rmspe:0.999679	eval-rmspe:0.998042
[3]	train-rmspe:0.998963	eval-rmspe:0.996460
[4]	train-rmspe:0.997164	eval-rmspe:0.994029
[5]	train-rmspe:0.993042	eval-rmspe:0.990466
[6]	train-rmspe:0.986588	eval-rmspe:0.985455
[7]	train-rmspe:0.977158	eval-rmspe:0.978689
[8]	train-rmspe:0.968983	eval-rmspe:0.969829
[9]	train-rmspe:0.959351	eval-rmspe:0.958625
[10]	train-rmspe:0.944806	eval-rmspe:0.944886
[11]	train-rmspe:0.928470	eval-rmspe:0.928380
[12]	train-rmspe:0.909280	eval-rmspe:0.909154
[13]	train-rmspe:0.887495	eval-rmspe:0.887287
[14]	train-rmspe:0.863079	eval-rmspe:0.862818
[15]	train-rmspe:0.836403	eval-rmspe:0.835991
[16]	train-rmspe:0.807621	eval-rmspe:0.807031
[17]	train-rmspe:0.777181	eval-rmspe:0.776396
[18]	train-rmspe:0.745334	eval-rmspe:0.744299
[19]	train-rmspe:0.712487	eval-rmspe:0.711127
[20]	train-rmspe:0.679097	eval

In [335]:
print("Validating")
# Predict on the hold-out rows and score on the original sales scale.
valid_matrix = xgb.DMatrix(X_valid[features])
yhat = gbm.predict(valid_matrix)
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

Validating
RMSPE: 0.093097


# submission

In [336]:
# Predict log1p(Sales) for the test rows, map back with expm1 and write the
# submission file with one (Id, Sales) row per test record.
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
submission_columns = {"Id": test["Id"], 'Sales': np.expm1(test_probs)}
result = pd.DataFrame(submission_columns)
result.to_csv("xgboost_10_submission.csv", index=False)

# feature importance

In [337]:
# Write the feature map so the booster reports scores under the column names,
# then order the features from least to most important.
create_feature_map(features)
fscores = gbm.get_fscore(fmap='xgb.fmap')
importance = sorted(fscores.items(), key=operator.itemgetter(1))

# Normalise the raw scores so they sum to 1.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
fscore_total = df['fscore'].sum()
df['fscore'] = df['fscore'] / fscore_total

# Horizontal bar chart of the relative importances, saved to a PNG file.
featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
featp.set_title('XGBoost Feature Importance')
featp.set_xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)
