In [1]:
'''
Based on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code
Public Score :  0.10826
Private Validation Score :  0.090811
'''

'\nBased on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code\nPublic Score :  0.10826\nPrivate Validation Score :  0.090811\n'

In [2]:

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [3]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)


In [152]:
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)

    features.extend(['StoreType', 'Assortment', 'StateHoliday']) 
    #features.extend(['StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'month', 'day', 'year'])
    data['year'] = data.Date.dt.year
    data['month'] = data.Date.dt.month
    data['day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    
    features.extend(['competiter'])
    #competiter
    data['competiter'] = ((data['year'] > data['CompetitionOpenSinceYear']) | \
                          ((data['year'] == data['CompetitionOpenSinceYear']) & (data['month'] >=data['CompetitionOpenSinceMonth']))).astype('int')
    
    
    

In [143]:
# calulate statistical value 
def build_sts_features(train, store):
    
    train['values']=train['Sales']/train['Customers']
    store_mean=train.groupby('Store').mean()
    store_mean['Customers_mean'] = store_mean['Customers']
    store_mean['values_mean'] = store_mean['values']
    store_mean['Store']=store_mean.index
    
    store = pd.merge(store, store_mean[['Store','Customers_mean','values_mean']], on='Store')
    return store

In [126]:
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("data/store.csv")

Load the training, test and store data using pandas


In [127]:
store =build_sts_features(train, store)

In [82]:
store

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Customers_mean,values_mean
0,1,c,a,1270,9,2008,0,,,,467.646497,8.393038
1,2,a,a,570,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",486.045648,8.408443
2,3,a,a,14130,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",620.286624,9.117599
3,4,c,c,620,9,2009,0,,,,1100.057325,7.249827
4,5,a,a,29910,4,2015,0,,,,444.360934,8.611229
5,6,a,a,310,12,2013,0,,,,525.990446,8.634089
6,7,a,c,24000,4,2013,0,,,,791.474522,9.232635
7,8,a,a,7520,10,2014,0,,,,547.799363,8.290323
8,9,a,c,2030,8,2000,0,,,,479.487261,11.206413
9,10,a,a,3160,9,2009,0,,,,494.332272,9.331909


In [114]:
print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

Assume store open, if not provided


In [115]:
print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe


In [162]:
print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

train=train[train['Store']==1]

Join with store


In [163]:
features = []
features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek',
                     'Promo2SinceYear','Customers_mean','values_mean'])

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Customers_mean', 'values_mean', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year', 'competiter']


In [164]:
print('training data processed')

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.2,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 20

training data processed


In [165]:
'''
# for custom-feature
features=['Store',
 'CompetitionDistance',
 'Promo',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'SchoolHoliday',
 'StoreType',
 'Assortment',
 'StateHoliday',
 'DayOfWeek',
 'month',
 'day',
 'year',
 'competiter',
 'Customers_mean',
 'values_mean']
 '''

"\n# for custom-feature\nfeatures=['Store',\n 'CompetitionDistance',\n 'Promo',\n 'Promo2',\n 'Promo2SinceWeek',\n 'Promo2SinceYear',\n 'SchoolHoliday',\n 'StoreType',\n 'Assortment',\n 'StateHoliday',\n 'DayOfWeek',\n 'month',\n 'day',\n 'year',\n 'competiter',\n 'Customers_mean',\n 'values_mean']\n "

In [166]:
def train_valid_data(y, train, test_size=0.012, random_state=10):
    print("Train a XGBoost model")
    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = np.log1p(X_train[y])
    y_valid = np.log1p(X_valid[y])
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)
    return dtrain, dvalid

In [167]:
dtrain, dvalid = train_valid_data('Sales', train)

Train a XGBoost model


In [168]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

Will train until eval error hasn't decreased in 100 rounds.
[0]	train-rmspe:0.998451	eval-rmspe:0.998501
[1]	train-rmspe:0.993920	eval-rmspe:0.994115
[2]	train-rmspe:0.982825	eval-rmspe:0.983375
[3]	train-rmspe:0.960993	eval-rmspe:0.962238
[4]	train-rmspe:0.925075	eval-rmspe:0.927457
[5]	train-rmspe:0.874175	eval-rmspe:0.877467
[6]	train-rmspe:0.809443	eval-rmspe:0.813519
[7]	train-rmspe:0.734552	eval-rmspe:0.739144
[8]	train-rmspe:0.654454	eval-rmspe:0.659401
[9]	train-rmspe:0.573782	eval-rmspe:0.579651
[10]	train-rmspe:0.496551	eval-rmspe:0.501682
[11]	train-rmspe:0.425469	eval-rmspe:0.429664
[12]	train-rmspe:0.362175	eval-rmspe:0.364167
[13]	train-rmspe:0.307103	eval-rmspe:0.308227
[14]	train-rmspe:0.260393	eval-rmspe:0.259570
[15]	train-rmspe:0.221436	eval-rmspe:0.217085
[16]	train-rmspe:0.190273	eval-rmspe:0.182992
[17]	train-rmspe:0.165729	eval-rmspe:0.158558
[18]	train-rmspe:0.147577	eval-rmspe:0.141244
[19]	train-rmspe:0.129214	eval-rmspe:0.125911


In [169]:
print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

Validating
RMSPE: 0.407004


# submission

In [22]:
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_10_submission.csv", index=False)

# feature importance

In [337]:
create_feature_map(features)
importance = gbm.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)


# Split training data by store

In [170]:
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("data/store.csv")

store =build_sts_features(train, store)

print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

print("group data by storeType and assortment")
storeType=['a','b','c','d']
assortment=['a','c']

#train, test , store

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.2,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 20

for i in xrange(1,10):
        
    print('StoreType=%s' % (i))
    #split the store data
    store_s = store[(store['Store']==i)]
    #store_s = store
    #merge train and test data
    print("Join with store")
    train_s = pd.merge(train, store_s, on='Store')
    test_s = pd.merge(test, store_s, on='Store')
        
    #features
    features = []
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                             'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek',
                             'Promo2SinceYear','Customers_mean','values_mean'])

    print("augment features")
    build_features(features, train_s)
    build_features([], test_s)
    print(features)
        
        
    #training data
    dtrain, dvalid = train_valid_data('Sales', train_s)
            
    #xgb model 
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
    early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=False)
        
    print("Validating")
    yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
    error = rmspe(X_valid.Sales.values, np.expm1(yhat))
    print('StoreType=%s RMSPE: {:.6f}'.format(error)) % (i)

Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
group data by storeType and assortment
StoreType=1
Join with store
augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Customers_mean', 'values_mean', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year', 'competiter']
Train a XGBoost model
Validating
StoreType=1 RMSPE: 0.407004
StoreType=2
Join with store
augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Customers_mean', 'values_mean', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year', 'competit