In [3]:
cd "/Users/2015/Dropbox/Data Science/NYC Data Science Academy/class-projects/Rossmann"

/Users/2015/Dropbox/Data Science/NYC Data Science Academy/class-projects/Rossmann


In [4]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
import xgboost as xgb

In [5]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe


In [6]:
# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    #
    #features.append('StateHoliday')
    #data.loc[data['StateHoliday'] == 'a', 'StateHoliday'] = '1'
    #data.loc[data['StateHoliday'] == 'b', 'StateHoliday'] = '2'
    #data.loc[data['StateHoliday'] == 'c', 'StateHoliday'] = '3'
    #data['StateHoliday'] = data['StateHoliday'].astype(float)

    features.append('DayOfWeek')
    features.append('month')
    features.append('day')
    features.append('year')
    data['year'] = data.Date.apply(lambda x: x.split('-')[0])
    data['year'] = data['year'].astype(float)
    data['month'] = data.Date.apply(lambda x: x.split('-')[1])
    data['month'] = data['month'].astype(float)
    data['day'] = data.Date.apply(lambda x: x.split('-')[2])
    data['day'] = data['day'].astype(float)

    features.append('StoreType')
    data.loc[data['StoreType'] == 'a', 'StoreType'] = '1'
    data.loc[data['StoreType'] == 'b', 'StoreType'] = '2'
    data.loc[data['StoreType'] == 'c', 'StoreType'] = '3'
    data.loc[data['StoreType'] == 'd', 'StoreType'] = '4'
    data['StoreType'] = data['StoreType'].astype(float)

    features.append('Assortment')
    data.loc[data['Assortment'] == 'a', 'Assortment'] = '1'
    data.loc[data['Assortment'] == 'b', 'Assortment'] = '2'
    data.loc[data['Assortment'] == 'c', 'Assortment'] = '3'
    data['Assortment'] = data['Assortment'].astype(float)

In [13]:
print("Load the training, test and store data using pandas")
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
store = pd.read_csv("data/store.csv")

Load the training, test and store data using pandas


In [16]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1,1,0,0
1,2,3,4,2015-09-17,1,1,0,0
2,3,7,4,2015-09-17,1,1,0,0
3,4,8,4,2015-09-17,1,1,0,0
4,5,9,4,2015-09-17,1,1,0,0


In [20]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270,9,2008,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270,9,2008,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270,9,2008,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270,9,2008,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270,9,2008,0,,,


In [10]:
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270,9,2008,0,,,
1,2,a,a,570,11,2007,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130,12,2006,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620,9,2009,0,,,
4,5,a,a,29910,4,2015,0,,,


In [17]:
print("Assume store open, if not provided")
test.fillna(1, inplace=True)

Assume store open, if not provided


In [18]:
print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]

Consider only open stores for training. Closed stores wont count into the score.


In [19]:
print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

Join with store


In [22]:
features = []

In [23]:
print("augment features")
build_features(features, train)

augment features


In [24]:
build_features([], test)
print(features)

['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'month', 'day', 'year', 'StoreType', 'Assortment']


In [25]:
params = {"objective": "reg:linear",
          "eta": 0.20,
          "max_depth": 8,
          "subsample": 0.8,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 700

In [26]:
print("Train a XGBoost model")
val_size = 100000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=200, feval=rmspe_xg, verbose_eval=True)


Will train until train error hasn't decreased in 200 rounds.
[0]	eval-rmspe:0.998634	train-rmspe:0.998663
[1]	eval-rmspe:0.994605	train-rmspe:0.994640
[2]	eval-rmspe:0.984451	train-rmspe:0.984495
[3]	eval-rmspe:0.963912	train-rmspe:0.963985
[4]	eval-rmspe:0.929315	train-rmspe:0.929471
[5]	eval-rmspe:0.879489	train-rmspe:0.879799
[6]	eval-rmspe:0.815675	train-rmspe:0.816339
[7]	eval-rmspe:0.741940	train-rmspe:0.743107
[8]	eval-rmspe:0.663296	train-rmspe:0.665249
[9]	eval-rmspe:0.585750	train-rmspe:0.588657
[10]	eval-rmspe:0.513523	train-rmspe:0.517957
[11]	eval-rmspe:0.449290	train-rmspe:0.455334
[12]	eval-rmspe:0.396861	train-rmspe:0.405260
[13]	eval-rmspe:0.356037	train-rmspe:0.366731
[14]	eval-rmspe:0.326981	train-rmspe:0.339982
[15]	eval-rmspe:0.307110	train-rmspe:0.322057
[16]	eval-rmspe:0.295846	train-rmspe:0.312423
[17]	eval-rmspe:0.289115	train-rmspe:0.307047
[18]	eval-rmspe:0.285262	train-rmspe:0.303762
[19]	eval-rmspe:0.284062	train-rmspe:0.303726
[20]	eval-rmspe:0.285191	trai

Train a XGBoost model
844391    2013-01-02
Name: Date, dtype: object


[699]	eval-rmspe:0.125196	train-rmspe:0.134565
