In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb




In [2]:
def _week_of_year(dates):
    """Return the ISO week number of each entry of the datetime Series ``dates``."""
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        # `.dt.weekofyear` is gone from recent pandas; `.dt.isocalendar()` is
        # its replacement there and yields a nullable unsigned column.
        week = accessor.isocalendar().week
        # Plain signed ints keep the later "week - Promo2SinceWeek" arithmetic
        # safe; fall back to float (NaN) only when some dates are missing.
        if week.isna().any():
            return week.astype('float64')
        return week.astype('int64')
    return accessor.weekofyear


def build_features(features, data):
    """Add the engineered model columns to ``data`` and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns the model should use.
    data : pandas.DataFrame
        Frame already merged with the store table. It must contain ``Open``,
        ``Date`` (datetime), ``StoreType``, ``Assortment``, ``StateHoliday``,
        ``PromoInterval`` and the ``CompetitionOpenSince*`` /
        ``Promo2Since*`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with missing values filled and the columns
        ``Year``, ``Month``, ``Day``, ``DayOfWeek``, ``WeekOfYear``,
        ``CompetitionOpen``, ``PromoOpen``, ``monthStr`` and ``IsPromoMonth``
        added or overwritten.
    """
    # remove NaNs
    # A missing Open flag means "assume the store is open". This has to happen
    # before the blanket fill below, which would otherwise turn it into 0.
    data.loc[data.Open.isnull(), 'Open'] = 1
    # A missing promo interval means "no promo months"; use '' rather than 0
    # so the column stays purely textual.
    data['PromoInterval'] = data['PromoInterval'].fillna('')
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the encoded column back explicitly: an in-place replace on
        # `data.<column>` only touches a temporary under copy-on-write.
        # Values without a mapping (e.g. the 0 used for missing) pass through.
        data[column] = data[column].map(lambda code: mappings.get(code, code))

    # Calendar parts of the sales date. DayOfWeek is overwritten with the
    # pandas convention (Monday == 0).
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = _week_of_year(data.Date)

    # CompetitionOpen and PromoOpen from
    # https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Time since the competitor opened, in months.
    # NOTE(review): rows whose CompetitionOpenSinceYear was missing (now 0) get
    # a very large value here (about 12 * Year). Unlike PromoOpen below this is
    # not reset to 0 -- confirm that acting as a "missing" sentinel is intended.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)

    # Time since Promo2 started, in months (weeks / 4), never negative and 0
    # for stores without a Promo2 start year.
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # 'Sept' (not 'Sep') is kept exactly as mapped; it has to match the
    # spelling used inside PromoInterval.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # Callers that already filled missing intervals with 0 are still accepted.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            promo_months = interval.split(',')
            in_promo = (data.PromoInterval == interval) & data.monthStr.isin(promo_months)
            data.loc[in_promo, 'IsPromoMonth'] = 1

    return data

In [3]:
print("Load the training, test and store data using pandas")
# Column dtypes forced while parsing train.csv / test.csv; the two holiday
# and promo-interval text columns are read as strings, the rest as integers.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(int),
    PromoInterval=np.dtype(str),
)
# The date column is selected by position: third column in train.csv,
# fourth in test.csv.
train = pd.read_csv("./train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("./test.csv", dtype=types, parse_dates=[3])
# store.csv is read with pandas' default dtype inference.
store = pd.read_csv("./store.csv")

Load the training, test and store data using pandas


In [4]:
print("Assume store open, if not provided")
# Only the Open flag gets the "assume open" default. A frame-wide fillna(1)
# would also write 1 into any other column that happens to have gaps.
test['Open'] = test['Open'].fillna(1)

Assume store open, if not provided


In [5]:
print("Consider only open stores for training. Closed stores wont count into the score.")
is_open = train["Open"] != 0
train = train.loc[is_open]
print("Use only Sales bigger then zero")
has_sales = train["Sales"] > 0
train = train.loc[has_sales]

Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero


In [6]:
print("Join with store")
# Attach the per-store attributes to every train / test row (inner join on Store).
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

Join with store


In [7]:
# Names of the model input columns; filled in by build_features.
features = list()

In [8]:
print("augment features")
# The training pass fills the shared `features` list ...
train = build_features(features, train)
# ... the test pass gets a throw-away list so the names are not added twice.
test = build_features(list(), test)
print(features)


augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']


In [9]:
# Show the container type of `features`.
type(features)

list

In [10]:
print('training data processed')

def rmspe(y, yhat):
    """Root mean squared percentage error of ``yhat`` against ``y``.

    Parameters
    ----------
    y : array-like
        Actual values.
    yhat : array-like
        Predicted values, broadcastable against ``y``.

    Returns
    -------
    float
        ``sqrt(mean(((y - yhat) / y) ** 2))`` taken over the entries where
        ``y`` is non-zero. Entries with ``y == 0`` have no defined percentage
        error and are skipped instead of turning the result into inf/nan.
        If no entry of ``y`` is non-zero the result is NaN.
    """
    y, yhat = np.broadcast_arrays(np.asarray(y), np.asarray(yhat))
    scored = y != 0
    if not scored.any():
        return np.nan
    relative_error = (y[scored] - yhat[scored]) / y[scored]
    return np.sqrt(np.mean(relative_error ** 2))

def rmspe_xg(yhat, y):
    """Evaluation callback for xgboost: RMSPE on the original sales scale.

    ``yhat`` holds the predictions and ``y`` is an object exposing
    ``get_label()``. Both are mapped back with ``expm1`` before scoring.
    Returns the pair ``("rmspe", value)``.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

training data processed


In [11]:
print("Train xgboost model")

# Booster settings passed to xgb.train.
# NOTE(review): XGBoost's documented key for the thread count is "nthread";
# "thread" is presumably ignored. Confirm before renaming it, because that
# would change how many cores training uses.
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.1,
          "max_depth": 10,
          "subsample": 0.85,
          "colsample_bytree": 0.4,
          "min_child_weight": 6,
          "silent": 1,
          "thread": 1,
          "seed": 42
          }
# Upper bound on boosting rounds.
num_boost_round = 1000

print("Train a XGBoost model")
# Hold out 1.2% of the rows as a validation set (fixed seed for repeatability).
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)

# The model is fitted on log1p(Sales); predictions are mapped back with expm1.
y_train, y_valid = (np.log1p(part.Sales) for part in (X_train, X_valid))
dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

# Datasets reported on after every boosting round.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

Train xgboost model
Train a XGBoost model


In [12]:
# Peek at the first rows of the training split (all columns).
X_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
167746,224,5,2014-03-01,6561,526,1,0,0,0,4,...,2013.0,"Jan,Apr,Jul,Oct",2014,3,1,9,24171.0,14.0,Mar,0
543533,721,4,2015-02-13,8397,863,1,0,0,0,1,...,2012.0,"Mar,Jun,Sept,Dec",2015,2,13,7,29.0,32.25,Feb,0
610175,808,2,2014-02-19,7571,545,1,1,0,1,1,...,2009.0,"Feb,May,Aug,Nov",2014,2,19,8,24170.0,54.25,Feb,1
167035,223,0,2013-12-09,6974,779,1,0,0,0,4,...,2011.0,"Jan,Apr,Jul,Oct",2013,12,9,50,218.0,29.75,Dec,0
164290,220,1,2015-04-07,4941,676,1,0,0,1,1,...,0.0,,2015,4,7,15,79.0,0.0,Apr,0


In [13]:
# Same rows, restricted to the columns that are fed to the model.
X_train[features].head()

Unnamed: 0,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
167746,224,7930.0,0,1,0,4,3,0,5,3,1,2014,9,24171.0,14.0,0
543533,721,3590.0,0,1,0,1,3,0,4,2,13,2015,7,29.0,32.25,0
610175,808,18620.0,1,1,1,1,1,0,2,2,19,2014,8,24170.0,54.25,1
167035,223,2920.0,0,1,0,4,3,0,0,12,9,2013,50,218.0,29.75,0
164290,220,1000.0,0,0,1,1,1,0,1,4,7,2015,15,79.0,0.0,0


In [14]:
# Show only the first rows; displaying the whole training frame bloats the notebook.
train.head(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,4,2015-07-31,5263,555,1,1,0,1,3,...,0.0,,2015,7,31,31,82.0,0.0,Jul,0
1,1,3,2015-07-30,5020,546,1,1,0,1,3,...,0.0,,2015,7,30,31,82.0,0.0,Jul,0
2,1,2,2015-07-29,4782,523,1,1,0,1,3,...,0.0,,2015,7,29,31,82.0,0.0,Jul,0
3,1,1,2015-07-28,5011,560,1,1,0,1,3,...,0.0,,2015,7,28,31,82.0,0.0,Jul,0
4,1,0,2015-07-27,6102,612,1,1,0,1,3,...,0.0,,2015,7,27,31,82.0,0.0,Jul,0
5,1,5,2015-07-25,4364,500,1,0,0,0,3,...,0.0,,2015,7,25,30,82.0,0.0,Jul,0
6,1,4,2015-07-24,3706,459,1,0,0,0,3,...,0.0,,2015,7,24,30,82.0,0.0,Jul,0
7,1,3,2015-07-23,3769,503,1,0,0,0,3,...,0.0,,2015,7,23,30,82.0,0.0,Jul,0
8,1,2,2015-07-22,3464,463,1,0,0,0,3,...,0.0,,2015,7,22,30,82.0,0.0,Jul,0
9,1,1,2015-07-21,3558,469,1,0,0,0,3,...,0.0,,2015,7,21,30,82.0,0.0,Jul,0


In [15]:
# Display the final list of model input columns.
features

['Store',
 'CompetitionDistance',
 'Promo',
 'Promo2',
 'SchoolHoliday',
 'StoreType',
 'Assortment',
 'StateHoliday',
 'DayOfWeek',
 'Month',
 'Day',
 'Year',
 'WeekOfYear',
 'CompetitionOpen',
 'PromoOpen',
 'IsPromoMonth']

In [16]:
# Fit the booster, scoring both watchlist sets with RMSPE each round and
# stopping once the metric has not improved for 200 rounds.
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=200,
    feval=rmspe_xg,
    verbose_eval=True,
)


[0]	train-rmse:7.44378	eval-rmse:7.44274	train-rmspe:0.999525	eval-rmspe:0.999524
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 200 rounds.
[1]	train-rmse:6.70146	eval-rmse:6.70047	train-rmspe:0.998816	eval-rmspe:0.998814
[2]	train-rmse:6.03373	eval-rmse:6.03293	train-rmspe:0.997531	eval-rmspe:0.997528
[3]	train-rmse:5.4327	eval-rmse:5.43216	train-rmspe:0.995367	eval-rmspe:0.995362
[4]	train-rmse:4.89204	eval-rmse:4.89181	train-rmspe:0.991934	eval-rmspe:0.991928
[5]	train-rmse:4.40584	eval-rmse:4.4058	train-rmspe:0.986785	eval-rmspe:0.986779
[6]	train-rmse:3.96855	eval-rmse:3.9687	train-rmspe:0.979454	eval-rmspe:0.979446
[7]	train-rmse:3.57513	eval-rmse:3.57557	train-rmspe:0.969505	eval-rmspe:0.969498
[8]	train-rmse:3.22056	eval-rmse:3.22117	train-rmspe:0.956675	eval-rmspe:0.956663
[9]	train-rmse:2.90251	eval-rmse:2.90336	train-rmspe:0.940436	eval-rmspe:0.940426
[10]	train-rmse:2.61676	eval-rmse:2.6

[96]	train-rmse:0.187056	eval-rmse:0.189024	train-rmspe:0.250282	eval-rmspe:0.204896
[97]	train-rmse:0.186521	eval-rmse:0.188416	train-rmspe:0.249891	eval-rmspe:0.204287
[98]	train-rmse:0.186164	eval-rmse:0.188099	train-rmspe:0.249296	eval-rmspe:0.203836
[99]	train-rmse:0.184643	eval-rmse:0.186606	train-rmspe:0.247995	eval-rmspe:0.202323
[100]	train-rmse:0.18333	eval-rmse:0.185342	train-rmspe:0.246341	eval-rmspe:0.200917
[101]	train-rmse:0.18312	eval-rmse:0.185182	train-rmspe:0.246085	eval-rmspe:0.200741
[102]	train-rmse:0.181612	eval-rmse:0.183769	train-rmspe:0.244779	eval-rmspe:0.199268
[103]	train-rmse:0.181257	eval-rmse:0.183381	train-rmspe:0.244448	eval-rmspe:0.198871
[104]	train-rmse:0.181177	eval-rmse:0.183276	train-rmspe:0.244287	eval-rmspe:0.198688
[105]	train-rmse:0.179339	eval-rmse:0.18143	train-rmspe:0.242671	eval-rmspe:0.19664
[106]	train-rmse:0.178389	eval-rmse:0.18038	train-rmspe:0.24227	eval-rmspe:0.195405
[107]	train-rmse:0.177732	eval-rmse:0.179659	train-rmspe:0.24178

[192]	train-rmse:0.140873	eval-rmse:0.143617	train-rmspe:0.204489	eval-rmspe:0.154751
[193]	train-rmse:0.139981	eval-rmse:0.142699	train-rmspe:0.20385	eval-rmspe:0.153843
[194]	train-rmse:0.139844	eval-rmse:0.142561	train-rmspe:0.203677	eval-rmspe:0.153671
[195]	train-rmse:0.139036	eval-rmse:0.141799	train-rmspe:0.203003	eval-rmspe:0.152775
[196]	train-rmse:0.139017	eval-rmse:0.141774	train-rmspe:0.202969	eval-rmspe:0.152725
[197]	train-rmse:0.138292	eval-rmse:0.141152	train-rmspe:0.202363	eval-rmspe:0.152058
[198]	train-rmse:0.138195	eval-rmse:0.141062	train-rmspe:0.202292	eval-rmspe:0.151969
[199]	train-rmse:0.138023	eval-rmse:0.140879	train-rmspe:0.202354	eval-rmspe:0.151741
[200]	train-rmse:0.137832	eval-rmse:0.140682	train-rmspe:0.202168	eval-rmspe:0.151546
[201]	train-rmse:0.137012	eval-rmse:0.139808	train-rmspe:0.201264	eval-rmspe:0.150533
[202]	train-rmse:0.136956	eval-rmse:0.139759	train-rmspe:0.201255	eval-rmspe:0.15046
[203]	train-rmse:0.136946	eval-rmse:0.139747	train-rmspe

[288]	train-rmse:0.117999	eval-rmse:0.121211	train-rmspe:0.180434	eval-rmspe:0.129552
[289]	train-rmse:0.117919	eval-rmse:0.121113	train-rmspe:0.180258	eval-rmspe:0.129449
[290]	train-rmse:0.117753	eval-rmse:0.120941	train-rmspe:0.180105	eval-rmspe:0.1292
[291]	train-rmse:0.117454	eval-rmse:0.12066	train-rmspe:0.179892	eval-rmspe:0.128919
[292]	train-rmse:0.117416	eval-rmse:0.120626	train-rmspe:0.179853	eval-rmspe:0.128889
[293]	train-rmse:0.117281	eval-rmse:0.120496	train-rmspe:0.179751	eval-rmspe:0.128772
[294]	train-rmse:0.117067	eval-rmse:0.120253	train-rmspe:0.179609	eval-rmspe:0.128472
[295]	train-rmse:0.116859	eval-rmse:0.12004	train-rmspe:0.179462	eval-rmspe:0.128224
[296]	train-rmse:0.116741	eval-rmse:0.119951	train-rmspe:0.179373	eval-rmspe:0.128146
[297]	train-rmse:0.116487	eval-rmse:0.119716	train-rmspe:0.179184	eval-rmspe:0.127875
[298]	train-rmse:0.116373	eval-rmse:0.119622	train-rmspe:0.179103	eval-rmspe:0.127772
[299]	train-rmse:0.116275	eval-rmse:0.119548	train-rmspe:0

[384]	train-rmse:0.107125	eval-rmse:0.110921	train-rmspe:0.163073	eval-rmspe:0.117949
[385]	train-rmse:0.107056	eval-rmse:0.110868	train-rmspe:0.162999	eval-rmspe:0.117899
[386]	train-rmse:0.106876	eval-rmse:0.110679	train-rmspe:0.162857	eval-rmspe:0.117711
[387]	train-rmse:0.106853	eval-rmse:0.110665	train-rmspe:0.162834	eval-rmspe:0.117695
[388]	train-rmse:0.106821	eval-rmse:0.110647	train-rmspe:0.162934	eval-rmspe:0.117677
[389]	train-rmse:0.106683	eval-rmse:0.110525	train-rmspe:0.162831	eval-rmspe:0.117547
[390]	train-rmse:0.106579	eval-rmse:0.110418	train-rmspe:0.162755	eval-rmspe:0.117422
[391]	train-rmse:0.106481	eval-rmse:0.110332	train-rmspe:0.162679	eval-rmspe:0.117331
[392]	train-rmse:0.106311	eval-rmse:0.110145	train-rmspe:0.162537	eval-rmspe:0.117101
[393]	train-rmse:0.106001	eval-rmse:0.109806	train-rmspe:0.162181	eval-rmspe:0.116695
[394]	train-rmse:0.105791	eval-rmse:0.109611	train-rmspe:0.162016	eval-rmspe:0.116489
[395]	train-rmse:0.105606	eval-rmse:0.109422	train-rms

[480]	train-rmse:0.100815	eval-rmse:0.105491	train-rmspe:0.156239	eval-rmspe:0.111801
[481]	train-rmse:0.100812	eval-rmse:0.105489	train-rmspe:0.156226	eval-rmspe:0.111803
[482]	train-rmse:0.100739	eval-rmse:0.105441	train-rmspe:0.156182	eval-rmspe:0.11175
[483]	train-rmse:0.100697	eval-rmse:0.105423	train-rmspe:0.156111	eval-rmspe:0.111716
[484]	train-rmse:0.100658	eval-rmse:0.105411	train-rmspe:0.156069	eval-rmspe:0.111703
[485]	train-rmse:0.100471	eval-rmse:0.105244	train-rmspe:0.155933	eval-rmspe:0.111543
[486]	train-rmse:0.10035	eval-rmse:0.105143	train-rmspe:0.155854	eval-rmspe:0.111434
[487]	train-rmse:0.10032	eval-rmse:0.10513	train-rmspe:0.155762	eval-rmspe:0.111408
[488]	train-rmse:0.100276	eval-rmse:0.1051	train-rmspe:0.155732	eval-rmspe:0.111361
[489]	train-rmse:0.100256	eval-rmse:0.105078	train-rmspe:0.155716	eval-rmspe:0.111337
[490]	train-rmse:0.100196	eval-rmse:0.105023	train-rmspe:0.155588	eval-rmspe:0.111272
[491]	train-rmse:0.100186	eval-rmse:0.105025	train-rmspe:0.1

[576]	train-rmse:0.095903	eval-rmse:0.101513	train-rmspe:0.147066	eval-rmspe:0.1073
[577]	train-rmse:0.095802	eval-rmse:0.101447	train-rmspe:0.146913	eval-rmspe:0.107221
[578]	train-rmse:0.095765	eval-rmse:0.101428	train-rmspe:0.146875	eval-rmspe:0.107205
[579]	train-rmse:0.095696	eval-rmse:0.101366	train-rmspe:0.146249	eval-rmspe:0.107115
[580]	train-rmse:0.095683	eval-rmse:0.101359	train-rmspe:0.146236	eval-rmspe:0.107112
[581]	train-rmse:0.095677	eval-rmse:0.101361	train-rmspe:0.146235	eval-rmspe:0.107115
[582]	train-rmse:0.095612	eval-rmse:0.10133	train-rmspe:0.146276	eval-rmspe:0.107062
[583]	train-rmse:0.095594	eval-rmse:0.101312	train-rmspe:0.14624	eval-rmspe:0.107028
[584]	train-rmse:0.09557	eval-rmse:0.101303	train-rmspe:0.146185	eval-rmspe:0.107015
[585]	train-rmse:0.09556	eval-rmse:0.101299	train-rmspe:0.146174	eval-rmspe:0.107011
[586]	train-rmse:0.095489	eval-rmse:0.101222	train-rmspe:0.144932	eval-rmspe:0.106917
[587]	train-rmse:0.095477	eval-rmse:0.10122	train-rmspe:0.14

[672]	train-rmse:0.091865	eval-rmse:0.098168	train-rmspe:0.137229	eval-rmspe:0.103497
[673]	train-rmse:0.091805	eval-rmse:0.098085	train-rmspe:0.137185	eval-rmspe:0.103403
[674]	train-rmse:0.091798	eval-rmse:0.098083	train-rmspe:0.137169	eval-rmspe:0.103395
[675]	train-rmse:0.091776	eval-rmse:0.098077	train-rmspe:0.137139	eval-rmspe:0.103387
[676]	train-rmse:0.091768	eval-rmse:0.098068	train-rmspe:0.137133	eval-rmspe:0.103378
[677]	train-rmse:0.091631	eval-rmse:0.097907	train-rmspe:0.13703	eval-rmspe:0.103178
[678]	train-rmse:0.091614	eval-rmse:0.097894	train-rmspe:0.137003	eval-rmspe:0.103162
[679]	train-rmse:0.0916	eval-rmse:0.097881	train-rmspe:0.136998	eval-rmspe:0.103146
[680]	train-rmse:0.091591	eval-rmse:0.097876	train-rmspe:0.136992	eval-rmspe:0.103142
[681]	train-rmse:0.091582	eval-rmse:0.097864	train-rmspe:0.13698	eval-rmspe:0.103129
[682]	train-rmse:0.091519	eval-rmse:0.097803	train-rmspe:0.136636	eval-rmspe:0.10305
[683]	train-rmse:0.091487	eval-rmse:0.097785	train-rmspe:0.

[768]	train-rmse:0.088867	eval-rmse:0.095759	train-rmspe:0.131229	eval-rmspe:0.100867
[769]	train-rmse:0.088804	eval-rmse:0.0957	train-rmspe:0.131181	eval-rmspe:0.100808
[770]	train-rmse:0.088782	eval-rmse:0.095688	train-rmspe:0.131166	eval-rmspe:0.100799
[771]	train-rmse:0.088768	eval-rmse:0.095674	train-rmspe:0.13115	eval-rmspe:0.100784
[772]	train-rmse:0.088741	eval-rmse:0.095658	train-rmspe:0.131127	eval-rmspe:0.100766
[773]	train-rmse:0.088736	eval-rmse:0.09565	train-rmspe:0.13113	eval-rmspe:0.100761
[774]	train-rmse:0.088711	eval-rmse:0.095626	train-rmspe:0.131085	eval-rmspe:0.100737
[775]	train-rmse:0.088681	eval-rmse:0.095608	train-rmspe:0.130955	eval-rmspe:0.10072
[776]	train-rmse:0.088643	eval-rmse:0.09559	train-rmspe:0.130892	eval-rmspe:0.1007
[777]	train-rmse:0.088627	eval-rmse:0.095587	train-rmspe:0.130751	eval-rmspe:0.100698
[778]	train-rmse:0.088589	eval-rmse:0.095557	train-rmspe:0.130163	eval-rmspe:0.100678
[779]	train-rmse:0.088565	eval-rmse:0.095531	train-rmspe:0.1301

[864]	train-rmse:0.086244	eval-rmse:0.093918	train-rmspe:0.127862	eval-rmspe:0.098901
[865]	train-rmse:0.086174	eval-rmse:0.093861	train-rmspe:0.127673	eval-rmspe:0.098826
[866]	train-rmse:0.086162	eval-rmse:0.09385	train-rmspe:0.127651	eval-rmspe:0.098814
[867]	train-rmse:0.086157	eval-rmse:0.093842	train-rmspe:0.12762	eval-rmspe:0.098804
[868]	train-rmse:0.086135	eval-rmse:0.093829	train-rmspe:0.127591	eval-rmspe:0.098791
[869]	train-rmse:0.08603	eval-rmse:0.093735	train-rmspe:0.127431	eval-rmspe:0.098694
[870]	train-rmse:0.086022	eval-rmse:0.093729	train-rmspe:0.127424	eval-rmspe:0.098688
[871]	train-rmse:0.086018	eval-rmse:0.093724	train-rmspe:0.127424	eval-rmspe:0.098685
[872]	train-rmse:0.086013	eval-rmse:0.093716	train-rmspe:0.127439	eval-rmspe:0.098676
[873]	train-rmse:0.086001	eval-rmse:0.093721	train-rmspe:0.127414	eval-rmspe:0.098683
[874]	train-rmse:0.085968	eval-rmse:0.093697	train-rmspe:0.127383	eval-rmspe:0.098661
[875]	train-rmse:0.08594	eval-rmse:0.093686	train-rmspe:0

[960]	train-rmse:0.083775	eval-rmse:0.092231	train-rmspe:0.121519	eval-rmspe:0.097159
[961]	train-rmse:0.083714	eval-rmse:0.092172	train-rmspe:0.121472	eval-rmspe:0.097094
[962]	train-rmse:0.083709	eval-rmse:0.092166	train-rmspe:0.121447	eval-rmspe:0.097092
[963]	train-rmse:0.083695	eval-rmse:0.092164	train-rmspe:0.121434	eval-rmspe:0.097087
[964]	train-rmse:0.083686	eval-rmse:0.092153	train-rmspe:0.121417	eval-rmspe:0.097076
[965]	train-rmse:0.083655	eval-rmse:0.092133	train-rmspe:0.121391	eval-rmspe:0.097056
[966]	train-rmse:0.083625	eval-rmse:0.092103	train-rmspe:0.12137	eval-rmspe:0.097018
[967]	train-rmse:0.083617	eval-rmse:0.092102	train-rmspe:0.121395	eval-rmspe:0.097015
[968]	train-rmse:0.083611	eval-rmse:0.092101	train-rmspe:0.121389	eval-rmspe:0.097014
[969]	train-rmse:0.083594	eval-rmse:0.092097	train-rmspe:0.121334	eval-rmspe:0.097011
[970]	train-rmse:0.083552	eval-rmse:0.092086	train-rmspe:0.12126	eval-rmspe:0.097004
[971]	train-rmse:0.083525	eval-rmse:0.092085	train-rmspe

In [17]:
print("Validating")
valid_matrix = xgb.DMatrix(X_valid[features])
yhat = gbm.predict(valid_matrix)
# Predictions are on the log1p scale; map them back before scoring.
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
# Still on the log1p scale; converted when the submission is built.
test_probs = gbm.predict(dtest)

Validating
RMSPE: 0.096743
Make predictions on the test set


In [18]:
# One row per test Id, with the prediction mapped back from the log1p scale.
submission_columns = {"Id": test["Id"], 'Sales': np.expm1(test_probs)}
result = pd.DataFrame(submission_columns)
result.to_csv("xgboost_submission.csv", index=False)