In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt


In [61]:
# Explicit dtypes handed to pd.read_csv through its `dtype` argument.
# NOTE(review): only StateHoliday and SchoolHoliday show up in the train/test
# previews in this notebook; the other keys presumably belong to a store-level
# file — confirm they are still needed.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(float),
    PromoInterval=np.dtype(str),
)

### Reading the csv file

In [62]:
# Load the training set. The date column is selected by name instead of by
# position so the parse keeps targeting "Date" even if the column order of the
# CSV changes; `types` pins the dtypes of the columns it lists.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine — consider a configurable data directory.
train = pd.read_csv("C:/Users/bbalakrishna/Downloads/rossmann-store-sales/train.csv", parse_dates=["Date"], dtype=types)

In [63]:
# Preview the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1.0
1,2,5,2015-07-31,6064,625,1,1,0,1.0
2,3,5,2015-07-31,8314,821,1,1,0,1.0
3,4,5,2015-07-31,13995,1498,1,1,0,1.0
4,5,5,2015-07-31,4822,559,1,1,0,1.0


### Check if any null values are present in the data

In [64]:
# Number of missing values per column. DataFrame.isnull().sum() is vectorised,
# whereas applying the builtin sum() to each column walks every element in Python.
print(train.isnull().sum())

Exception ignored in: <function DMatrix.__del__ at 0x000002C35DB42E18>
Traceback (most recent call last):
  File "C:\Users\bbalakrishna\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64


In [38]:
# Frequency of each StateHoliday code in the training data.
train["StateHoliday"].value_counts()

0    986159
a     20260
b      6690
c      4100
Name: StateHoliday, dtype: int64

#### StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. a = public holiday, b = Easter holiday, c = Christmas, 0 = None

In [67]:
### machine learning algorithms only work with numbers so convert characters to numbers
# StateHoliday is read as strings ('0', 'a', 'b', 'c'); map each code to an integer.
# NOTE(review): 'd' is not among the codes listed in the StateHoliday value
# counts above; the entry is kept unchanged in case other data uses it.
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
# Assign the result back to the frame instead of calling replace(inplace=True)
# on the attribute-accessed column: under pandas copy-on-write the in-place
# form may modify a temporary and leave `train` untouched. pd.to_numeric then
# guarantees a numeric dtype (newer pandas no longer downcasts after replace)
# and raises if an unmapped string code is still present.
train['StateHoliday'] = pd.to_numeric(train['StateHoliday'].replace(mappings))

In [68]:
## replace every remaining missing value in the training frame with 0
train = train.fillna(0)

In [69]:
# Derive calendar features from the parsed Date column.
train['Year'] = train.Date.dt.year
train['Month'] = train.Date.dt.month
train['Day'] = train.Date.dt.day
# NOTE: this overwrites the DayOfWeek column loaded from the CSV (5 for
# 2015-07-31 in the first preview) with pandas' Monday=0 numbering (4 for the
# same date).
train['DayOfWeek'] = train.Date.dt.dayofweek
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement (requires pandas >= 1.1). It returns
# a nullable UInt32 column, so cast to int64 to keep the plain integer dtype
# the old accessor produced.
train['WeekOfYear'] = train.Date.dt.isocalendar().week.astype('int64')

In [70]:
# Preview the training frame again, now including the derived calendar columns.
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear
0,1,4,2015-07-31,5263,555,1,1,0,1.0,2015,7,31,31
1,2,4,2015-07-31,6064,625,1,1,0,1.0,2015,7,31,31
2,3,4,2015-07-31,8314,821,1,1,0,1.0,2015,7,31,31
3,4,4,2015-07-31,13995,1498,1,1,0,1.0,2015,7,31,31
4,5,4,2015-07-31,4822,559,1,1,0,1.0,2015,7,31,31


In [71]:
# Model inputs: calendar fields derived from Date plus the encoded holiday flag.
features = [
    'Year',
    'Month',
    'Day',
    'DayOfWeek',
    'WeekOfYear',
    'StateHoliday',
]

### preparing the test data

In [74]:
# Load the test set. The date column is selected by name instead of by
# position: the test file has an extra leading Id column, so a positional index
# differs between files, while the name "Date" does not.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine — consider a configurable data directory.
test = pd.read_csv("C:/Users/bbalakrishna/Downloads/rossmann-store-sales/test.csv", parse_dates=["Date"], dtype=types)

In [75]:
### machine learning algorithms only work with numbers so convert characters to numbers
# Apply to the test frame the same steps used on the training frame: encode
# StateHoliday, fill missing values with 0, derive calendar features from Date.
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: under pandas copy-on-write the in-place form may
# modify a temporary and leave `test` untouched. pd.to_numeric guarantees a
# numeric dtype and raises if an unmapped string code is still present.
test['StateHoliday'] = pd.to_numeric(test['StateHoliday'].replace(mappings))
test.fillna(0, inplace=True)
test['Year'] = test.Date.dt.year
test['Month'] = test.Date.dt.month
test['Day'] = test.Date.dt.day
# NOTE: overwrites the DayOfWeek column from the CSV with pandas' Monday=0 numbering.
test['DayOfWeek'] = test.Date.dt.dayofweek
# Series.dt.weekofyear was removed in pandas 2.0; dt.isocalendar().week is its
# replacement (requires pandas >= 1.1). Cast the nullable UInt32 result to
# int64 to keep the plain integer dtype the old accessor produced.
test['WeekOfYear'] = test.Date.dt.isocalendar().week.astype('int64')

In [76]:
# Preview the preprocessed test frame, including the derived calendar columns.
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear
0,1,1,3,2015-09-17,1.0,1,0,0.0,2015,9,17,38
1,2,3,3,2015-09-17,1.0,1,0,0.0,2015,9,17,38
2,3,7,3,2015-09-17,1.0,1,0,0.0,2015,9,17,38
3,4,8,3,2015-09-17,1.0,1,0,0.0,2015,9,17,38
4,5,9,3,2015-09-17,1.0,1,0,0.0,2015,9,17,38


In [77]:
# Preview only the columns that will be fed to the model.
test[features].head()

Unnamed: 0,Year,Month,Day,DayOfWeek,WeekOfYear,StateHoliday
0,2015,9,17,3,38,0
1,2015,9,17,3,38,0
2,2015,9,17,3,38,0
3,2015,9,17,3,38,0
4,2015,9,17,3,38,0


In [78]:
print('training data processed')

# Booster configuration passed to xgb.train.
# NOTE(review): newer XGBoost releases spell this objective "reg:squarederror"
# and use "verbosity" in place of "silent" — confirm against the installed
# version before changing; the values below are kept exactly as they were.
params = dict(
    objective="reg:linear",
    booster="gbtree",
    eta=0.3,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
    seed=1301,
)
# Upper bound on boosting rounds.
num_boost_round = 300

training data processed


In [53]:
import xgboost as xgb

In [81]:
# Preview the training rows restricted to the model input columns.
train[features].head()

Unnamed: 0,Year,Month,Day,DayOfWeek,WeekOfYear,StateHoliday
0,2015,7,31,4,31,0
1,2015,7,31,4,31,0
2,2015,7,31,4,31,0
3,2015,7,31,4,31,0
4,2015,7,31,4,31,0


#### for building the model we will only use features = ['Year','Month','Day','DayOfWeek','WeekOfYear','StateHoliday']

In [83]:

# Randomly hold out 1.2% of the training rows as a validation set; the fixed
# random_state makes the split repeatable.
print("Train a XGBoost model")
X_train, X_valid, y_train, y_valid = train_test_split(
    train[features],
    train.Sales,
    test_size=0.012,
    random_state=10,
)

# X_train / X_valid were split from train[features], so they already hold
# exactly the model input columns; Sales is attached as the label.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

Train a XGBoost model


In [84]:
# Report RMSE on both sets every round. Training stops early once the metric
# on the last watchlist entry ('eval') has not improved for 100 rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=True,
)

[0]	train-rmse:5503.89	eval-rmse:5620.58
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[1]	train-rmse:4507.16	eval-rmse:4625.07
[2]	train-rmse:3671.76	eval-rmse:3800.2
[3]	train-rmse:3173.55	eval-rmse:3308.45
[4]	train-rmse:2897.38	eval-rmse:3036.06
[5]	train-rmse:2748.45	eval-rmse:2889.85
[6]	train-rmse:2666.04	eval-rmse:2807.01
[7]	train-rmse:2618.2	eval-rmse:2758.79
[8]	train-rmse:2597.4	eval-rmse:2735.31
[9]	train-rmse:2574.87	eval-rmse:2710.87
[10]	train-rmse:2560.32	eval-rmse:2695.06
[11]	train-rmse:2554.44	eval-rmse:2688.68
[12]	train-rmse:2542.66	eval-rmse:2676.9
[13]	train-rmse:2541	eval-rmse:2674.72
[14]	train-rmse:2535.38	eval-rmse:2669.03
[15]	train-rmse:2533.2	eval-rmse:2666.42
[16]	train-rmse:2532.67	eval-rmse:2665.54
[17]	train-rmse:2530.45	eval-rmse:2662.49
[18]	train-rmse:2530.15	eval-rmse:2662.06
[19]	train-rmse:2529.16	eval-rmse:2660.89
[20]	train-rmse:2529.01	eval-rmse:

[191]	train-rmse:2524.42	eval-rmse:2655.76
[192]	train-rmse:2524.42	eval-rmse:2655.75
[193]	train-rmse:2524.42	eval-rmse:2655.74
[194]	train-rmse:2524.42	eval-rmse:2655.77
[195]	train-rmse:2524.42	eval-rmse:2655.69
[196]	train-rmse:2524.42	eval-rmse:2655.66
[197]	train-rmse:2524.42	eval-rmse:2655.72
[198]	train-rmse:2524.42	eval-rmse:2655.74
[199]	train-rmse:2524.42	eval-rmse:2655.71
[200]	train-rmse:2524.42	eval-rmse:2655.63
[201]	train-rmse:2524.42	eval-rmse:2655.65
[202]	train-rmse:2524.42	eval-rmse:2655.67
[203]	train-rmse:2524.42	eval-rmse:2655.66
[204]	train-rmse:2524.42	eval-rmse:2655.66
[205]	train-rmse:2524.42	eval-rmse:2655.67
[206]	train-rmse:2524.42	eval-rmse:2655.71
[207]	train-rmse:2524.42	eval-rmse:2655.74
[208]	train-rmse:2524.42	eval-rmse:2655.72
[209]	train-rmse:2524.42	eval-rmse:2655.73
[210]	train-rmse:2524.42	eval-rmse:2655.85
[211]	train-rmse:2524.42	eval-rmse:2655.89
[212]	train-rmse:2524.42	eval-rmse:2655.92
[213]	train-rmse:2524.42	eval-rmse:2655.92
[214]	train

In [86]:
# Re-check the model input columns of the test frame right before predicting
# (same preview as shown earlier in the notebook).
test[features].head()

Unnamed: 0,Year,Month,Day,DayOfWeek,WeekOfYear,StateHoliday
0,2015,9,17,3,38,0
1,2015,9,17,3,38,0
2,2015,9,17,3,38,0
3,2015,9,17,3,38,0
4,2015,9,17,3,38,0


### Making predictions for the hold-out dataset

In [87]:
# Wrap the test features for XGBoost and predict with the trained booster.
# Despite its name, test_probs holds predicted Sales values (the objective is a
# regression), not probabilities.
# NOTE(review): early stopping was enabled during training; depending on the
# XGBoost version, predict() may use all boosted rounds rather than the best
# one — confirm.
dtest = xgb.DMatrix(data=test[features])
test_probs = gbm.predict(dtest)

##### Forecasted values for the test data

In [89]:
# Show the predicted Sales values for the test rows (numpy abbreviates long arrays).
print(test_probs)

[6685.429  6685.429  6685.429  ... 6368.8457 6368.8457 6368.8457]


In [100]:
# Column list for the training export: the model inputs plus the Sales target.
# NOTE: this rebinds `features`, so from here on the list contains the target;
# re-running an earlier cell that uses `features` as model inputs without
# re-running its original definition would feed Sales to the model.
features = [
    'Year',
    'Month',
    'Day',
    'DayOfWeek',
    'WeekOfYear',
    'StateHoliday',
    'Sales',
]

In [102]:
# Preview the columns that will be written to the training export.
train[features].head()

Unnamed: 0,Year,Month,Day,DayOfWeek,WeekOfYear,StateHoliday,Sales
0,2015,7,31,4,31,0,5263
1,2015,7,31,4,31,0,6064
2,2015,7,31,4,31,0,8314
3,2015,7,31,4,31,0,13995
4,2015,7,31,4,31,0,4822


In [104]:
# Write the selected training columns to train.csv in the current working
# directory, without the index column.
train.to_csv('train.csv', columns=features, index=False)


In [111]:
# Write the model input columns of the test frame to test.csv in the current
# working directory, without the index column.
test.to_csv(
    'test.csv',
    columns=['Year','Month','Day','DayOfWeek','WeekOfYear','StateHoliday'],
    index=False,
)