Version 1.0.0, Python 2.7.14, conda 4.3.34

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
#import lightgbm as lgb
#from catboost import CatBoostRegressor
#from multiprocessing import *

#import sys
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline

# Record the versions of the core libraries so the run can be reproduced.
for module in (np, pd, xgb):
    print (module.__name__, module.__version__)

('numpy', '1.14.2')
('pandas', u'0.22.0')
('xgboost', '0.7.post3')


Load the data:

In [2]:
# Load the five input tables through the project-local prepare_data helper.
from prepare_data import read_data
sales, items, categories, shops, test = read_data()
# Sanity check: (rows, columns) of the test table.
print(test.shape)

(214200, 3)


Create a data frame with all the information given:

In [3]:
#from prepare_data import create_df
#df = create_df(sales, items, categories, shops)

Use the information given in the sales data:

In [4]:
from prepare_data import make_grid, get_aggregated, join_to_existing

# Key columns that identify one row of the training table.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# NOTE(review): presumably make_grid enumerates the shop/item/month combinations and
# get_aggregated sums the sales per key into a `target` column -- confirm in prepare_data.
grid = make_grid(sales, index_cols)
gb = get_aggregated(sales, index_cols)
# Combine the grid with the aggregates on the key columns.
all_data = join_to_existing(grid, gb, index_cols)

In [5]:
# Order the rows by month, then shop, then item, and store every column as int32.
# The sort returns a new frame instead of mutating in place, so the frame produced by
# the previous cell is left untouched and the cell stays safe to re-run.
all_data = (
    all_data
    .sort_values(['date_block_num', 'shop_id', 'item_id'])
    .astype('int32')
)

In [6]:
# Derive calendar features from the running month index.
# date_block_num counts months consecutively, with block 0 mapped to month 1 of 2013,
# so month and year follow from modulo / integer division by 12.
all_data = all_data.assign(month=all_data.date_block_num % 12 + 1)
all_data = all_data.assign(year=2013 + all_data.date_block_num // 12)
all_data = all_data.astype('int32')

all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year
139255,0,19,0,0,1,2013
141495,0,27,0,0,1,2013
144968,0,28,0,0,1,2013
142661,0,29,0,0,1,2013
138947,0,32,0,6,1,2013


In [7]:
# Mean target per (shop_id, item_id) pair over the rows of all_data.
# NOTE(review): the mean is taken over every month in all_data, including the rows that
# are later held out for validation (date_block_num == 33) -- check for target leakage.
target_mean = (
    all_data
    .groupby(['shop_id', 'item_id'], as_index=False)[['target']]
    .mean()
    .rename(columns={'target': 'target_mean'})
    # Cast only the value column: a frame-wide astype('float32') would also turn the
    # shop_id / item_id join keys into floats.
    .astype({'target_mean': 'float32'})
)
print(target_mean.shape)

# From here on the join key is the (shop, item) pair; later cells rely on this value.
index_cols = ['shop_id', 'item_id']
all_data = join_to_existing(all_data, target_mean, index_cols)

(1136205, 3)


In [8]:
# Inspect column dtypes and memory usage after adding target_mean.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913850 entries, 0 to 10913849
Data columns (total 7 columns):
shop_id           int64
item_id           int64
date_block_num    int32
target            int32
month             int32
year              int32
target_mean       float32
dtypes: float32(1), int32(4), int64(2)
memory usage: 458.0 MB


In [9]:
from prepare_data import k_fold_mean, loo_mean, smoothing_mean, expanding_mean

# Add four more target-encoding columns. Each helper receives the current frame and
# returns the updated frame plus a second object that is merged into X_test further down.
# The calls are chained through all_data: each one consumes the previous result.
# NOTE(review): re-running this cell feeds the already-augmented frame back into the
# helpers -- presumably not idempotent; confirm against prepare_data.
all_data, target_kf  = k_fold_mean(all_data)
all_data, target_loo = loo_mean(all_data)
all_data, target_sm = smoothing_mean(all_data)
all_data, target_exp = expanding_mean(all_data)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year,target_mean,target_kfold,target_loo,target_smoothing,target_exp_mean
0,0.0,19.0,0.0,0.0,1.0,2013.0,0.0,0.3343,0.022727,0.237448,0.3343
1,0.0,27.0,0.0,0.0,1.0,2013.0,0.0,0.048523,0.056911,0.089905,0.3343
2,0.0,28.0,0.0,0.0,1.0,2013.0,0.0,0.142424,0.141414,0.168964,0.3343
3,0.0,29.0,0.0,0.0,1.0,2013.0,0.0,0.030303,0.0375,0.10791,0.3343
4,0.0,32.0,0.0,6.0,1.0,2013.0,8.0,0.89402,1.316088,1.260635,0.3343


In [10]:
# Separate the label column from the feature columns.
y = all_data['target'].astype('int32')
X = all_data.drop('target', axis=1)

print(y.shape)
print(X.shape)
X.head()

(10913850,)
(10913850, 10)


Unnamed: 0,shop_id,item_id,date_block_num,month,year,target_mean,target_kfold,target_loo,target_smoothing,target_exp_mean
0,0.0,19.0,0.0,1.0,2013.0,0.0,0.3343,0.022727,0.237448,0.3343
1,0.0,27.0,0.0,1.0,2013.0,0.0,0.048523,0.056911,0.089905,0.3343
2,0.0,28.0,0.0,1.0,2013.0,0.0,0.142424,0.141414,0.168964,0.3343
3,0.0,29.0,0.0,1.0,2013.0,0.0,0.030303,0.0375,0.10791,0.3343
4,0.0,32.0,0.0,1.0,2013.0,8.0,0.89402,1.316088,1.260635,0.3343


The test data must have the same features:

In [11]:
from prepare_data import prepare_test

# Build the test frame and attach the per-(shop, item) target mean.
X_test = join_to_existing(prepare_test(test), target_mean, index_cols)

# K-fold encoding: match the encoded values to the test rows, average them per test row,
# then join the averaged value back onto the test frame.
index = ['shop_id', 'item_id', 'date_block_num', 'month', 'year']
X_kf = (
    pd.merge(X_test, target_kf, on=index_cols)
    .groupby(index, as_index=False)[['target_kfold']]
    .mean()  # try also with min
)
X_test = join_to_existing(X_test, X_kf, index)

print(X_test.shape)
X_test.head()

(214200, 7)


Unnamed: 0,shop_id,item_id,date_block_num,month,year,target_mean,target_kfold
0,5,5037,34,11,2015,0.928571,2.257832
1,5,5320,34,11,2015,0.0,0.0
2,5,5233,34,11,2015,1.428571,0.3343
3,5,5232,34,11,2015,0.333333,0.3343
4,5,5268,34,11,2015,0.0,0.0


In [12]:
# Attach the leave-one-out encoding to the test rows.
# The encoded values matched to each (shop_id, item_id) pair are averaged, then joined
# back onto X_test. Grouping and joining use the key columns only: float feature
# columns such as target_kfold make fragile and needlessly wide join keys.
X_loo = (
    X_test.merge(target_loo, on=index_cols)
    .groupby(index_cols, as_index=False)[['target_loo']]
    .mean()  # try also with min
)

X_test = join_to_existing(X_test, X_loo, index_cols)
print(X_test.shape)

(214200, 8)


In [13]:
# Attach the smoothed-mean encoding to the test rows.
# The encoded values matched to each (shop_id, item_id) pair are averaged, then joined
# back onto X_test. Grouping and joining use the key columns only: float feature
# columns such as target_kfold / target_loo make fragile and needlessly wide join keys.
X_sm = (
    X_test.merge(target_sm, on=index_cols)
    .groupby(index_cols, as_index=False)[['target_smoothing']]
    .mean()  # try also with min
)

X_test = join_to_existing(X_test, X_sm, index_cols)
print(X_test.shape)

(214200, 9)


In [14]:
# Attach the expanding-mean encoding to the test rows.
# The encoded values matched to each (shop_id, item_id) pair are averaged, then joined
# back onto X_test. Grouping and joining use the key columns only: float feature
# columns such as target_kfold / target_loo / target_smoothing make fragile and
# needlessly wide join keys.
X_exp = (
    X_test.merge(target_exp, on=index_cols)
    .groupby(index_cols, as_index=False)[['target_exp_mean']]
    .mean()  # try also with min
)

X_test = join_to_existing(X_test, X_exp, index_cols)
print(X_test.shape)

(214200, 10)


In [15]:
# Preview the assembled test feature table; its columns should line up with X.
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,month,year,target_mean,target_kfold,target_loo,target_smoothing,target_exp_mean
0,5,5037,34,11,2015,0.928571,2.257832,1.952418,1.735593,2.357574
1,5,5320,34,11,2015,0.0,0.0,0.0,0.0,0.0
2,5,5233,34,11,2015,1.428571,0.3343,1.657611,1.331108,1.073332
3,5,5232,34,11,2015,0.333333,0.3343,1.098958,0.761703,1.376347
4,5,5268,34,11,2015,0.0,0.0,0.0,0.0,0.0


In [16]:
# Preview the raw test table for comparison with X_test.
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [17]:
# Time-based hold-out: rows with date_block_num < 33 train the model, block 33 validates it.
# Features and labels are selected with the same boolean masks, so each row keeps its
# own label even if all_data is not sorted by date_block_num.
train_mask = X.date_block_num < 33
valid_mask = X.date_block_num == 33

X_train, y_train = X[train_mask], y[train_mask]
X_valid, y_valid = X[valid_mask], y[valid_mask]

So we are ready to train and validate, i.e. to separate the training data into a training set and a validation set and train with XGBoost or something else. Then prepare the test data.

In [18]:
#XGBoost
def xgb_rmse(preds, y):
    """Custom XGBoost evaluation metric: RMSE after clipping to [0, 20].

    Parameters
    ----------
    preds : numpy array of model predictions.
    y : object exposing ``get_label()`` (the xgboost DMatrix being evaluated).

    Returns
    -------
    tuple ``('RMSE', score)`` in the form expected for ``feval``.
    """
    labels = np.clip(y.get_label(), 0., 20.)
    clipped_preds = np.clip(preds, 0., 20.)
    score = np.sqrt(mean_squared_error(labels, clipped_preds))
    return 'RMSE', score

In [None]:
params = {'eta': 0.2, 'max_depth': 4, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'seed': 18, 'silent': True}

# Build each DMatrix once and reuse it for both training and the watchlist;
# constructing the training matrix twice duplicates the largest object in memory.
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Both the built-in 'rmse' and the clipped 'RMSE' from xgb_rmse are reported;
# early stopping follows the last one listed (valid-RMSE).
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=10,
    feval=xgb_rmse,
    maximize=False,
    early_stopping_rounds=20,
)


[0]	train-rmse:3.12202	valid-rmse:5.21715	train-RMSE:1.11179	valid-RMSE:0.970129
Multiple eval metrics have been passed: 'valid-RMSE' will be used for early stopping.

Will train until valid-RMSE hasn't improved in 20 rounds.
[10]	train-rmse:2.45452	valid-rmse:5.02716	train-RMSE:0.902192	valid-RMSE:0.979531


In [None]:
# Predict with the number of trees selected by early stopping.
preds = xgb_model.predict(xgb.DMatrix(X_test), ntree_limit=xgb_model.best_ntree_limit)

# Clip to the same [0, 20] range that xgb_rmse applies during validation, so the
# submitted values are scored on the metric that was actually validated.
test['item_cnt_month'] = np.clip(preds, 0., 20.)
test[['ID','item_cnt_month']].to_csv('xgb_submission.csv', index=False)