# This is the part 2 of a 2 part notebook, continuing with the modeling and submission process.

# Part 1 covered data exploration and feature creation.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import keras
import sklearn

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("xgboost version:", xgb.__version__)
print("sklearn version:", sklearn.__version__)
print("keras version:", keras.__version__)

pandas version: 0.22.0
numpy version: 1.14.1
xgboost version: 0.7.post3
sklearn version: 0.19.1
keras version: 2.1.3


In [2]:
data = pd.read_hdf("../data/processed_data.h5", key="data")

In [3]:
print(data.shape)
data.head()

(11128050, 37)


Unnamed: 0,ID,date_block_num,item_cat_enc,item_category_id,item_id,item_name,item_target_enc,shop_id,target,item_lag1,...,total_item_sales_lag3,total_item_sales_lag4,total_item_sales_lag5,total_item_sales_lag12,total_cat_sales_lag1,total_cat_sales_lag2,total_cat_sales_lag3,total_cat_sales_lag4,total_cat_sales_lag5,total_cat_sales_lag12
0,,0,0.254509,40,19,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,0.022222,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,,0,0.764905,19,27,"007 Legends [PS3, русская версия]",0.056834,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,,0,1.213606,30,28,"007 Legends [PС, Jewel, русская версия]",0.141176,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,,0,0.674349,23,29,"007 Legends [Xbox 360, русская версия]",0.037383,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,,0,0.254509,40,32,1+1,1.342412,0,6.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


# Let's add a few extra features to the model, namely an indictator if it's the first time a product is appearing and also the sum and mean of the lagged supply features.

In [4]:
data = data.merge(data.groupby("item_id", as_index=False).date_block_num.min().rename(columns={"date_block_num":"first_appearance"}), how="left")

In [5]:
data["months_since_first_appearance"] = data.date_block_num - data.first_appearance
data["months_since_first_appearance_inv"] = 1/data.months_since_first_appearance
data.months_since_first_appearance_inv.replace(np.inf, -1, inplace=True)
data["product_launch"] = 0
data.loc[data.months_since_first_appearance == 0, "product_launch"] = 1

In [6]:
data["item_lag_mean"] = data[[x for x in data.columns if "item_lag" in x]].mean(axis=1)
data["item_lag_sum"] = data[[x for x in data.columns if "item_lag" in x]].sum(axis=1)

In [7]:
data.head()

Unnamed: 0,ID,date_block_num,item_cat_enc,item_category_id,item_id,item_name,item_target_enc,shop_id,target,item_lag1,...,total_cat_sales_lag3,total_cat_sales_lag4,total_cat_sales_lag5,total_cat_sales_lag12,first_appearance,months_since_first_appearance,months_since_first_appearance_inv,product_launch,item_lag_mean,item_lag_sum
0,,0,0.254509,40,19,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,0.022222,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0,0,-1.0,1,-1.0,-7.0
1,,0,0.764905,19,27,"007 Legends [PS3, русская версия]",0.056834,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0,0,-1.0,1,-1.0,-7.0
2,,0,1.213606,30,28,"007 Legends [PС, Jewel, русская версия]",0.141176,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0,0,-1.0,1,-1.0,-7.0
3,,0,0.674349,23,29,"007 Legends [Xbox 360, русская версия]",0.037383,0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0,0,-1.0,1,-1.0,-7.0
4,,0,0.254509,40,32,1+1,1.342412,0,6.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0,0,-1.0,1,-1.0,-7.0


In [8]:
import gc
gc.collect()

121

# There isn't enough memory to fit an xgboost model to the entire dataset, so lets remove the first year of data. Also, since values are clipped to be between (0,20) let fit to values between (0,40) so that we are closer to the target distribution. Why not clip the training data to 20 as well? Well if we fit to 20, the model will predict values under 20, but instead we want values to be forecasted slightly above 20, then clip to 20.

# Trying to fit models over all the features also resulted in memory errors, so models were instead fit to each subset of lagged features and evaluated. The sets of lagged features are item lag, total item sales lag, total category sales lag, and total shop sales lag. 

In [22]:
train = data[(data.date_block_num > 12)&(data.date_block_num < 33)] #use smaller dataset for cv

lagged_features = ["total_item_sales", "total_cat_sales", "total_shop_sales"]
predictors = [x for x in train.columns if x not in ["target","item_name","date_block_num", "ID", "item_id"] + lagged_features and "lag" not in x]
predictors += [x for x in train.columns if "item_lag" in x]
print(predictors)

params = {
    "n_estimators": [76],
    "max_depth": [4,5,6,7],
    "learning_rate": [.089]
}

model = xgb.XGBRegressor()
tscv = TimeSeriesSplit(n_splits=5)
train = train.set_index("date_block_num")
cv = tscv.split(train)
gs = GridSearchCV(model, params, cv=cv, verbose=2, n_jobs=5)
gs.fit(train[predictors], train.target.clip(0,40))

['item_cat_enc', 'item_category_id', 'item_target_enc', 'shop_id', 'shop_enc', 'first_appearance', 'months_since_first_appearance', 'months_since_first_appearance_inv', 'product_launch', 'item_lag1', 'item_lag2', 'item_lag3', 'item_lag4', 'item_lag5', 'item_lag12', 'item_lag_mean', 'item_lag_sum']
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] learning_rate=0.089, max_depth=4, n_estimators=76 ...............
[CV] learning_rate=0.089, max_depth=4, n_estimators=76 ...............
[CV] learning_rate=0.089, max_depth=4, n_estimators=76 ...............
[CV] learning_rate=0.089, max_depth=4, n_estimators=76 ...............
[CV] learning_rate=0.089, max_depth=4, n_estimators=76 ...............
[CV]  learning_rate=0.089, max_depth=4, n_estimators=76, total= 1.5min
[CV] learning_rate=0.089, max_depth=5, n_estimators=76 ...............
[CV]  learning_rate=0.089, max_depth=4, n_estimators=76, total= 3.1min
[CV] learning_rate=0.089, max_depth=5, n_estimators=76 ...............
[C

[Parallel(n_jobs=5)]: Done  20 out of  20 | elapsed: 32.2min finished


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x7f1f11130830>,
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=5,
       param_grid={'n_estimators': [76], 'max_depth': [4, 5, 6, 7], 'learning_rate': [0.089]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [23]:
print(gs.best_score_)
print(gs.best_params_)

0.4800194223180099
{'learning_rate': 0.089, 'max_depth': 4, 'n_estimators': 76}


In [25]:
def rmse(y1, y2):
    return np.sqrt(np.mean((np.ravel(y1) - np.ravel(y2))**2))

valid = data[data.date_block_num == 33]
rmse(valid.target.clip(0,20),gs.best_estimator_.predict(valid[predictors]).clip(0,20))

# Now let's save promising models for ensembling later. 30-40 models later,  only the item lag model was able to achieve good scores and so let's just save those. 

In [3]:
import pickle

In [21]:
pickle.dump(gs.best_estimator_, open("models2/item_lag76-3-.089-0.96.pkl", "wb"))

In [28]:
pickle.dump(gs.best_estimator_, open("models2/item_lag76-4-.089-0.94.pkl", "wb"))

# Now in order to make use of all the features and all of the data, let's fit a neural net as neural nets can take advantage of batch learning. The neural net will be fit over the entire dataset(before valid/test blocks) and over all the features.

# For the neural net, numerical features will be scaled between (0, 1) and and categorical features will be one-hot encoded.

In [39]:
train = data[(data.date_block_num < 33)]
valid = data[data.date_block_num == 33]

predictors = [x for x in train.columns if x not in ["product_launch","ID","shop_id","item_id","item_name","item_category_id","target", "total_shop_sales","total_item_sales","total_cat_sales"]]
predictors

['date_block_num',
 'item_cat_enc',
 'item_target_enc',
 'item_lag1',
 'item_lag2',
 'item_lag3',
 'item_lag4',
 'item_lag5',
 'item_lag12',
 'shop_enc',
 'total_shop_sales_lag1',
 'total_shop_sales_lag2',
 'total_shop_sales_lag3',
 'total_shop_sales_lag4',
 'total_shop_sales_lag5',
 'total_shop_sales_lag12',
 'total_item_sales_lag1',
 'total_item_sales_lag2',
 'total_item_sales_lag3',
 'total_item_sales_lag4',
 'total_item_sales_lag5',
 'total_item_sales_lag12',
 'total_cat_sales_lag1',
 'total_cat_sales_lag2',
 'total_cat_sales_lag3',
 'total_cat_sales_lag4',
 'total_cat_sales_lag5',
 'total_cat_sales_lag12',
 'first_appearance',
 'months_since_first_appearance',
 'months_since_first_appearance_inv',
 'item_lag_mean',
 'item_lag_sum']

In [40]:
gc.collect()

3200

In [44]:
X = train[predictors].values
y = train.target.clip(0,40)

X_valid = valid[predictors].fillna(0).values
y_valid = valid.target.clip(0,40)

In [45]:
X.shape

(10675678, 33)

In [46]:
from sklearn.preprocessing import MinMaxScaler

In [47]:
mm_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X = mm_scaler.fit_transform(X)
X_valid = mm_scaler.transform(X_valid)

y = y_scaler.fit_transform(y.values.reshape(-1,1))
y_valid = y_scaler.transform(y_valid.values.reshape(-1,1))

In [48]:
shop_id_dummies = pd.get_dummies(data.shop_id)
item_cat_dummies = pd.get_dummies(data.item_category_id)

In [49]:
def batch_generator(X, y, batchsize=512):
    while True:
        idx = np.random.choice(np.arange(X.shape[0]), batchsize, replace=False)
        X_batch, y_batch = X[idx], y[idx]
        X_batch = np.hstack([X_batch, shop_id_dummies.iloc[idx], item_cat_dummies.iloc[idx], 
                             data.iloc[idx].product_launch.values.reshape(-1,1)])
        yield X_batch, y_batch

In [50]:
data_gen = batch_generator(X, y)

In [51]:
X_valid = np.hstack([X_valid, shop_id_dummies.loc[valid.index], item_cat_dummies.loc[valid.index], 
                     data.loc[valid.index].product_launch.values.reshape(-1,1)])

In [52]:
X_valid.shape

(238172, 178)

In [53]:
import gc
gc.collect()

304

In [54]:
from keras.layers import Dense, Input, Dropout
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import ModelCheckpoint

In [60]:
inp = Input((X_valid.shape[1],))
x = Dense(512, activation="relu")(inp)
x = Dense(512, activation="relu")(x)
x = Dense(16, activation="relu")(x)
x = Dense(1, kernel_regularizer=regularizers.l2(0.001))(x)

callbacks = [
    ModelCheckpoint("models2/512-512-16-1-l2001.h5", save_best_only=True, verbose=1)
]

model = Model(inputs=inp, outputs=x)
model.compile(optimizer="adam", loss="mean_squared_error")

In [61]:
model.fit_generator(data_gen, steps_per_epoch=100, epochs=600, validation_data=(X_valid, y_valid), callbacks=callbacks)

Epoch 1/600
Epoch 00001: val_loss improved from inf to 0.00305, saving model to models2/512-512-16-1-l2001.h5
Epoch 2/600
Epoch 00002: val_loss improved from 0.00305 to 0.00251, saving model to models2/512-512-16-1-l2001.h5
Epoch 3/600
Epoch 00003: val_loss improved from 0.00251 to 0.00210, saving model to models2/512-512-16-1-l2001.h5
Epoch 4/600
Epoch 00004: val_loss improved from 0.00210 to 0.00186, saving model to models2/512-512-16-1-l2001.h5
Epoch 5/600
Epoch 00005: val_loss improved from 0.00186 to 0.00177, saving model to models2/512-512-16-1-l2001.h5
Epoch 6/600
Epoch 00006: val_loss improved from 0.00177 to 0.00143, saving model to models2/512-512-16-1-l2001.h5
Epoch 7/600
Epoch 00007: val_loss improved from 0.00143 to 0.00137, saving model to models2/512-512-16-1-l2001.h5
Epoch 8/600
Epoch 00008: val_loss improved from 0.00137 to 0.00120, saving model to models2/512-512-16-1-l2001.h5
Epoch 9/600
Epoch 00009: val_loss improved from 0.00120 to 0.00115, saving model to models2/

Epoch 34/600
Epoch 00034: val_loss did not improve
Epoch 35/600
 16/100 [===>..........................] - ETA: 17s - loss: 5.9062e-04

KeyboardInterrupt: 

## Fitting neural nets is a very time consuming process and many iterations later a set of weights and architecture were finally found that achieved decent scores. For brevity, only the final model is included here.

In [63]:
model = load_model("models2/512-512-16-1-l2001.h5")

In [64]:
rmse(valid.target.clip(0,20), y_scaler.inverse_transform(model.predict(X_valid)).clip(0,20))

0.9883868533546424

In [113]:
import gc
gc.collect()

1784

# Model Stacking

In [65]:
test = data[data.date_block_num == 34]

In [68]:
X_test = np.hstack([mm_scaler.transform(test[predictors].fillna(0)), shop_id_dummies.loc[test.index], item_cat_dummies.loc[test.index], 
                   data.loc[test.index].product_launch.values.reshape(-1,1)])

In [69]:
out = test[["ID"]].copy()
out["item_cnt_month"] = model.predict(X_test)
out.ID = out.ID.astype(int)
out.item_cnt_month = y_scaler.inverse_transform(out.item_cnt_month.values.reshape(-1,1)).clip(0,20)

In [77]:
xgb_model = pickle.load(open("models2/item_lag76-4-.089-0.94.pkl", "rb"))

In [75]:
xgb_predictors = ['item_cat_enc', 'item_category_id', 'item_target_enc', 'shop_id', 'shop_enc', 'first_appearance', 'months_since_first_appearance', 'months_since_first_appearance_inv', 'product_launch', 'item_lag1', 'item_lag2', 'item_lag3', 'item_lag4', 'item_lag5', 'item_lag12', 'item_lag_mean', 'item_lag_sum']

In [87]:
y_valid_xgb = xgb_model.predict(valid[xgb_predictors]).clip(0,20)
y_valid_nn = y_scaler.inverse_transform(model.predict(X_valid).reshape(-1,1)).clip(0,20).ravel()

y_test_xgb = xgb_model.predict(test[xgb_predictors]).clip(0,20)
y_test_nn = y_scaler.inverse_transform(model.predict(X_test).reshape(-1,1)).clip(0,20).ravel()

In [123]:
X_ens_valid = np.vstack([y_valid_xgb, y_valid_nn]).T
X_ens_valid.shape

(238172, 2)

## Here 2 methods are explored for stacking the xgb and nn output. A simple linear combination is done over gridsearch to find the best alpha for combining the 2 models:
### alpha \* predictions1 + (1-a) \* predictions2

## and the same is done but using gridsearch to find the best alpha over 5-folds.

In [79]:
alphas = np.linspace(.001,1.0,1000)
best_score = None
best_alpha = None
for a in alphas:
    predict = (a * y_valid_xgb + (1-a)*y_valid_nn).clip(0,20)
    score = rmse(valid.target.clip(0,20), predict)
    if best_score is None or score < best_score:
        best_score = score
        best_alpha = a

In [94]:
best_alpha

0.617

In [111]:
from sklearn.base import BaseEstimator, RegressorMixin

Let's make a model so that we can use sklearn's gridsearch to search over alpha parameters

In [151]:
class EnsembleModel(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=0.5):
        self.alpha = alpha
        
    def fit(self, X, y):
        pass
        
    def predict(self, X):
        return (self.alpha * X[:,0] + (1-self.alpha)*X[:,1]).clip(0,20)

In [152]:
ens_model = EnsembleModel()
params = {"alpha":np.linspace(0.001,1.0,1000)}

In [153]:
from sklearn.model_selection import GridSearchCV

In [158]:
gs = GridSearchCV(ens_model, params, cv=5, verbose=1, scoring="neg_mean_squared_error")
gs.fit(X_ens_valid, y_valid.clip(0,20))

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:   28.5s finished


GridSearchCV(cv=5, error_score='raise', estimator=EnsembleModel(alpha=0.5),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.001, 0.002, ..., 0.999, 1.   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [159]:
print(gs.best_estimator_)
print(gs.best_score_)

EnsembleModel(alpha=0.8240000000000001)
-0.5426809223187207


In [160]:
rmse(y_valid.clip(0,20), gs.best_estimator_.predict(X_ens_valid).clip(0,20))

0.7366688009673823

In [89]:
#out.item_cnt_month = (best_alpha * y_test_xgb + (1-best_alpha)*y_test_nn).clip(0,20)

In [148]:
out.item_cnt_month = gs.best_estimator_.predict(np.vstack([y_test_xgb, y_test_nn]).T)

In [149]:
out.to_csv("submission_nn_xgb3.csv",index=False)

In [150]:
!kaggle competitions submit -c competitive-data-science-final-project -f submission_nn_xgb3.csv -m "nn xgb 3"

Successfully submitted to Final project: predict future sales

# End result: the non-cv stack achieved a score of .96 and the cv-stack achieved a score of .97 on the leaderboard.