# Sales Prediction for Time Series Data

## Part 4: Models (Neural Network)

In [2]:
import numpy as np
import pandas as pd 
import os
import time 
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
import h5py

%matplotlib inline 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def submission(model, X_test, id_path='ID.pkl', clip_min=0, clip_max=20):
    '''
    make submission file: predict, clip the predictions and write them to a
    timestamped csv next to the notebook
    arguments:    model    = fitted model exposing .predict(X)
                  X_test   = features to predict on
                  id_path  = path of the joblib-pickled ID column
                             (default 'ID.pkl', as before)
                  clip_min = lower bound applied to the predictions (default 0)
                  clip_max = upper bound applied to the predictions (default 20)
    return: None; a file 'submission_<MMDD_HH.MM>.csv' is saved in the
            working directory
    raises: ValueError if the number of predictions differs from the
            number of IDs
    '''
    # model prediction
    # A Keras model with a single output unit returns shape (n, 1); flatten so
    # the result is always a 1-D vector that can be stored as one column.
    pred = np.asarray(model.predict(X_test)).ravel()
    print('mean before clipping: ', pred.mean())
    pred = pred.clip(clip_min, clip_max)
    print('mean after clipping: ', pred.mean())

    # create prediction dataframe
    # NOTE(review): joblib.load unpickles the file -- only use trusted files.
    ID = joblib.load(id_path)
    if len(ID) != len(pred):
        raise ValueError('number of predictions (%d) does not match number of IDs (%d)'
                         % (len(pred), len(ID)))
    predDF = pd.DataFrame()
    predDF['ID'] = ID
    predDF['item_cnt_month'] = pred
    print(predDF.head())

    # write dataframe to csv; the file name carries month/day and hour.minute
    st = datetime.datetime.now().strftime('%m%d_%H.%M')
    filename = 'submission_' + st + '.csv'
    print(filename)

    predDF.to_csv(header=True, index=False, path_or_buf=filename)

    return None

In [4]:
DATA_FOLDER = '../data/'

# Level-1 training split: features and targets
X_train = pd.read_pickle(DATA_FOLDER + 'X_train_lev_1_standardScaler')
y_train = joblib.load(DATA_FOLDER + 'y_train_lev_1.pkl')

# Level-1 hold-out split; note the targets come from the 'y_val' file
X_test = pd.read_pickle(DATA_FOLDER + 'X_test_lev_1_standardScaler')
y_test = joblib.load(DATA_FOLDER + 'y_val_lev_1.pkl')

# Full training data and the test features used for the submission
X_train_full = pd.read_pickle(DATA_FOLDER + 'X_train_standardScaler')
y_train_full = joblib.load(DATA_FOLDER + 'y_train_full.pkl')
X_test_full = pd.read_pickle(DATA_FOLDER + 'X_test_standardScaler')

Sample 1% of all items (the fraction is set by `sampleSize` in the next cell)

In [5]:
sampleSize = 0.01 # 33 items

# Candidate pool: every distinct item_id that appears in either split
train_items = list(X_train['item_id'].unique())
test_items = list(X_test['item_id'].unique())
sample = list(set(train_items + test_items))

# Seeded draw, without replacement, of sampleSize * pool-size items
np.random.seed(1234)
n_sampled = int(len(sample) * sampleSize)
sample = list(np.random.choice(sample, size=n_sampled, replace=False, p=None))

Narrow down the training set.  

In [6]:
# Give both frames a fresh 0..n-1 index before selecting rows
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Rows whose item_id was drawn into the sample
train_mask = X_train['item_id'].isin(sample)
test_mask = X_test['item_id'].isin(sample)
trainIndex = X_train.loc[train_mask].index
testIndex = X_test.loc[test_mask].index

# Restrict features and targets to those rows
# NOTE(review): indexing y_* with these labels assumes the targets are
# ordered like the re-indexed frames -- confirm against the files loaded above
X_train_s, y_train_s = X_train.loc[trainIndex], y_train[trainIndex]
X_test_s, y_test_s = X_test.loc[testIndex], y_test[testIndex]

Convert to arrays

In [7]:
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same ndarray (all columns, existing order).
X_train_s_m = X_train_s.values
# Built from the full X_test (not the sampled X_test_s); both frames share
# the same columns, so no column selection is needed.
X_test_m = X_test.values
X_train_m = X_train.values
X_train_full_m = X_train_full.values

In [8]:
# Sanity check: feature-matrix shapes next to the matching target sizes
for dims in (X_train_s_m.shape,
             X_test_m.shape,
             y_train_s.size,
             y_test.size,
             X_train_full_m.shape):
    print(dims)

(11354, 235)
(43344, 235)
11354
43344
(6425094, 235)


Define base model

In [9]:
def base_model(input_dim=235):
    '''
    build and compile the baseline feed-forward regression network
    arguments:    input_dim = number of input features (default 235, the
                              column count of the feature matrices printed
                              earlier in this notebook)
    return: a compiled keras Sequential model (mean squared error loss,
            adam optimizer)
    '''
    # create model
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    # NOTE(review): no activation is given here, so this layer is linear --
    # confirm that is intended
    model.add(Dense(32, kernel_initializer='normal'))
    # single linear unit producing the regression output
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

Define Model Checkpoint: saves the full model (not only its weights, since `save_weights_only=False`) after each epoch in which the validation loss improved

In [10]:
filepath = 'nnBestModel.hdf5'

# Keep only the best epoch (by validation loss) on disk, as a full model file
checkpoint_options = dict(
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
)
checkpoint = ModelCheckpoint(filepath, **checkpoint_options)


Define Early Stopping: Stop training when a monitored quantity has stopped improving.

In [11]:
early_stop_options = dict(
    monitor='val_loss',
    min_delta=0.0001,  # smallest change in val_loss that counts as an improvement
    patience=5,        # epochs without improvement tolerated before training stops
    mode='auto',
)
early_stop = EarlyStopping(**early_stop_options)

In [39]:
# Checkpoint the best epoch and stop early once val_loss stalls
callbacks_list = [checkpoint, early_stop]

model = KerasRegressor(build_fn=base_model, verbose=True)

# Fit on the level-1 training matrix, validating on the level-1 hold-out data
fit_options = dict(
    validation_data=(X_test_m, y_test),
    epochs=50,
    batch_size=5,
    callbacks=callbacks_list,
)
history = model.fit(X_train_m, y_train, **fit_options)

Train on 1215879 samples, validate on 43387 samples
Epoch 1/50
Epoch 00001: val_loss improved from 0.90202 to 0.83240, saving model to nnBestModel.hdf5
Epoch 2/50
Epoch 00002: val_loss did not improve
Epoch 3/50
Epoch 00003: val_loss did not improve
Epoch 4/50
Epoch 00004: val_loss did not improve
Epoch 5/50
Epoch 00005: val_loss did not improve
Epoch 6/50
Epoch 00006: val_loss did not improve


Load the previously trained model (the best checkpoint saved during training) and evaluate it

In [41]:
# Reload the model file written to 'nnBestModel.hdf5'
model = load_model("nnBestModel.hdf5")

# Loss as reported by Keras on the hold-out data
scores = model.evaluate(X_test_m, y_test)
print(scores)

# MSE recomputed with scikit-learn from explicit predictions, plus its root
holdout_pred = model.predict(X_test_m)
MSE = mean_squared_error(y_test, holdout_pred)
RMSE = np.sqrt(MSE)
for label, value in (('Test_sample MSE:', MSE), ('Test_sample RMSE:', RMSE)):
    print(label, value)

0.8323987960203406
Test_sample MSE: 0.8323988
Test_sample RMSE: 0.9123589


Submit Model (trained on subset of data)

In [42]:
# Predict on the full test features and write a timestamped submission csv.
# NOTE(review): the trailing number is presumably the score this submission
# received -- confirm.
submission(model, X_test_full) # 0.96402

mean before clipping:  0.35371935
mean after clipping:  0.359124
   ID  item_cnt_month
0   0        0.445329
1   1        0.329181
2   2        0.749330
3   3        0.571687
4   4        5.889728
submission_0130_15.35.csv


#### Train on full set of data

In [13]:
# Fresh wrapper around base_model, fitted for a single epoch on the full
# training matrix (no validation data or callbacks are passed here)
model = KerasRegressor(build_fn=base_model, verbose=True)
full_fit_options = dict(epochs=1, batch_size=5)
model.fit(X_train_full_m, y_train_full, **full_fit_options)

Epoch 1/1


AttributeError: 'KerasRegressor' object has no attribute 'save'

In [15]:
# Predict on the full test features with the current `model` and write a
# timestamped submission csv.
# NOTE(review): the trailing number is presumably the score this submission
# received -- confirm.
submission(model, X_test_full) # 0.94596

mean before clipping:  0.17258075
mean after clipping:  0.24591118
   ID  item_cnt_month
0   0        0.299153
1   1        0.369452
2   2        0.851133
3   3        0.298807
4   4        5.882136
submission_0131_19.37.csv


In [18]:
# The KerasRegressor wrapper has no .save() (see the AttributeError above);
# save through its .model attribute instead.
model.model.save('nn_full_model.h5') 