In [0]:
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## Pre-process Data

In [0]:
# Load the raw transaction log.
train = pd.read_csv('../data/train.csv')

# Basic cleaning: drop the invoice number column.
train = train.drop('InvoiceNo', axis = 1)

# Clamp non-positive quantities to zero. `.loc` writes into `train` itself;
# the earlier chained form (`train[mask]['Quatity'] = 0`) assigned to a
# temporary copy under a misspelled column name, so nothing was changed.
mask = train['Quantity'] <= 0
train.loc[mask, 'Quantity'] = 0
users_train = np.unique(train.CustomerID)

# NOTE(review): if the raw date strings are day-first (dd/mm/yy), pass
# dayfirst=True here, otherwise day and month can be swapped -- confirm
# against the CSV before relying on 'Month'.
train['InvoiceDate'] = pd.to_datetime(train['InvoiceDate'])
# Calendar month (1-12) of each transaction.
train['Month'] = train['InvoiceDate'].dt.month
train['total_price'] = train['Quantity'] * train['UnitPrice']

train.head()

Unnamed: 0,CustomerID,Quantity,InvoiceDate,UnitPrice,Country,StockCode,Month,total_price
0,27270,7,2010-01-12 08:26:00,2.55,PX,85123AY,1,17.85
1,27270,7,2010-01-12 08:26:00,3.39,PX,71053R,1,23.73
2,27270,9,2010-01-12 08:26:00,2.75,PX,84406BH,1,24.75
3,27270,7,2010-01-12 08:26:00,3.39,PX,84029GV,1,23.73
4,27270,7,2010-01-12 08:26:00,3.39,PX,84029EX,1,23.73


In [0]:
def dictionary(Stocks):
    """Assign a consecutive integer index to every distinct stock code.

    Parameters
    ----------
    Stocks : array-like
        Stock codes, possibly with repetitions.

    Returns
    -------
    dict
        Maps each distinct code to its position (starting at 0) in the
        sorted array returned by ``np.unique``.
    """
    unique_codes = np.unique(Stocks)
    return {code: position for position, code in enumerate(unique_codes)}

def trip_vector(transaction, prod_dic):
    """Encode one transaction as a multi-hot vector over all known products.

    Parameters
    ----------
    transaction : iterable
        Product codes in the transaction; duplicates are ignored.
    prod_dic : dict
        Maps product code -> column index (as built by ``dictionary``).

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(prod_dic)`` holding 1 at the index of
        every product present in the transaction and 0 elsewhere.

    Raises
    ------
    KeyError
        If the transaction contains a code missing from ``prod_dic``.
    """
    encoded = np.zeros(len(prod_dic))
    # Resolve every distinct product to its column before touching the vector.
    columns = [prod_dic[code] for code in set(transaction)]
    encoded[np.asarray(columns, dtype=int)] = 1
    return encoded

def DataLoader(train_data, test_data, users, filter_data = True, n_trips = 7, product_index = None):
    """Build padded per-customer trip sequences (x) and multi-hot targets (y).

    Parameters
    ----------
    train_data : pandas.DataFrame
        One row per trip with columns 'CustomerID', 'InvoiceDate',
        'StockCode' (space-separated product codes) and 'trip_vec'. Only the
        length of the first 'trip_vec' entry is read, to size the vectors.
    test_data : pandas.DataFrame
        One row per trip with columns 'CustomerID' and 'StockCode'. All trips
        of a customer are merged into a single target vector.
    users : array-like
        Accepted for backward compatibility; not used.
    filter_data : bool
        When True, keep only customers with strictly more than ``n_trips``
        trips in ``train_data``.
    n_trips : int
        Threshold used when ``filter_data`` is True.
    product_index : dict, optional
        Maps product code -> column index. When omitted, the notebook-level
        ``prod_dic`` is used, as before.

    Returns
    -------
    x : numpy.ndarray
        Shape (n_customers, max_length, n_prods). Each customer's trips are
        ordered by 'InvoiceDate' and left-padded with all-zero rows up to the
        longest retained history.
    y : numpy.ndarray
        Shape (n_customers, n_prods). Multi-hot vector of every product the
        customer has in ``test_data``.

    Only customers present in both frames are returned. If no customer
    survives the filter, two empty arrays are returned.
    """
    if product_index is None:
        # Fall back to the mapping defined at notebook level.
        product_index = prod_dic
    n_prods = len(train_data['trip_vec'].iloc[0])

    def encode(codes):
        # Space-separated codes -> integer multi-hot vector.
        return trip_vector(codes.split(), product_index).astype(int)

    # Preparing output data: one joined code string per customer.
    y_codes = test_data.groupby('CustomerID')['StockCode'].apply(' '.join)

    # Preparing input data: chronological trips and per-customer trip counts.
    train_data = train_data.sort_values(by = ['CustomerID', 'InvoiceDate'])
    trips_per_user = train_data.groupby('CustomerID')['StockCode'].count()
    if filter_data:
        trips_per_user = trips_per_user[trips_per_user > n_trips]
    if trips_per_user.empty:
        # Nothing to pad against; avoid calling max() on an empty selection.
        return np.array([]), np.array([])
    # Padding length is taken over every retained customer, including those
    # that are later skipped for having no test data.
    max_length = int(trips_per_user.max())

    x, y = [], []
    # A single groupby pass replaces a full-frame boolean scan per customer.
    for customer, user_trips in train_data.groupby('CustomerID'):
        if customer not in trips_per_user.index or customer not in y_codes.index:
            continue
        trip_matrix = np.vstack([encode(codes) for codes in user_trips['StockCode']])
        padding = np.zeros((max_length - len(trip_matrix), n_prods))
        x.append(np.vstack((padding, trip_matrix)))
        y.append(encode(y_codes.loc[customer]))

    return np.array(x), np.array(y)

In [0]:
# Product code -> column index over every product seen in `train`.
prod_dic = dictionary(train.StockCode)

# One row per trip: all codes bought by a customer at the same timestamp,
# joined into a single space-separated string.
trips = train.groupby(['CustomerID', 'InvoiceDate', 'Month'])['StockCode'].apply(' '.join).reset_index()

# Temporal split: months 1-6 are the model input, months 7-12 the target.
mask = trips['Month'] > 6
# Take explicit copies so the column added below is written to an
# independent frame rather than to a slice of `trips`.
train_data = trips[~mask].copy()
test_data = trips[mask].copy()

print('Number of transactions in training data: ', train_data.shape[0])
print('Number of transactions in testing data: ', test_data.shape[0])

train_data['trip_vec'] = train_data['StockCode'].apply(lambda x : trip_vector(x.split(), prod_dic).astype(int))

x, y = DataLoader(train_data, test_data, users_train, filter_data = False)
print('x: ', x.shape, '; y: ', y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

Number of transactions in training data:  6852
Number of transactions in testing data:  7935
x:  (825, 32, 3810) ; y:  (825, 3810)


## RNN Model

In [0]:
from keras.layers import Dense, Flatten, LSTM
from keras.models import Sequential
from keras.layers.embeddings import Embedding
import keras.backend as K

from keras.callbacks import ModelCheckpoint
from keras.utils.vis_utils import plot_model
from keras.models import load_model

In [0]:
embed_dim = 128
lstm_out = 144
batch_size = 32

# Input dimensions come from the data instead of a hard-coded product count,
# so the model stays valid if the product vocabulary changes.
n_steps, n_features = x_train.shape[1], x_train.shape[2]

# Four stacked LSTM layers followed by a sigmoid layer with one output per
# product (multi-label target, hence binary cross-entropy).
model = Sequential()
# `dropout` / `recurrent_dropout` are the Keras 2 names of the former
# `dropout_W` / `dropout_U` arguments.
model.add(LSTM(units = lstm_out, recurrent_dropout = 0.2, dropout = 0.2,
               return_sequences = True, input_shape = (n_steps, n_features)))
model.add(LSTM(units = 128, return_sequences = True))
model.add(LSTM(units = 256, return_sequences = True))
model.add(LSTM(units = 256))
model.add(Dense(n_features, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 32, 144)           2278080   
_________________________________________________________________
lstm_6 (LSTM)                (None, 32, 128)           139776    
_________________________________________________________________
lstm_7 (LSTM)                (None, 32, 256)           394240    
_________________________________________________________________
lstm_8 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_2 (Dense)              (None, 3810)              979170    
Total params: 4,316,578
Trainable params: 4,316,578
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# Write a checkpoint only when validation accuracy improves; the file name
# records the validation accuracy and the epoch.
checkpoints = ModelCheckpoint(filepath='../saved_models/nvacc{val_acc:.4f}_e{epoch:02d}.hdf5',
                              verbose=1, monitor='val_acc', save_best_only=True)

# NOTE(review): in the logged run val_acc does not improve after the first
# epoch; plain accuracy may be uninformative for this multi-label target --
# consider tracking precision/recall as well.
# Keep the History object for later inspection instead of discarding it, and
# use the `batch_size` constant rather than repeating the literal.
history = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10,
                    batch_size = batch_size, verbose=1, callbacks=[checkpoints])

Train on 660 samples, validate on 165 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.96091, saving model to ../saved_models/nvacc0.9609_e01.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.96091
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.96091
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.96091
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.96091
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.96091
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.96091
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.96091
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.96091
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.96091


<keras.callbacks.History at 0x1da5e1780>

In [0]:
# Checkpoint written by the logged training run: val_acc 0.9609 was reached
# at epoch 1 and never improved, so '..._e01.hdf5' is the only file that run
# saved. The earlier '..._e07.hdf5' name does not match any logged checkpoint.
# NOTE(review): training is not seeded, so this name must be updated whenever
# the model is retrained.
best_model = 'nvacc0.9609_e01.hdf5'
# Decision threshold for turning probabilities into labels; not used in this cell.
threshold = 0.5

model = load_model('../saved_models/' + best_model)
print('Best model loaded!!')

# Per-product purchase probabilities for the held-out customers.
y_pred = model.predict(x_test)

Best model loaded!!


## Filtered Data

In [0]:
# Rebuild the tensors, keeping only customers with more than 5 training trips.
x, y = DataLoader(
    train_data,
    test_data,
    users_train,
    filter_data=True,
    n_trips=5,
)
print('x: ', x.shape, '; y: ', y.shape)

# Same 80/20 split and seed as for the unfiltered data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

x:  (445, 32, 3810) ; y:  (445, 3810)


In [0]:
# Write a checkpoint only when validation accuracy improves; the file name
# records the validation accuracy and the epoch.
checkpoints = ModelCheckpoint(filepath='../saved_models/nvacc{val_acc:.4f}_e{epoch:02d}.hdf5',
                              verbose=1, monitor='val_acc', save_best_only=True)

# Continue training the current `model` on the filtered customers. Keep the
# History object for later inspection instead of discarding it, and use the
# `batch_size` constant rather than repeating the literal.
history_filtered = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 100,
                             batch_size = batch_size, verbose=1, callbacks=[checkpoints])

Train on 356 samples, validate on 89 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.93127, saving model to ../saved_models/nvacc0.9313_e01.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.93127 to 0.93147, saving model to ../saved_models/nvacc0.9315_e02.hdf5
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.93147
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.93147
Epoch 5/100

Epoch 00005: val_acc improved from 0.93147 to 0.93155, saving model to ../saved_models/nvacc0.9315_e05.hdf5
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.93155
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.93155
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.93155
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.93155
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.93155
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.93155
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.93155
Epoch 13/100

Epoch 00013


Epoch 00086: val_acc did not improve from 0.93159
Epoch 87/100

Epoch 00087: val_acc did not improve from 0.93159
Epoch 88/100

Epoch 00088: val_acc did not improve from 0.93159
Epoch 89/100

Epoch 00089: val_acc did not improve from 0.93159
Epoch 90/100

Epoch 00090: val_acc did not improve from 0.93159
Epoch 91/100

Epoch 00091: val_acc did not improve from 0.93159
Epoch 92/100

Epoch 00092: val_acc did not improve from 0.93159
Epoch 93/100

Epoch 00093: val_acc did not improve from 0.93159
Epoch 94/100

Epoch 00094: val_acc did not improve from 0.93159
Epoch 95/100

Epoch 00095: val_acc did not improve from 0.93159
Epoch 96/100

Epoch 00096: val_acc did not improve from 0.93159
Epoch 97/100

Epoch 00097: val_acc did not improve from 0.93159
Epoch 98/100

Epoch 00098: val_acc did not improve from 0.93159
Epoch 99/100

Epoch 00099: val_acc did not improve from 0.93159
Epoch 100/100

Epoch 00100: val_acc did not improve from 0.93159


<keras.callbacks.History at 0x1da5ef780>

In [0]:
# Re-read both raw files; `train` is rebound to the uncleaned frame here.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Sorted distinct customer ids of the test file.
users_test = np.unique(test['CustomerID'])

# Stack train and test rows; original row labels are kept and columns are
# not re-sorted.
data = pd.concat((train, test), sort=False)
del train
del test

# Tuple of (distinct stock codes, number of rows for each code).
prods = np.unique(data['StockCode'], return_counts=True)

In [0]:
# One row per (customer, invoice timestamp), with that trip's stock codes
# joined into a single comma-separated string.
codes_by_trip = data.groupby(['CustomerID', 'InvoiceDate'])['StockCode']
trips = codes_by_trip.apply(', '.join).reset_index()
trips.tail()

Unnamed: 0,CustomerID,InvoiceDate,StockCode
22857,577215,02/12/11 2:02:00 PM,46000MP
22858,577215,08/12/11 1:36:00 PM,"22429T, 84378P, 23250A, 22720A, 23366F, 22972W..."
22859,578025,27/11/11 4:18:00 PM,"22483A, 23583L, 21793O"
22860,579015,01/12/11 3:11:00 PM,"23499W, 22831H, 22079R"
22861,580815,07/12/11 8:03:00 AM,"22961C, 21531U, 22627E, 22625P, 22960K, 22554P..."
