In [4]:
import sys
sys.path.append("../")

import warnings
warnings.filterwarnings("ignore")

In [122]:
import time
import h5py
import os
from spotlight.interactions import Interactions
import hashlib
import json
import os
import shutil
import sys
import numpy as np
import torch
from sklearn.model_selection import ParameterSampler
from spotlight.cross_validation import user_based_train_test_split
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.sequence.representations import CNNNet
from spotlight.evaluation import sequence_mrr_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
random_state = np.random.RandomState(100)

from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.evaluation import precision_recall_score

In [8]:
from IPython.display import SVG, display
import matplotlib.pyplot as plt
import seaborn as sns
from reco.preprocess import encode_user_item, random_split, user_split

%matplotlib inline

# Prepare the data

In [2]:
df = pd.read_csv("/tf/notebooks/data/yoochoose/rating.csv") 

In [3]:
df.head()

Unnamed: 0,SessionId,ItemId,ItemType,Action,Time,Rating
0,87,214840483,NONE,BUY,1396852000.0,5
1,87,214840483,NONE,BUY,1396852000.0,5
2,87,214717286,NONE,CLICK,1396852000.0,1
3,87,214558807,NONE,CLICK,1396852000.0,1
4,87,214821300,NONE,CLICK,1396852000.0,1


In [9]:
# Data Encoding
DATA, user_encoder, item_encoder = encode_user_item(df, "SessionId", "ItemId", "Rating", "Time")

Number of users:  42144
Number of items:  5120


In [53]:
# Spotlight requires encoders to begin from 1 (instead of 0). We will add 1 to the encoders 
# When doing inverse transform, remember to subtract 1.

DATA.USER = DATA.USER + 1
DATA.ITEM = DATA.ITEM + 1

In [55]:
DATA.RATING = DATA.RATING.astype(np.int32)
DATA.USER = DATA.USER.astype(np.int32)
DATA.ITEM = DATA.ITEM.astype(np.int32)

In [56]:
DATA.head()

Unnamed: 0,SessionId,ItemId,ItemType,Action,TIMESTAMP,RATING,USER,ITEM
0,87,214840483,NONE,BUY,1396852000.0,5,1,3844
1,87,214840483,NONE,BUY,1396852000.0,5,1,3844
2,87,214717286,NONE,CLICK,1396852000.0,1,1,2574
3,87,214558807,NONE,CLICK,1396852000.0,1,1,703
4,87,214821300,NONE,CLICK,1396852000.0,1,1,2990


In [57]:
df_for_interaction_matrix = (DATA.USER.values,DATA.ITEM.values,DATA.RATING,DATA.TIMESTAMP)

In [58]:
df_interaction = Interactions(*df_for_interaction_matrix)

# Create train, val and test dataset

In [86]:
train_with_val, test = user_based_train_test_split(df_interaction,
                                          random_state=random_state, test_percentage = 0.2)

train, val = user_based_train_test_split(train_with_val, test_percentage=0.2, random_state=random_state)

# Implicit Model

In [88]:
model_implicit = ImplicitFactorizationModel(n_iter=3,
                                   loss='bpr')

In [90]:
%%time
model_implicit.fit(train)

CPU times: user 6min 4s, sys: 7.6 s, total: 6min 11s
Wall time: 1min 58s


### Prediction

In [111]:
user_for_reco = test.user_ids[0]
print(user_for_reco)

2


In [112]:
pred_for_user = model_implicit.predict(user_for_reco)

In [113]:
pred_for_user

array([-3.3767548 ,  0.23255795, -1.3784684 , ...,  0.28644297,
       -2.2947752 ,  1.0673456 ], dtype=float32)

In [114]:
rec_item_ids = (-pred).argsort()

In [115]:
rec_item_ids

array([2671, 4051, 4533, ..., 2755,  195, 2953])

In [117]:
# ground truth
target = test.item_ids[0]
target

3727

In [118]:
np.where(rec_item_ids == target)

(array([2529]),)

### Evaluation

In [125]:
%%time
implicit_mrr_score = mrr_score(model_implicit, test)

CPU times: user 21.4 s, sys: 68 ms, total: 21.4 s
Wall time: 10.7 s


In [126]:
implicit_mrr_score

array([0.0239379 , 0.00208317, 0.06231011, ..., 0.00373766, 0.07114779,
       0.00057579])

In [124]:
%%time
(pk, rk) = precision_recall_score(model_implicit, test, k= 5)

CPU times: user 18.7 s, sys: 64 ms, total: 18.8 s
Wall time: 9.39 s


# Sequence Model

In [85]:
max_sequence_length = 200
min_sequence_length = 50
step_size = 200

In [65]:
train = train.to_sequence(max_sequence_length=max_sequence_length,
                          min_sequence_length=min_sequence_length,
                          step_size=step_size)
test = test.to_sequence(max_sequence_length=max_sequence_length,
                        min_sequence_length=min_sequence_length,
                        step_size=step_size)
val = val.to_sequence(max_sequence_length=max_sequence_length,
                                    min_sequence_length=min_sequence_length,
                                    step_size=step_size)

In [67]:
print(train.sequences.shape)
print(test.sequences.shape)
print(val.sequences.shape)

(1595, 200)
(490, 200)
(390, 200)


In [68]:
net = CNNNet(train.num_items,
             embedding_dim=128,
             kernel_width=3,
             dilation=[1,1,1,1],
             num_layers=2,
             nonlinearity="relu",
             residual_connections=False)

In [69]:
model = ImplicitSequenceModel(loss="bpr",
                              representation=net,
                              batch_size=32,
                              learning_rate=0.1,
                              l2=0.0,
                              n_iter=2,
                              random_state=random_state)

In [70]:
%%time
model.fit(train)

CPU times: user 2min 32s, sys: 200 ms, total: 2min 33s
Wall time: 1min 16s


### Prediction

In [71]:
query = test.sequences[1][0:199]
target = test.sequences[1][199]

print("Shape of query is : ",query.shape)
print("The value of target is : ",target)

Shape of query is :  (199,)
The value of target is :  2579


In [72]:
pred = model.predict(query)

In [73]:
rec_item_ids = (-pred).argsort()

In [83]:
# Prediction scores

In [74]:
pred

array([      0.   , -105572.055,  -38352.793, ...,   10967.484,
        -36471.242,  128064.086], dtype=float32)

In [81]:
# Sorted order of the recommendations

In [75]:
rec_item_ids

array([2671, 4051, 4533, ..., 2755,  195, 2953])

In [80]:
### Finding the position of our actual ground truth in the prediction:

In [76]:
np.where(rec_item_ids == target)

(array([1647]),)

In [79]:
### Item ID that is to be recommended :

In [78]:
item_encoder.inverse_transform([rec_item_ids[0]-1])[0]

214718379