In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np  
import pandas as pd
from keras.models import Sequential  
from tensorflow.keras.layers import Embedding
import numpy as np
from keras.models import Model
from keras.layers import Concatenate
from keras.layers import Dense, concatenate, Input
from keras.layers import LSTM
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
import statistics
import tensorflow as tf


### calculate precision between two lists

In [2]:
def precision(actual, predicted, k):
    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(pred_set.intersection(actual)) / float(k)
    return result

In [3]:
def mapk(actual, predicted):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
            
    """

        
    return np.mean([precision(a,np.argmax(p,axis=-1),5) for a,p in zip(actual, predicted)])

In [4]:

def create_train_tfdata(train_feat_dict, train_target_tensor,
                        batch_size, buffer_size=None):
    """
    Create train tf dataset for model train input
    :param train_feat_dict: dict, containing the features tensors for train data
    :param train_target_tensor: np.array(), the training TARGET tensor
    :param batch_size: (int) size of the batch to work with
    :param buffer_size: (int) Optional. Default is None. Size of the buffer
    :return: (tuple) 1st element is the training dataset,
                     2nd is the number of steps per epoch (based on batch size)
    """
    if buffer_size is None:
        buffer_size = batch_size*50

    train_steps_per_epoch = len(train_target_tensor) // batch_size

    train_dataset = tf.data.Dataset.from_tensor_slices((train_feat_dict,train_target_tensor)).cache()
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
    train_dataset = train_dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)
    
    return train_dataset, train_steps_per_epoch


### defining model

In [5]:
def model(views_lenght,items_lenght,seq_lenght):
    #model=Sequential()
    #max lenght of sequence 
    inp1=Input(shape=(seq_lenght,),dtype=tf.int32,name='itemid')
    #the input dimensions of items is the tokenizer lenght
    embedding_layer1 = Embedding(input_dim=items_lenght, output_dim=100)(inp1)
    inp2=Input(shape=(seq_lenght,),dtype=tf.int32,name='views')
    # and number of unique views different views values
    embedding_layer2 = Embedding(input_dim=views_lenght, output_dim=100)(inp2)
    # concating layers for one input to lstm
    concat_embedding_input = tf.keras.layers.Concatenate(name='concat_embedding_input')([embedding_layer1, embedding_layer2])
    
    rnn =LSTM(100,return_sequences=True)(concat_embedding_input)
    output = Dense(items_lenght, activation='softmax')(rnn)

    opt = tf.keras.optimizers.RMSprop(learning_rate=0.05)

    model = Model(inputs=[inp1, inp2], outputs=output)
    model.compile(optimizer=opt,loss='sparse_categorical_crossentropy')
    return model

### loading dataset

In [6]:
items=pd.read_csv(r'C:/Users/ahmed hatem/Downloads/archive (4)/item_recommendation.csv')

In [7]:
items=items.drop(['Unnamed: 0'],axis=1)

In [8]:
items

Unnamed: 0,visitorid,itemid,number of views
0,0,67045,1
1,0,285930,1
2,0,357564,1
3,1,72028,1
4,2,216305,2
...,...,...,...
2132122,1407575,121220,1
2132123,1407576,356208,1
2132124,1407577,427784,1
2132125,1407578,188736,1


### changing the data type into strings to be grouped

In [9]:
items=items.astype(str)

In [10]:
itemid_grouped=items.groupby('visitorid')['itemid'].apply(','.join).reset_index()

In [11]:
views_grouped=items.groupby('visitorid')['number of views'].apply(','.join).reset_index()

In [12]:
itemandviews_grouped=pd.merge(itemid_grouped,views_grouped,on='visitorid')

### the data contains the visitor id and all items viewed and the number of views of each item for that visitor

In [13]:
itemandviews_grouped

Unnamed: 0,visitorid,itemid,number of views
0,0,67045285930357564,111
1,1,72028,1
2,10,248766,1
3,100,36054,4
4,1000,248975,1
...,...,...,...
1404174,999995,109661,1
1404175,999996,2821951259182000371008,1221
1404176,999997,196715,1
1404177,999998,185989226373340010455860,1211


In [14]:
itemandviews_grouped['lenght'] = itemandviews_grouped['number of views'].str.len()

In [15]:
itemandviews_grouped=itemandviews_grouped[itemandviews_grouped.lenght>1]

In [16]:
itemandviews_grouped[itemandviews_grouped['number of views'].str.count(",").add(1).lt(10)] 

Unnamed: 0,visitorid,itemid,number of views,lenght
0,0,67045285930357564,111,5
5,10000,359491401285,43,3
8,1000001,67707141264202293230432424515,11111,9
10,1000003,150875228932,11,3
26,1000018,19366256393278214,111,5
...,...,...,...,...
1404164,999986,269229315140,13,3
1404169,999990,41968317016,11,3
1404170,999991,163350271890298056,112,5
1404175,999996,2821951259182000371008,1221,7


In [17]:
itemandviews_grouped=itemandviews_grouped[itemandviews_grouped["lenght"]>10]

In [18]:
itemandviews_grouped

Unnamed: 0,visitorid,itemid,number of views,lenght
109,1000093,119825172313192353199101286219304028346501,1125113,13
169,1000147,64253126126312899373472421062427744441901,3111131,13
187,1000163,2071037184136119223277358854418941440220,3111111,13
195,1000170,"134053,217869,218296,253256,273417,277105,3362...",11111111,15
229,1000200,4817953206283624391014393338404406,121112,11
...,...,...,...,...
1404035,999869,4777965391172048174724300433306816390465,1111121,13
1404045,999878,"25091,36127,98052,135986,165677,170857,171571,...",2111111211211111,31
1404120,999945,"3764,25800,47509,71798,119547,205201,290204,31...",1111111111111111,31
1404126,999951,23969747765698268795412351138,111122,11


In [19]:
itemandviews_grouped=itemandviews_grouped[:10000]

In [20]:
num_words = 1000000
oov_token = '<UNK>'
pad_type = 'pre'
trunc_type = 'pre'

### tokenizing the items

In [21]:
tokenizer_items = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer_items.fit_on_texts(itemandviews_grouped['itemid'])

In [22]:
len(tokenizer_items.word_index)

50113

In [23]:
items = tokenizer_items.texts_to_sequences(itemandviews_grouped['itemid'])

In [24]:
len(items)

10000

### adding sequences that have 5 items or more

In [25]:
li=[x for x in items if len(x)>=5]

### the model takes the frist 5 items as input (x)

In [26]:
li[0][:-2]

[22838, 22839, 22840, 9385, 22841]

### and last 5 items as output (y)

In [27]:
li[0][1:-1]

[22839, 22840, 9385, 22841, 13960]

In [28]:
len(li)

9999

### training on 7999 sample and last 2000 samples for testing

In [29]:
train_items_feature=[]
for i in range(len(li)-2000):
    train_items_feature.append(li[i][:][:-2])

In [30]:
train_items_target=[]
for i in range(len(li)-2000):
    train_items_target.append(li[i][:][1:-1])

In [31]:
test_items_feature=[]
for i in range(len(li)-2000,len(li)):
    test_items_feature.append(li[i][:][:-2])

In [32]:
test_items_target=[]
for i in range(len(li)-2000,len(li)):
    test_items_target.append(li[i][:][1:-1])

### tokenizing the views

In [33]:
tokenizer_views = Tokenizer(num_words=num_words, oov_token=oov_token)

In [34]:
tokenizer_views.fit_on_texts(itemandviews_grouped['number of views'])

In [35]:
len(tokenizer_views.word_index)

60

In [36]:
number_views = tokenizer_views.texts_to_sequences(itemandviews_grouped['number of views'])

In [37]:
views=[x for x in number_views if len(x)>=5]

In [38]:
len(views)

9999

In [39]:
train_number_views=[]
for i in range(len(views)-2000):
    train_number_views.append(views[i][:][:-2])

In [40]:
test_number_views=[]
for i in range(len(views)-2000,len(views)):
    test_number_views.append(views[i][:][:-2])

### padding the sequences to  maxlen

In [41]:
train_number_views = pad_sequences(train_number_views, padding=pad_type, truncating=trunc_type, maxlen=5)

In [42]:
train_items_feature=pad_sequences(train_items_feature, padding=pad_type, truncating=trunc_type, maxlen=5)

In [43]:
train_items_target=pad_sequences(train_items_target, padding=pad_type, truncating=trunc_type, maxlen=5)

In [44]:
test_number_views = pad_sequences(test_number_views, padding=pad_type, truncating=trunc_type, maxlen=5)

In [45]:
test_items_feature=pad_sequences(test_items_feature, padding=pad_type, truncating=trunc_type, maxlen=5)

In [46]:
test_items_target=pad_sequences(test_items_target, padding=pad_type, truncating=trunc_type, maxlen=5)

In [47]:
model=model(len(tokenizer_views.word_index),len(tokenizer_items.word_index),5)

In [48]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 itemid (InputLayer)            [(None, 5)]          0           []                               
                                                                                                  
 views (InputLayer)             [(None, 5)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 5, 100)       5011300     ['itemid[0][0]']                 
                                                                                                  
 embedding_1 (Embedding)        (None, 5, 100)       6000        ['views[0][0]']                  
                                                                                              

In [49]:
train_feat_dict = {'itemid': train_items_feature,'views': train_number_views}
train_target_tensor = train_items_target
train_dataset, train_steps_per_epoch = create_train_tfdata(train_feat_dict,train_target_tensor,batch_size=512)

### training the dataset

In [50]:
history = model.fit(train_dataset,steps_per_epoch=train_steps_per_epoch,epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [51]:
y_pred=model.predict(x={'itemid':test_items_feature,'views':test_number_views})

In [52]:
mapk(test_items_target, y_pred)

0.0701