# An LSTM based approach, enriched with contents
RecSys Challenge 2022 - Group 5

## General Data Preparation

Let us read in the different datasets.

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Directory that holds the training CSV files.
base_path_train = "~/shared/data/project/training"

# Read the three training tables in order: item features, purchases, session views.
items_df, purchase_df, session_df = (
    pd.read_csv(os.path.join(base_path_train, csv_name))
    for csv_name in ("item_features.csv", "train_purchases.csv", "train_sessions.csv")
)

In [3]:
items_df

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [4]:
items_df.item_id.nunique()

23691

In [5]:
purchase_df

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114
...,...,...,...
999995,4439986,2915,2021-05-13 11:56:37.464
999996,4439990,8786,2020-08-22 14:28:22.382
999997,4439994,21630,2020-11-27 20:10:28.961
999998,4439999,16962,2020-11-27 11:01:41.356


In [6]:
session_df

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658
4743816,4440001,14155,2020-10-30 23:31:56.607
4743817,4440001,14303,2020-10-30 23:36:17.934
4743818,4440001,27852,2020-10-30 23:39:55.186


Now we combine the views inside a session and the purchases of this session in one dataframe, with the column `was_bought` indicating whether the item was only viewed or bought.

In [7]:
# Tag every row with its origin: purchases get was_bought=True, views get False.
purchase_df_processed = purchase_df.assign(was_bought=True)
session_df_processed = session_df.assign(was_bought=False)

# Stack both tables and order the events by session and then by the `date` column.
df_processed = pd.concat(
    [purchase_df_processed, session_df_processed]
).sort_values(by=["session_id", "date"])
df_processed

Unnamed: 0,session_id,item_id,date,was_bought
1,3,9655,2020-12-18 21:19:48.093,False
0,3,9655,2020-12-18 21:25:00.373,False
0,3,15085,2020-12-18 21:26:47.986,True
2,13,15654,2020-03-13 19:35:27.136,False
1,13,18626,2020-03-13 19:36:15.507,True
...,...,...,...,...
4743804,4440001,19539,2020-10-30 23:37:09.46,False
4743815,4440001,20409,2020-10-30 23:37:20.658,False
4743818,4440001,27852,2020-10-30 23:39:55.186,False
4743806,4440001,20449,2020-10-30 23:40:28.149,False


Now we denormalize the item features table, to have a more handy representation of the item features

In [8]:
# Pivot the long (item, category, value) table into one row per item and one
# column per feature category; categories an item lacks become NaN.
# NOTE(review): pivot_table aggregates with its default function if an item has
# several values for the same category — confirm such duplicates do not occur.
items_processed_df = items_df.pivot_table(values='feature_value_id', index='item_id', columns='feature_category_id').reset_index()
items_processed_df.index.names = ['index']
# Name the feature columns by position (0-based, not by feature_category_id).
# The count is taken from the pivot result instead of being hard-coded, so a
# different number of feature categories does not break the assignment.
items_processed_df.columns = ["item_id"] + [f"item_feature_{x}" for x in range(items_processed_df.shape[1] - 1)]
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,,,,,,,394.0,,,...,,,,,351.0,885.0,,,75.0,
1,3,,,889.0,618.0,605.0,,452.0,,,...,,521.0,,,14.0,592.0,,,75.0,544.0
2,4,,,793.0,618.0,605.0,,837.0,,,...,,521.0,,,373.0,538.0,,,75.0,544.0
3,7,,,,,,,536.0,,,...,,,,,739.0,592.0,,,75.0,
4,8,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,351.0,592.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,106.0,805.0,,,75.0,544.0
23687,28140,,53.0,,,,,,,,...,80.0,,,349.0,351.0,,,226.0,,544.0
23688,28141,461.0,,889.0,719.0,605.0,,2.0,,,...,,,,,379.0,499.0,,,75.0,544.0
23689,28142,,,,,,,619.0,,,...,,610.0,,,895.0,740.0,,,75.0,91.0


The item features can now be merged into the combined dataset of session views and purchases from above.
Afterwards, the missing (NULL) feature values in the item feature table are filled with 0.

In [9]:
# Left-join the wide item features onto every view/purchase row, then store the
# purchase flag as a float (0.0 for views, 1.0 for purchases).
df_processed = pd.merge(df_processed, items_processed_df, how="left", on="item_id")
df_processed["was_bought"] = df_processed.was_bought.astype("float64")
df_processed

Unnamed: 0,session_id,item_id,date,was_bought,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
0,3,9655,2020-12-18 21:19:48.093,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
1,3,9655,2020-12-18 21:25:00.373,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
2,3,15085,2020-12-18 21:26:47.986,1.0,,53.0,,,,,...,,,,349.0,97.0,,,,,544.0
3,13,15654,2020-03-13 19:35:27.136,0.0,,,,618.0,,766.0,...,,521.0,,,351.0,780.0,,,219.0,
4,13,18626,2020-03-13 19:36:15.507,1.0,,,793.0,618.0,605.0,,...,,,,,739.0,805.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5743815,4440001,19539,2020-10-30 23:37:09.46,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743816,4440001,20409,2020-10-30 23:37:20.658,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743817,4440001,27852,2020-10-30 23:39:55.186,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0
5743818,4440001,20449,2020-10-30 23:40:28.149,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0


In [10]:
# Replace every missing feature value in the item table with 0.
items_processed_df = items_processed_df.fillna(value=0)
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0.0,0.0,0.0,0.0,0.0,0.0,394.0,0.0,0.0,...,0.0,0.0,0.0,0.0,351.0,885.0,0.0,0.0,75.0,0.0
1,3,0.0,0.0,889.0,618.0,605.0,0.0,452.0,0.0,0.0,...,0.0,521.0,0.0,0.0,14.0,592.0,0.0,0.0,75.0,544.0
2,4,0.0,0.0,793.0,618.0,605.0,0.0,837.0,0.0,0.0,...,0.0,521.0,0.0,0.0,373.0,538.0,0.0,0.0,75.0,544.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,536.0,0.0,0.0,...,0.0,0.0,0.0,0.0,739.0,592.0,0.0,0.0,75.0,0.0
4,8,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,351.0,592.0,0.0,0.0,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,106.0,805.0,0.0,0.0,75.0,544.0
23687,28140,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,0.0,349.0,351.0,0.0,0.0,226.0,0.0,544.0
23688,28141,461.0,0.0,889.0,719.0,605.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,379.0,499.0,0.0,0.0,75.0,544.0
23689,28142,0.0,0.0,0.0,0.0,0.0,0.0,619.0,0.0,0.0,...,0.0,610.0,0.0,0.0,895.0,740.0,0.0,0.0,75.0,91.0


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Encode the feature columns only; item_id is an identifier, not a feature.
to_encode = items_processed_df.drop(columns=["item_id"])

# Fit the encoder on the feature columns and transform them in a second step.
encoded = OneHotEncoder().fit(to_encode).transform(to_encode)
# Print the shape before and after encoding, one per line.
print(to_encode.shape, encoded.shape, sep="\n")

(23691, 73)
(23691, 1399)


Above, the contents of the items were one-hot encoded. Next, we convert this encoding to a dense array, which will be used later for our embedding.

In [12]:
# Densify the sparse one-hot matrix. toarray() already returns a NumPy array,
# so no extra np.array(...) copy of the large dense matrix is made.
items_encoded = encoded.toarray()
items_encoded

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])

In [13]:
# Lookup from item id to the row label of that item in items_processed_df.
item_id2index = {
    item_id: row_label
    for item_id, row_label in zip(items_processed_df.item_id, items_processed_df.index)
}

In [14]:
# Every item id that has a row in the item feature table.
all_items = items_processed_df["item_id"].tolist()

In [15]:
# Raw (not one-hot encoded) feature values as a plain array, one row per item.
items_processed_array = np.array(items_processed_df.drop(columns=["item_id"]))
# Show the feature row of item id 2 as a spot check.
items_processed_array[item_id2index[2]]

array([  0.,   0.,   0.,   0.,   0.,   0., 394.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,  38.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0., 123.,   0.,   0.,   0., 802.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0., 123.,   0.,   0.,  76.,   0.,   0.,   6.,   0.,   0.,
       365.,   0.,   0.,   0.,   0., 462., 801.,   0.,   0.,   0.,   0.,
         0., 351., 885.,   0.,   0.,  75.,   0.])

Next we read in the candidate items.

In [16]:
# Item ids listed in the `item_id` column of candidate_items.csv.
candidate_items = pd.read_csv("candidate_items.csv")["item_id"].tolist()
candidate_items[:10]

[4, 8, 9, 19, 20, 26, 33, 40, 51, 54]

## LSTM specific Data Preparation

### Item sequence preparation

Similarly to predicting the next word of a sentence, we can predict the "next item" in a session. For this, we see the viewed (but not purchased) items as a given sequence of input words, and the purchased items as output words. Now we want to transform the data into a form reflecting this idea.

First, let us prepare the training data for the LSTM. For this, we select a subset of the training dataset, so that RAM is not exceeded.

In [17]:
# Ids of the sessions whose purchased item is one of the candidate items.
is_purchase = df_processed.was_bought == 1
is_candidate_item = df_processed.item_id.isin(candidate_items)
session_ids_cand = df_processed.loc[is_purchase & is_candidate_item, "session_id"]

In [18]:
# Training subset: sessions that ended in a candidate purchase and have a
# session id below 50000, ordered by session and date, reduced to three columns.
keep_rows = df_processed.session_id.isin(session_ids_cand) & (df_processed.session_id < 50000)
df_w2v = (
    df_processed.loc[keep_rows, :]
    .sort_values(["session_id", "date"])
    .loc[:, ["session_id", "item_id", "was_bought"]]
)
df_w2v

Unnamed: 0,session_id,item_id,was_bought
3,13,15654,0.0
4,13,18626,1.0
42,31,25972,0.0
43,31,16289,0.0
44,31,2069,0.0
...,...,...,...
64212,49975,2072,0.0
64213,49975,2072,0.0
64214,49975,23565,1.0
64224,49993,19048,0.0


In the next step, we map the item IDs to a consecutive range of integers (without gaps).

In [19]:
# Sorted distinct item ids of the training subset.
item_list = list(np.sort(df_w2v.item_id.unique()))
# Dense index of every item id: its position in item_list, starting at 0.
# NOTE(review): index 0 is therefore a real item, while the sequences built
# later in this notebook are also padded with 0 — confirm this overlap is intended.
item_dict = {item_id: position for position, item_id in enumerate(item_list)}
item_dict[4]

1

In [20]:
# Inverse lookup: dense index -> original item id.
item_dict_rev = dict(zip(item_dict.values(), item_dict.keys()))

In the next step, we extract the viewed items (corresponding to the previous words) and purchased items (corresponding to the next words).

In [21]:
def get_values(x):
    """Return the dense item indices of one session's rows, in row order.

    x is the sub-frame of a single session as passed by groupby(...).apply.
    """
    return [item_dict[item_id] for item_id in x["item_id"].values.ravel()]


def get_first_value(x):
    """Return the dense item index of the first row of one session's sub-frame."""
    return item_dict[x["item_id"].values.ravel()[0]]


# All items of every session (views and purchase together).
all_words = df_w2v.groupby('session_id').apply(get_values).to_list()

# Viewed items per session (model input) and purchased item per session (target),
# both indexed by session_id.
viewed_by_session = df_w2v[(df_w2v.was_bought==0)].groupby('session_id').apply(get_values)
bought_by_session = df_w2v[(df_w2v.was_bought==1)].groupby('session_id').apply(get_first_value)

# Keep only sessions present in both series, so that prev_words[i] and
# next_words[i] always belong to the same session even if a session has a
# purchase but no views (or the other way round).
shared_sessions = viewed_by_session.index.intersection(bought_by_session.index)
prev_words = viewed_by_session.loc[shared_sessions].to_list()
next_words = bought_by_session.loc[shared_sessions].to_list()

In [22]:
prev_words[:5]

[[3943],
 [6554, 4100, 519, 519, 6688, 63, 6983, 1105],
 [5219, 5698, 2622, 4410],
 [1614, 5014, 540, 3968, 5381, 4338],
 [5002, 2634, 4018, 5002, 2634, 4187, 5002, 4187]]

In [23]:
next_words[:5]

[4714, 2148, 2993, 5360, 6556]

In [24]:
# Dense indices of the distinct items, in order of first appearance in df_w2v.
unique_words = list(map(item_dict.__getitem__, df_w2v["item_id"].unique()))
# Position of every dense index inside unique_words.
unique_word_index = {word: position for position, word in enumerate(unique_words)}
len(unique_words)

7130

Sessions usually have different lengths. Thus we take the length of the longest session as the standard session length and pad the shorter sessions with zeros at the beginning, as follows:

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Length of the longest view sequence; every shorter sequence is padded on the
# left ('pre') up to this length.
max_seq_length = max(map(len, prev_words))
input_seqs = np.array(pad_sequences(prev_words, maxlen=max_seq_length, padding='pre'))

print(max_seq_length)
print(input_seqs[:5])

90
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 3943]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0 6554 4100
   519  519 6688   63 6983 1105]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 

### Embedding Matrix Preparation with Contents

To represent each item by a feature vector, we use the one-hot encoded item contents as an embedding.

In [27]:
# Row i holds the one-hot content vector of item_list[i]. The width follows the
# encoder output instead of a hard-coded constant.
embeddings_matrix = np.zeros((len(item_list), items_encoded.shape[1]))

for i, item_id in enumerate(item_list):
    # item_id2index maps an item id to its row in items_processed_df, which is
    # also its row in items_encoded; the dict lookup replaces a full-table scan
    # per item.
    idx = item_id2index.get(item_id)
    # Items without any feature row keep an all-zero embedding.
    if idx is not None:
        embeddings_matrix[i] = items_encoded[idx]

In [28]:
embeddings_matrix

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [29]:
embeddings_matrix.shape

(7130, 1399)

## Defining and training the LSTM

Let us define our LSTM model.

In [30]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD

In [31]:
# Frozen content embedding -> LSTM -> dropout -> dense -> softmax over all
# items of the training subset.
# NOTE(review): the Embedding layer does not set mask_zero, so left-padding
# zeros are fed to the LSTM like a regular item index — confirm this is intended.
model = tf.keras.Sequential(
    [tf.keras.layers.Embedding(input_dim = len(item_list), output_dim=embeddings_matrix.shape[1], weights=[embeddings_matrix], input_length=max_seq_length, trainable=False),
     tf.keras.layers.LSTM(256),
     tf.keras.layers.Dropout(0.2),
     tf.keras.layers.Dense(128, activation='relu'),
     tf.keras.layers.Dense(len(item_list) , activation='softmax')
    ])

# `learning_rate` replaces the deprecated `lr` keyword; the value is unchanged.
# NOTE(review): 1e-6 is a very small step size for plain SGD — confirm it is intended.
opt = SGD(learning_rate=10**(-6))
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

  super(SGD, self).__init__(name, **kwargs)


Also, we convert our output (i.e. the purchased items) to a categorical object.

In [32]:
# One class per item of the training subset.
n_items = len(item_list)
# One-hot encode the purchased item index of every session.
next_words = np.asarray(next_words)
outputs = tf.keras.utils.to_categorical(next_words, num_classes=n_items)
outputs.shape

(5042, 7130)

Now, we train our model.

In [33]:
# Make tf.function-decorated code run eagerly instead of as a traced graph.
# NOTE(review): this switch is usually a debugging aid and may slow down training — confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [34]:
# Train for 10 epochs in batches of 256, holding out 20% of the samples for validation.
history = model.fit(
    x=input_seqs,
    y=outputs,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Prediction on test data

Let us read in the test data set.

In [35]:
import pandas as pd
import os
import numpy as np

# Directory that holds the test CSV file.
base_path_test = "~/shared/data/project/test"

# Views of the sessions whose purchase has to be predicted.
test_sessions_path = os.path.join(base_path_test, "test_sessions.csv")
test_df = pd.read_csv(test_sessions_path)

test_df

Unnamed: 0,session_id,item_id,date
0,126,9655,2020-12-18 21:25:00.373
1,126,9655,2020-12-18 21:19:48.093
2,3234,13214,2020-09-20 20:10:42.039
3,3234,13214,2020-09-20 20:11:53.966
4,3234,3173,2020-09-20 20:06:13.217
...,...,...,...
197619,186456690,10471,2021-05-02 17:41:18.737
197620,186456690,13385,2021-05-02 17:43:41.78
197621,186456690,10471,2021-05-02 17:40:10.625
197622,186456690,5382,2021-05-02 17:40:50.001


In order to not exceed the RAM, we split the test dataframe into sub-dataframes, which will be predicted batch by batch.

In [36]:
# Width of the session-id range covered by one prediction chunk.
session_id_chunk_size = 10000000
# Chunk boundaries 0, 10000000, 20000000, ... The "+ 1" guarantees that the last
# boundary is strictly greater than the largest session id, so that session is
# not dropped when its id is an exact multiple of the chunk size (the chunks
# built from these limits use an exclusive upper bound).
session_id_limits = list(range(0, test_df.session_id.max() + session_id_chunk_size + 1, session_id_chunk_size))

In [37]:
# One sub-frame per pair of consecutive limits: lower bound inclusive, upper
# bound exclusive.
test_dfs = [
    test_df[(test_df.session_id >= lower) & (test_df.session_id < upper)]
    for lower, upper in zip(session_id_limits[:-1], session_id_limits[1:])
]

In [38]:
len(test_dfs)

19

In [39]:
def get_values(x):
    """Map the item ids of one session to dense indices, skipping unknown items.

    x is the sub-frame of a single session. Item ids that are not keys of
    item_dict are left out, so the result can be shorter than the session
    (or empty).
    """
    known_indices = []
    for item_id in x["item_id"].values.ravel():
        if item_id in item_dict:
            known_indices.append(item_dict[item_id])
    return known_indices

In [40]:
# Dense indices of the candidate items that occur in the training subset;
# candidates without an entry in item_dict are skipped.
candidate_items_keys = [
    item_dict[item_id] for item_id in candidate_items if item_id in item_dict
]
candidate_items_keys[:4]

[1, 2, 4, 5]

Now we predict using our LSTM model. For this we need to prepare the data in the same fashion as above.
From the result, we extract the 100 items with the largest scores.

In [41]:
# One DataFrame of ranked recommendations per test chunk.
candidate_rank_dfs = []
for chunk_no, test_df_small in enumerate(test_dfs):
    print(chunk_no)

    # A session-id range without any test session has nothing to predict.
    if test_df_small.empty:
        continue

    # data preparation: order the views by session and date, exactly like the
    # training sequences, before mapping each session to its index sequence
    sessions_test = (
        test_df_small.sort_values(["session_id", "date"])
        .groupby('session_id')
        .apply(get_values)
    )
    # Take the session ids from the grouped result itself, so that row i of the
    # predictions always belongs to session_ids_test[i].
    session_ids_test = sessions_test.index.tolist()
    input_seqs_test = np.array(pad_sequences(sessions_test.to_list(), maxlen=max_seq_length, padding='pre'))

    # lstm prediction: one score per item class for every session
    preds = model.predict(input_seqs_test)

    # class indices of every session, ordered from highest to lowest score
    ranking = preds.argsort(axis=1)[:, ::-1]
    # retrieving only the candidate items
    is_candidate = np.isin(ranking, candidate_items_keys)

    # creating the ranks: the 100 best-scored candidates, rank 1 = best
    chunk_session_ids, chunk_item_ids, chunk_ranks = [], [], []
    for session_id, ranked_keys, keep in zip(session_ids_test, ranking, is_candidate):
        top_keys = ranked_keys[keep][:100].tolist()
        chunk_session_ids.extend([session_id] * len(top_keys))
        # dense index -> original item id
        chunk_item_ids.extend(item_dict_rev[key] for key in top_keys)
        chunk_ranks.extend(range(1, len(top_keys) + 1))

    candidate_rank_dfs.append(
        pd.DataFrame({"session_id": chunk_session_ids, "item_id": chunk_item_ids, "rank": chunk_ranks})
    )

candidate_rank_df = pd.concat(candidate_rank_dfs)
candidate_rank_df = candidate_rank_df[["session_id", "item_id", "rank"]].reset_index(drop=True)
candidate_rank_df

0




1




2




3




4




5




6




7




8




9




10




11




12




13




14




15




16




17




18




Unnamed: 0,session_id,item_id,rank
0,126,14447,1
1,126,25972,2
2,126,5429,3
3,126,16767,4
4,126,27409,5
...,...,...,...
4999995,186479748,11649,96
4999996,186479748,17935,97
4999997,186479748,16441,98
4999998,186479748,7081,99


In [42]:
candidate_rank_df

Unnamed: 0,session_id,item_id,rank
0,126,14447,1
1,126,25972,2
2,126,5429,3
3,126,16767,4
4,126,27409,5
...,...,...,...
4999995,186479748,11649,96
4999996,186479748,17935,97
4999997,186479748,16441,98
4999998,186479748,7081,99


Finally, we write the result to a csv file.

In [43]:
# Write the ranked recommendations (session_id, item_id, rank) to disk without the DataFrame index.
candidate_rank_df.to_csv("results_lstm_with_contents.csv", index=False)

## Final checks

Number of predicted sessions.

In [44]:
len(candidate_rank_df.session_id.unique())

50000

Number of recommendations of non-candidate items. Should be 0.

In [45]:
(~candidate_rank_df.item_id.isin(candidate_items)).sum()

0

Number of sessions to predict.

In [46]:
len(test_df.session_id.unique())

50000

List of items ranked first at least once.

In [47]:
candidate_rank_df[candidate_rank_df["rank"] == 1].item_id.unique()

array([14447, 25972,  5429, 15665, 25650, 25154, 16767, 12993, 14130,
       15533,  7625, 23626, 24229,  1799,  8829, 26786, 19735, 27952,
       18832,  1916, 28058, 26902,  2814, 23946,  5138,  1219,  3955])