# A content based approach
RecSys Challenge 2022 - Group 5

## General Data Preparation

Let us read in the different datasets.

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
base_path_train = "~/shared/data/project/training"

# Read the three training tables (item features, purchases, session views)
# from the training directory in a single unpacking step.
items_df, purchase_df, session_df = (
    pd.read_csv(os.path.join(base_path_train, csv_name))
    for csv_name in ("item_features.csv", "train_purchases.csv", "train_sessions.csv")
)

In [3]:
items_df

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [4]:
items_df.item_id.nunique()

23691

In [5]:
purchase_df

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114
...,...,...,...
999995,4439986,2915,2021-05-13 11:56:37.464
999996,4439990,8786,2020-08-22 14:28:22.382
999997,4439994,21630,2020-11-27 20:10:28.961
999998,4439999,16962,2020-11-27 11:01:41.356


In [6]:
session_df

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658
4743816,4440001,14155,2020-10-30 23:31:56.607
4743817,4440001,14303,2020-10-30 23:36:17.934
4743818,4440001,27852,2020-10-30 23:39:55.186


Now we combine the views inside a session and the purchases of this session in one dataframe, with the column `was_bought` indicating whether the item was only viewed or bought.

In [7]:
# Tag every row with its origin: purchases get True, plain views get False.
purchase_df_processed = purchase_df.assign(was_bought=True)
session_df_processed = session_df.assign(was_bought=False)

# Stack both tables and order the rows by session and then by date.
df_processed = pd.concat(
    [purchase_df_processed, session_df_processed]
).sort_values(["session_id", "date"])
df_processed

Unnamed: 0,session_id,item_id,date,was_bought
1,3,9655,2020-12-18 21:19:48.093,False
0,3,9655,2020-12-18 21:25:00.373,False
0,3,15085,2020-12-18 21:26:47.986,True
2,13,15654,2020-03-13 19:35:27.136,False
1,13,18626,2020-03-13 19:36:15.507,True
...,...,...,...,...
4743804,4440001,19539,2020-10-30 23:37:09.46,False
4743815,4440001,20409,2020-10-30 23:37:20.658,False
4743818,4440001,27852,2020-10-30 23:39:55.186,False
4743806,4440001,20449,2020-10-30 23:40:28.149,False


Now we denormalize the item features table, to have a more handy representation of the item features

In [8]:
# One row per item, one column per feature category.
# aggfunc='first' keeps an actual feature_value_id if an item has several rows
# for the same category; pivot_table's default aggregation (mean) would average
# the categorical ids into a value that identifies nothing.
items_processed_df = items_df.pivot_table(
    values='feature_value_id',
    index='item_id',
    columns='feature_category_id',
    aggfunc='first',
).reset_index()
items_processed_df.index.names = ['index']
# Derive the number of feature columns from the pivot result instead of
# hard-coding 73, so a different set of categories does not raise a length
# mismatch here.
# NOTE: the numeric suffix is the column position, not the feature_category_id
# (in the data shown above, category 68 ends up in item_feature_67).
items_processed_df.columns = ["item_id"] + [
    f"item_feature_{x}" for x in range(items_processed_df.shape[1] - 1)
]
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,,,,,,,394.0,,,...,,,,,351.0,885.0,,,75.0,
1,3,,,889.0,618.0,605.0,,452.0,,,...,,521.0,,,14.0,592.0,,,75.0,544.0
2,4,,,793.0,618.0,605.0,,837.0,,,...,,521.0,,,373.0,538.0,,,75.0,544.0
3,7,,,,,,,536.0,,,...,,,,,739.0,592.0,,,75.0,
4,8,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,351.0,592.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,106.0,805.0,,,75.0,544.0
23687,28140,,53.0,,,,,,,,...,80.0,,,349.0,351.0,,,226.0,,544.0
23688,28141,461.0,,889.0,719.0,605.0,,2.0,,,...,,,,,379.0,499.0,,,75.0,544.0
23689,28142,,,,,,,619.0,,,...,,610.0,,,895.0,740.0,,,75.0,91.0


The item features can now be merged into the combined dataset of session views and purchases from above.
Afterwards, the NULL values in the item feature table are filled with 0.

In [9]:
# Attach the pivoted feature columns to every view/purchase row and store the
# purchase flag as a float (0.0 / 1.0).
df_processed = (
    df_processed
    .merge(items_processed_df, how="left", on="item_id")
    .astype({"was_bought": float})
)
df_processed

Unnamed: 0,session_id,item_id,date,was_bought,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
0,3,9655,2020-12-18 21:19:48.093,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
1,3,9655,2020-12-18 21:25:00.373,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
2,3,15085,2020-12-18 21:26:47.986,1.0,,53.0,,,,,...,,,,349.0,97.0,,,,,544.0
3,13,15654,2020-03-13 19:35:27.136,0.0,,,,618.0,,766.0,...,,521.0,,,351.0,780.0,,,219.0,
4,13,18626,2020-03-13 19:36:15.507,1.0,,,793.0,618.0,605.0,,...,,,,,739.0,805.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5743815,4440001,19539,2020-10-30 23:37:09.46,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743816,4440001,20409,2020-10-30 23:37:20.658,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743817,4440001,27852,2020-10-30 23:39:55.186,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0
5743818,4440001,20449,2020-10-30 23:40:28.149,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0


In [10]:
# Replace missing feature values (NaN) with 0 so that every item has a value in
# every feature column.
# NOTE(review): assumes 0 is never a real feature_value_id — confirm.
items_processed_df = items_processed_df.fillna(0)
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0.0,0.0,0.0,0.0,0.0,0.0,394.0,0.0,0.0,...,0.0,0.0,0.0,0.0,351.0,885.0,0.0,0.0,75.0,0.0
1,3,0.0,0.0,889.0,618.0,605.0,0.0,452.0,0.0,0.0,...,0.0,521.0,0.0,0.0,14.0,592.0,0.0,0.0,75.0,544.0
2,4,0.0,0.0,793.0,618.0,605.0,0.0,837.0,0.0,0.0,...,0.0,521.0,0.0,0.0,373.0,538.0,0.0,0.0,75.0,544.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,536.0,0.0,0.0,...,0.0,0.0,0.0,0.0,739.0,592.0,0.0,0.0,75.0,0.0
4,8,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,351.0,592.0,0.0,0.0,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,106.0,805.0,0.0,0.0,75.0,544.0
23687,28140,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,0.0,349.0,351.0,0.0,0.0,226.0,0.0,544.0
23688,28141,461.0,0.0,889.0,719.0,605.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,379.0,499.0,0.0,0.0,75.0,544.0
23689,28142,0.0,0.0,0.0,0.0,0.0,0.0,619.0,0.0,0.0,...,0.0,610.0,0.0,0.0,895.0,740.0,0.0,0.0,75.0,91.0


In [11]:
# Lookup table from item_id to the index label of its row in items_processed_df.
item_id2index = {
    item_id: row_label
    for item_id, row_label in zip(items_processed_df.item_id, items_processed_df.index)
}

In [12]:
# Every item_id of the feature table as a plain Python list.
all_items = items_processed_df["item_id"].tolist()

In [13]:
# Feature matrix as a NumPy array: one row per item, item_id column removed.
items_processed_array = np.array(items_processed_df.drop(columns=["item_id"]))
# Show the feature row of item 2 as a quick check.
items_processed_array[item_id2index[2]]

array([  0.,   0.,   0.,   0.,   0.,   0., 394.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,  38.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0., 123.,   0.,   0.,   0., 802.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0., 123.,   0.,   0.,  76.,   0.,   0.,   6.,   0.,   0.,
       365.,   0.,   0.,   0.,   0., 462., 801.,   0.,   0.,   0.,   0.,
         0., 351., 885.,   0.,   0.,  75.,   0.])

In [14]:
items_processed_array[item_id2index[2]].shape

(73,)

Next we read in the candidate items.

In [15]:
# Item ids that are ranked by the recommender below, as a plain Python list.
candidate_items = pd.read_csv("candidate_items.csv")["item_id"].tolist()
candidate_items[:10]

[4, 8, 9, 19, 20, 26, 33, 40, 51, 54]

## Recommender using a content based approach

In this notebook, the goal will be to recommend the most similar items to the ones seen in the session.
For this, we first define our distance function between two items. The computed distance is simply the number of non-equal features (as the feature values are only categorical).

In [16]:
def item_dist(item_id1, item_id2):
    """Return the number of feature columns in which two items differ.

    Both ids are resolved through ``item_id2index`` to rows of
    ``items_processed_array``; a position counts as different when the
    element-wise difference of the two rows is non-zero.
    """
    first_idx, second_idx = item_id2index[item_id1], item_id2index[item_id2]
    delta = items_processed_array[first_idx] - items_processed_array[second_idx]
    return (delta != 0).sum()

In [17]:
item_dist(3,4)

12

In [18]:
(items_processed_array[item_id2index[3]] != items_processed_array[item_id2index[4]]).sum()

12

In [19]:
items_processed_array[item_id2index[4]]

array([  0.,   0., 793., 618., 605.,   0., 837.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0., 378., 289., 148.,   0.,   0., 881.,
         0.,   0.,   0., 268.,   0.,   0.,   0.,   0.,   0., 902.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       559., 825., 218.,   0.,   0., 317.,   0.,   0.,   0.,   0., 267.,
       365.,   0.,   0., 180.,   0., 462.,   0., 861.,   0., 521.,   0.,
         0., 373., 538.,   0.,   0.,  75., 544.])

Now we precompute the distances for each pair `(item, candidate_item)` and store them in a dictionary `item_dist_dict`.

In [20]:
def precompute_distances():
    """Precompute the feature distance for every (item, candidate_item) pair.

    Reads the notebook globals ``candidate_items``, ``all_items``,
    ``item_id2index`` and ``items_processed_array``.

    Returns:
        dict mapping ``(item_id, candidate_item_id)`` to the number of feature
        columns in which the two items differ (the same value ``item_dist``
        returns for that pair).

    Raises:
        KeyError: if an id in ``all_items`` or ``candidate_items`` is missing
            from ``item_id2index``.
    """
    # Feature rows of all items, gathered once in the order of ``all_items``.
    all_item_rows = items_processed_array[[item_id2index[item] for item in all_items]]

    item_dist_dict = {}
    for candidate_item in candidate_items:
        candidate_row = items_processed_array[item_id2index[candidate_item]]
        # One vectorised comparison per candidate replaces len(all_items)
        # separate Python-level item_dist calls.
        distances = np.sum((all_item_rows - candidate_row) != 0, axis=1)
        for item, distance in zip(all_items, distances):
            item_dist_dict[(item, candidate_item)] = distance
    # The dict holds len(all_items) * len(candidate_items) entries.
    return item_dist_dict

item_dist_dict = precompute_distances()

## Prediction on test data

Let us read in the test data set.

In [21]:
import pandas as pd
import os
import numpy as np

base_path_test = "~/shared/data/project/test"


# Sessions to predict for; the displayed columns match the training sessions table.
test_df = pd.read_csv(os.path.join(base_path_test, "test_sessions.csv"))

test_df

Unnamed: 0,session_id,item_id,date
0,126,9655,2020-12-18 21:25:00.373
1,126,9655,2020-12-18 21:19:48.093
2,3234,13214,2020-09-20 20:10:42.039
3,3234,13214,2020-09-20 20:11:53.966
4,3234,3173,2020-09-20 20:06:13.217
...,...,...,...
197619,186456690,10471,2021-05-02 17:41:18.737
197620,186456690,13385,2021-05-02 17:43:41.78
197621,186456690,10471,2021-05-02 17:40:10.625
197622,186456690,5382,2021-05-02 17:40:50.001


Now we define a function which recommends the top 100 candidate items for a given session.

To do so, for each candidate item and each item in the session, the distance is retrieved from the `item_dist_dict`. In the end, all those distances will be summed up for the session. Thus, if a session has `k` items, we retrieve for each candidate item `k` distances and take the sum of these `k` distances. This way, we get a distance between the session and each candidate item.

Finally we order the candidate items by ascending distance between session and candidate item. Thus, we will get the closest items to that session.

In [22]:
def compute_session_prediction_test(session_id, top_k=100):
    """Rank the candidate items for one test session by feature distance.

    For every candidate item, the precomputed distances to all items viewed in
    the session are summed (an item viewed several times is counted several
    times). Candidates are then ordered by ascending total distance.

    Reads the notebook globals ``test_df``, ``candidate_items`` and
    ``item_dist_dict``.

    Args:
        session_id: value of ``test_df.session_id`` identifying the session.
        top_k: number of closest candidates to keep; defaults to 100.

    Returns:
        DataFrame with the columns ``item_id``, ``rank`` (1 = closest) and
        ``session_id``, holding at most ``top_k`` rows.

    Raises:
        KeyError: if a (session item, candidate item) pair is missing from
            ``item_dist_dict``.
    """
    items_in_session = test_df.loc[test_df.session_id == session_id, "item_id"].tolist()

    # Total distance between the session and each candidate item.
    candidate_rank_dict = {
        candidate_item: sum(
            item_dist_dict[(item_in_session, candidate_item)]
            for item_in_session in items_in_session
        )
        for candidate_item in candidate_items
    }

    candidate_rank_df = pd.DataFrame(candidate_rank_dict.items(), columns=["item_id", "score"])
    candidate_rank_df = (
        candidate_rank_df.sort_values("score", ascending=True)
        .head(top_k)
        .reset_index(drop=True)
    )
    # After the reset the index is 0..n-1 in score order, so index + 1 is the rank.
    candidate_rank_df["rank"] = candidate_rank_df.index + 1
    candidate_rank_df["session_id"] = session_id
    return candidate_rank_df.drop("score", axis=1)

To finally perform our prediction on the test dataset, we iterate over all sessions and perform the ranking described above.

In [23]:
def predict_cb():
    """Compute the content-based ranking for every session in ``test_df``.

    Calls ``compute_session_prediction_test`` once per unique session id (in
    order of first appearance) and stacks the per-session rankings.

    Returns:
        DataFrame with the columns ``session_id``, ``item_id`` and ``rank``.
        The index of each per-session ranking is kept, so it restarts at 0 for
        every session. An empty frame with these columns is returned when
        ``test_df`` contains no sessions.
    """
    output_columns = ["session_id", "item_id", "rank"]
    session_ids = test_df.session_id.unique()
    if len(session_ids) == 0:
        return pd.DataFrame(columns=output_columns)

    # Collect all per-session frames first and concatenate once; concatenating
    # inside the loop re-copies the accumulated result on every iteration.
    per_session_rankings = [
        compute_session_prediction_test(session_id) for session_id in session_ids
    ]
    out_df = pd.concat(per_session_rankings)

    return out_df[output_columns]

In [24]:
from datetime import datetime

Let us run the prediction and write the results to a csv file.

In [25]:
# Print a timestamp before and after the full prediction run to show its
# duration (about 40 minutes in the recorded output below).
print(datetime.now())
out_df = predict_cb()
print(datetime.now())
out_df

2022-07-10 06:57:40.768887
2022-07-10 07:37:56.963874


Unnamed: 0,session_id,item_id,rank
0,126,27603,1
1,126,25643,2
2,126,903,3
3,126,9026,4
4,126,20741,5
...,...,...,...
95,186479748,5138,96
96,186479748,881,97
97,186479748,2210,98
98,186479748,4311,99


In [26]:
# index=False: the index restarts for every session and is not part of the result.
out_df.to_csv("results_content_based.csv", index=False)