In [4]:
# Import libraries
import os
import re
import time
import json
import random
import string
import cProfile
import itertools
import pyinstrument


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa


from tqdm import tqdm
from typing import Tuple, List
from joblib import Parallel, delayed
from itertools import combinations
from collections import Counter, defaultdict
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics import label_ranking_average_precision_score
from concurrent.futures import ThreadPoolExecutor


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Import data_processor script
from data_processor import load_and_process_data


# Housekeeping
task = 'task1'
output_path = '../../outputs/'
final_output_file = output_path + task + '_predictions.parquet'
prod_rec = 20


# Memory management; set to None for full dataframe
data_path = '../../data/'
products_slice = 80000
sessions_slice = 80000
test_slice = 80000


# Load and process data
products_train, sessions_train, sessions_test = load_and_process_data(data_path, products_slice, sessions_slice, test_slice, task)

In [6]:
products_train.shape, sessions_train.shape, sessions_test.shape

((80000, 11), (80000, 3), (80000, 2))

In [7]:
products_train.head(2)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,red dragon amberjack 3 steel tip 22 gramm wolf...,30.95,red dragon,unknown,unknown,rdd0089,unknown,unknown,amberjacks steel dartpfeile sind verf gbar in ...
1,B08PRYN6LD,DE,simply keto lower carb schokodrops ohne zucker...,17.9,simply keto,unknown,750 g 1er pack,unknown,unknown,unknown,nat rliche s sse durch erythrit wir stellen oh...


In [8]:
sessions_train.head(2)

Unnamed: 0,prev_items,next_item,locale
0,"B09W9FND7K,B09JSPLN1M",B09M7GY217,DE
1,"B076THCGSG,B007MO8IME,B08MF65MLV,B001B4TKA0",B001B4THSA,DE


In [9]:
sessions_test.head(2)

Unnamed: 0,prev_items,locale
0,"B08V12CT4C,B08V1KXBQD,B01BVG1XJS,B09VC5PKN5,B0...",DE
1,"B00R9R5ND6,B00R9RZ9ZS,B00R9RZ9ZS",DE


### Recommendations using Co-Occurrences
This recommendation system is based on product co-occurrence matrix. The matrix is created by analyzing the previous items purchased by customers in their sessions. The matrix is then used to recommend items that are frequently purchased together.

In [18]:
# Function to create co-occurrence matrix
def cooccurrence_matrix(df: pd.DataFrame) -> Tuple[csr_matrix, List[str]]:
    sessions = df['prev_items'].apply(lambda x: x.split(',')).tolist()
    product_to_index = {}
    index_to_product = []
    data, row, col = [], [], []

    sorted_pairs = [tuple(sorted(pair)) for session in sessions for pair in itertools.combinations(session, 2)]

    for pair in sorted_pairs:
        for idx, product in enumerate(pair):
            if product not in product_to_index:
                product_to_index[product] = len(index_to_product)
                index_to_product.append(product)
            (row if idx == 0 else col).append(product_to_index[product])
        data.append(1)

    cooccurrence_sparse = coo_matrix((data, (row, col)), shape=(len(index_to_product), len(index_to_product))).tocsr()
    return cooccurrence_sparse, index_to_product


# Function to save recommendations
def reco_saver(recommendation_function, cooccurrence_sparse: csr_matrix, index_to_product: List[str], prod_rec: int, output_path: str) -> None:
    recos = {product_id: recommendation_function(product_id, cooccurrence_sparse, index_to_product, top_n=prod_rec).to_dict(orient='records')
             for product_id in tqdm(index_to_product, desc=f'Saving recommendations for {len(index_to_product)} products')}
    
    recos_df = pd.DataFrame([(key, rec['related_product'], rec['score']) for key, records in recos.items() for rec in records], columns=['product_id', 'related_product', 'score'])
    recos_df = recos_df.groupby('product_id').apply(lambda x: x.nlargest(prod_rec, 'score')['related_product'].tolist()).reset_index(name='next_item')
    
    recos_df.to_parquet(output_path)


# Function to calculate MRR
def mrr_from_parquet(recommendation_parquet_file: str, session_test: pd.DataFrame, prod_rec: int) -> float:
    reciprocal_ranks = []

    recos_df = pd.read_parquet(recommendation_parquet_file, engine='pyarrow')
    recos_dict = recos_df.set_index('product_id')['next_item'].to_dict()

    for _, (prev_items, ) in session_test[['prev_items']].iterrows():
        session_products = prev_items.split(',')
        ground_truth_product = session_products[-2] if len(session_products) > 1 else None
        last_product = session_products[-1]

        if ground_truth_product and last_product in recos_dict:
            recommendations = list(recos_dict[last_product])
            if ground_truth_product in recommendations:
                rank = recommendations.index(ground_truth_product)
                reciprocal_rank = 1 / (rank + 1)
            else:
                reciprocal_rank = 0
            reciprocal_ranks.append(reciprocal_rank)

    mean_reciprocal_rank = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0
    return mean_reciprocal_rank

##### Models
We define our model here.

In [19]:
# Recommendation Model 1: this one is based on the co-occurrence matrix
def model1(product_id: str, cooccurrence_sparse: csr_matrix, index_to_product: List[str], top_n: int = 10) -> pd.DataFrame:
    product_to_index = {product: index for index, product in enumerate(index_to_product)}
    
    if product_id not in product_to_index:
        return pd.DataFrame(columns=['related_product', 'score'])

    product_index = product_to_index[product_id]
    product_scores = cooccurrence_sparse[product_index]
    top_indices = np.argsort(-product_scores.toarray().flatten())[:top_n+1]

    recommendations = pd.DataFrame({
        'related_product': [index_to_product[i] for i in top_indices],
        'score': product_scores[0, top_indices].toarray().flatten()
    })

    recommendations = recommendations[recommendations['related_product'] != product_id].head(top_n)
    
    return recommendations

##### Implementation

In [20]:
pre_time = time.time()
# Create co-occurrence matrix
train_cooccurrence, index_to_product = cooccurrence_matrix(sessions_train)
post_time = time.time()
print(f'Creating co-occurrence matrix took: {post_time - pre_time:.4f} seconds')


# Get recommendations for a single product for a specified model
prod_id = 'B08QYYBTMC'
model = model1
pre_time = time.time()
recommendations = model(prod_id, train_cooccurrence, index_to_product, top_n=prod_rec)
post_time = time.time()
print(f'Getting recommendations took: {post_time - pre_time:.4f} seconds')
print(f'Top {prod_rec} recommendations for {prod_id} using {model.__name__}:')
recommendations

Creating co-occurrence matrix took: 1.1673 seconds
Getting recommendations took: 0.0205 seconds
Top 20 recommendations for B08QYYBTMC using model1:


Unnamed: 0,related_product,score
0,B0B3DKVCC6,11
1,B0BHHZ9LPT,8
2,B08V1KXBQD,7
3,B08V12CT4C,6
4,B099NS1XPG,5
6,B095C1CHMQ,5
7,B09V7KG931,4
8,B09NKFSLGB,4
9,B091FGXDSX,3
10,B09P3K5778,3


In [21]:
# Save recommendations to parquet file
pre_time = time.time()
reco_saver(model, train_cooccurrence, index_to_product, prod_rec, final_output_file)
post_time = time.time()
print(f'Saving recommendations took: {post_time - pre_time:.4f} seconds')

Saving recommendations for 153999 products: 100%|██████████| 153999/153999 [52:01<00:00, 49.33it/s]


Saving recommendations took: 3204.0224 seconds


In [22]:
# Calculate MRR
pre_time = time.time()
mrr = mrr_from_parquet(final_output_file, sessions_test, prod_rec)
post_time = time.time()
print(f'MRR ({model.__name__}): {mrr}')
print(f'Calculating MRR took: {post_time - pre_time:.4f} seconds')

MRR (model1): 0.11724375545111904
Calculating MRR took: 2.9466 seconds
