In [None]:
# Import python libraries
import os
import time
import importlib

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from collections import defaultdict
from IPython.display import clear_output
from typing import Any, Dict, List, Set, Tuple
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Housekeeping ==> THIS GENERALLY STAYS THE SAME
# SEED is forwarded to handle_data; NUM_RECOMMENDATIONS is the top-n / k cut-off passed to
# the model, view_recs, calculate_mrr and generate_recommendations further down.
SEED, NUM_RECOMMENDATIONS = 183, 100
# Task name (used to build the test-session and prediction file names) and its locales.
task, task1_locales = 'task1', ['DE', 'JP', 'UK']
# NOTE(review): NUM_CORES / NUM_THREADS are not referenced by any cell visible in this notebook.
NUM_CORES, NUM_THREADS = 8, 16
# Relative paths: they resolve against the working directory the notebook is run from.
data_path, output_path = '../../data/', '../../outputs/'
train_path, test_path = data_path + 'train/', data_path + 'test/'
# Destination of the final predictions written by save_recs_to_pqt.
output_file = output_path + task + '_predictions.parquet'
# File names handed to handle_data together with [train_path, train_path, test_path].
project_files = ['products_train.csv', 'sessions_train.csv', f'sessions_test_{task}.csv']
# Column dtypes passed to handle_data (presumably applied when reading the products CSV - confirm).
prod_dtypes = {
    'id': 'object',
    'locale': 'object',
    'title': 'object',
    'price': 'float64',
    'brand': 'object',
    'color': 'object',
    'size': 'object',
    'model': 'object',
    'material': 'object',
    'author': 'object',
    'desc': 'object'
}
# Column dtypes passed to handle_data (presumably applied to the session CSVs - confirm).
sess_dtypes = {
    'session_id': 'int32'
}

In [None]:
# Import project modules
import data_processor
importlib.reload(data_processor)
from data_processor import handle_data

import data_utils
importlib.reload(data_utils)
from data_utils import split_locales, split_locales_and_save, split_sessions, analyze_sessions, view_common_products

import misc_utils
importlib.reload(misc_utils)
from misc_utils import clear_memory

In [None]:
# LOAD DATA HERE ==> THIS GENERALLY STAYS THE SAME
# handle_data comes from the project-local data_processor module; it returns the product
# table, the training sessions and the test sessions for the chosen task.
# NOTE(review): the meaning of the literal `1` is not visible here (presumably a sampling
# fraction) - confirm against data_processor.handle_data.
products_train, sessions_train, sessions_test = handle_data(
    project_files, [train_path, train_path, test_path],
    task, task1_locales, 1, SEED, prod_dtypes, sess_dtypes
)

In [None]:
# Sanity check: shapes of the loaded product and training-session tables.
products_train.shape, sessions_train.shape

In [None]:
# Split datasets by locale
# split_locales is a project helper from data_utils; the unpacking below assumes it returns
# one table per entry of task1_locales, in the same order (DE, JP, UK) - confirm.
p_train_de, p_train_jp, p_train_uk = split_locales(products_train, task1_locales)
s_train_de, s_train_jp, s_train_uk = split_locales(sessions_train, task1_locales)

In [None]:
# Shapes of the DE product and session tables before slicing.
p_train_de.shape, s_train_de.shape

In [None]:
# Slice dataframes to get the first n rows
# Keeps only the first `slices` rows of the DE products and DE sessions (no shuffling),
# which keeps the later session split and evaluation small.
slices = 1000
p_train_de, s_train_de = p_train_de[:slices], s_train_de[:slices]

In [None]:
# Shapes of the DE tables after slicing (at most `slices` rows each).
p_train_de.shape, s_train_de.shape

In [None]:
# Clean up memory
# clear_memory is a project helper from misc_utils; presumably it drops every global that is
# not listed in keep_vars - confirm.
# NOTE(review): the Model 11 training cell further down still reads `products_train` and
# `sessions_train`, which are NOT in keep_vars. If clear_memory removes them, that cell fails
# on Restart & Run All.
keep_vars = [p_train_de, s_train_de, sessions_test]
clear_memory(keep_vars, globals_dict=globals())

In [None]:
# Split sessions
# Fractions handed to split_sessions for the validation and test parts.
size_val = 0.15
size_test = 0.15
# NOTE: s_train_de is rebound to the train part, so re-running this cell splits the already
# reduced train set again (the cell is not idempotent).
s_train_de, s_val_de, s_test_de = split_sessions(s_train_de, size_val, size_test)

In [None]:
# Shapes of the train / validation / test session splits.
s_train_de.shape, s_val_de.shape, s_test_de.shape

## Item-based Collaborative Filtering
Item-based collaborative filtering is a recommendation technique that uses the similarities between items to recommend similar products to users. In our case, item-item similarity is computed from product features such as title, description, brand, or price. The intuition is that if a user engaged with a particular item, they are more likely to engage with similar items.

<img src='../../img/cf.png' width=600>

**Model 1**

This is an item-based collaborative filtering model that combines product features to create a content-based representation of each product. The model computes item-item similarity using the cosine similarity of the TF-IDF vectors of the combined features. The resulting similarity matrix is then used to create a dictionary of item similarities, which can be used to recommend similar items to a given item.

In [None]:
# # MODEL 1
# def item_based_cf_model_1(
#     products_train: dd.DataFrame, 
#     sessions_train: dd.DataFrame, 
#     hyperparams: Dict[str, Any], 
#     comb_features: List[str]
# ) -> Dict[str, Dict[str, float]]:

#     # Combine features from comb_features
#     def combine_features(row: pd.Series, comb_features: List[str]) -> str:
#         combined_features = ""
#         for feature in comb_features:
#             if feature in row and not pd.isnull(row[feature]):
#                 combined_features += " " + row[feature]
#         return combined_features.strip()

#     # Compute item features
#     products_train["combined_features"] = products_train.apply(combine_features, axis=1, meta=('combined_features', 'object'), comb_features=comb_features)
#     item_features = products_train[["id", "combined_features"]].compute()

#     # Compute item-item similarity matrix using the TF-IDF vectorizer
#     vectorizer = TfidfVectorizer(
#         min_df=hyperparams.get("min_df", 2),
#         max_df=hyperparams.get("max_df", 0.8),
#         ngram_range=hyperparams.get("ngram_range", (1, 3))
#     )
#     combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
#     similarity_matrix = cosine_similarity(combined_features_vectors)

#     # Create item-item similarity dictionary
#     item_similarity_dict = defaultdict(dict)
#     for i in range(len(item_features)):
#         for j in range(len(item_features)):
#             item_id_1 = item_features.iloc[i]["id"]
#             item_id_2 = item_features.iloc[j]["id"]
#             item_similarity_dict[item_id_1][item_id_2] = similarity_matrix[i, j]

#     return item_similarity_dict

**Model 2**

Model 2 is similar to Model 1 in that it combines product features to create a content-based representation of each product and computes item-item similarity using cosine similarity of the TF-IDF vectors. However, Model 2 introduces a key difference: it sorts the recommendations by similarity score and keeps only the top N most similar items for each item. This approach makes the recommendations more focused, keeping only the most relevant similar items for each given item.

In [None]:
# # MODEL 2
# def item_based_cf_model_2(
#     products_train: dd.DataFrame, 
#     sessions_train: dd.DataFrame, 
#     hyperparams: Dict[str, Any], 
#     top_n: int
# ) -> Dict[str, Dict[str, float]]:

#     # Combine features from hyperparams['feat_combine']
#     def combine_features(row: pd.Series) -> str:
#         comb_features = hyperparams['feat_combine']
#         combined_features = ""
#         for feature in comb_features:
#             if feature in row and not pd.isnull(row[feature]):
#                 combined_features += " " + row[feature]
#         return combined_features.strip()

#     # Compute item features
#     products_train["combined_features"] = products_train.apply(combine_features, axis=1, meta=('combined_features', 'object'))
#     item_features = products_train[["id", "combined_features"]].compute()

#     # Compute item-item similarity matrix using the TF-IDF vectorizer
#     vectorizer = TfidfVectorizer(
#         min_df=hyperparams.get("min_df", 2),
#         max_df=hyperparams.get("max_df", 0.8),
#         ngram_range=hyperparams.get("ngram_range", (1, 3))
#     )
#     combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
#     similarity_matrix = cosine_similarity(combined_features_vectors)

#     # Create item-item similarity dictionary
#     item_similarity_dict = defaultdict(dict)
#     for i in range(len(item_features)):
#         for j in range(len(item_features)):
#             item_id_1 = item_features.iloc[i]["id"]
#             item_id_2 = item_features.iloc[j]["id"]
#             item_similarity_dict[item_id_1][item_id_2] = similarity_matrix[i, j]

#         # Sort the recommendations by similarity score and take the top_n most similar items
#         sorted_recommendations = sorted(item_similarity_dict[item_id_1].items(), key=lambda x: x[1], reverse=True)[:top_n + 1]
#         item_similarity_dict[item_id_1] = dict(sorted_recommendations[1:])  # Exclude the first item (product itself)

#     return item_similarity_dict

**Model 11**

Final, improved version of the item-based CF model. Item similarities are computed in batches to reduce the memory footprint.

In [None]:
# Function to save the item similarity
def save_item_similarity(item_similarity_dict: Dict[str, Dict[str, float]], output_path: str) -> None:
    """Append the similarity dictionary to a text file, one item per line.

    Each line has the form ``item_id:neighbour:score,neighbour:score``; an item
    without neighbours is written as ``item_id:``. The file is opened in append
    mode, so repeated calls accumulate lines.
    """
    def as_line(item_id, neighbours) -> str:
        encoded_pairs = ','.join(f'{k}:{v}' for k, v in neighbours.items())
        return f"{item_id}:{encoded_pairs}\n"

    with open(output_path, "a") as handle:
        handle.writelines(
            as_line(item_id, neighbours)
            for item_id, neighbours in item_similarity_dict.items()
        )

# Function(s) to compute item similarity (incrementally)
def compute_incremental_similarity(args: Tuple[int, str, np.ndarray, List[str], int]):
    """Pick the ``top_n`` most similar items for one row of a similarity matrix.

    Args:
        args: tuple ``(i, item_id_1, similarity_matrix, item_ids, top_n)`` where
            ``i`` is the row of ``similarity_matrix`` that belongs to ``item_id_1``,
            the columns of ``similarity_matrix`` line up with ``item_ids``, and
            ``top_n`` is the maximum number of neighbours to keep.

    Returns:
        ``(item_id_1, recommendations)`` where ``recommendations`` maps neighbour id
        to similarity score, ordered from most to least similar.
    """
    i, item_id_1, similarity_matrix, item_ids, top_n = args
    row = similarity_matrix[i, :]

    # One spare candidate so that dropping the item itself still leaves top_n entries.
    candidate_idx = np.argsort(row)[-(top_n + 1):][::-1]

    sorted_recommendations = {}
    for j in candidate_idx:
        if len(sorted_recommendations) >= top_n:
            break
        neighbour_id = item_ids[j]
        # Exclude the item itself by id. Comparing the column index with `i` is only
        # correct when the matrix is square: for a batch slice, `i` is the row inside
        # the batch while `j` indexes the full item list.
        if neighbour_id == item_id_1:
            continue
        # Candidates arrive best-first, so keep the first (highest) score per id.
        if neighbour_id not in sorted_recommendations:
            sorted_recommendations[neighbour_id] = row[j]
    return item_id_1, sorted_recommendations

def compute_item_similarity_incremental(
    combined_features_vectors: np.ndarray,
    item_ids: List[str],
    top_n: int,
    batch_size: int,
    output_path: str = None
):
    """Compute top-n neighbours for every item, one batch of rows at a time.

    For each batch the cosine similarity between the batch rows and all rows is
    computed, the per-item neighbours are extracted in a thread pool, and a
    progress message is shown (only the last five messages stay visible).

    When ``output_path`` is given, each batch is appended to that file via
    ``save_item_similarity`` and then dropped from memory, so the returned
    dictionary is empty. Otherwise the full dictionary is returned.
    """
    n_items = combined_features_vectors.shape[0]
    item_similarity_dict = defaultdict(dict)
    total_batches = (n_items + batch_size - 1) // batch_size
    progress_lines = []

    for batch_number, start_idx in enumerate(range(0, n_items, batch_size), start=1):
        stop_idx = min(start_idx + batch_size, n_items)
        batch_similarity_matrix = cosine_similarity(
            combined_features_vectors[start_idx:stop_idx],
            combined_features_vectors,
        )

        # Progress report: redraw the cell output with the five most recent messages
        progress_lines.append(f"Processing batch {batch_number} out of {total_batches}.")
        clear_output(wait=True)
        print(*progress_lines[-5:], sep="\n")

        tasks = [
            (row_idx, batch_item_id, batch_similarity_matrix, item_ids, top_n)
            for row_idx, batch_item_id in enumerate(item_ids[start_idx:stop_idx])
        ]
        with ThreadPoolExecutor() as pool:
            # map() yields (item_id, recommendations) pairs in task order
            item_similarity_dict.update(pool.map(compute_incremental_similarity, tasks))

            if output_path:
                # Flush this batch to disk and release it from memory
                save_item_similarity(item_similarity_dict, output_path)
                item_similarity_dict.clear()

    return item_similarity_dict

def item_based_cf_model_11(
    products_train: pd.DataFrame,
    sessions_train: pd.DataFrame,
    hyperparams: Dict[str, Any],
    top_n: int,
    batch_size: int,
    model_save: bool = False,
    output_path: str = None
) -> Dict[str, Dict[str, float]]:
    """Build item-item similarities from TF-IDF vectors of combined product text.

    Args:
        products_train: product table with an ``id`` column and the text columns
            named in ``hyperparams['feat_combine']``.
        sessions_train: session table with a comma-separated ``prev_items`` column
            (only read when ``incl_prod`` is ``'all'`` or ``'sess_only'``).
        hyperparams: requires ``'incl_prod'`` (``'all'``, ``'prod_only'`` or
            ``'sess_only'``) and ``'feat_combine'`` (list of column names);
            ``'min_df'``, ``'max_df'`` and ``'ngram_range'`` are optional
            TfidfVectorizer settings (defaults 2, 0.8 and (1, 3)).
        top_n: number of neighbours kept per item.
        batch_size: number of items per similarity batch.
        model_save: when True, results are streamed to ``output_path`` and an
            empty dict is returned; when False the similarities are returned.
        output_path: target file, required when ``model_save`` is True.

    Returns:
        ``{item_id: {neighbour_id: score}}``, or ``{}`` when ``model_save`` is True,
        when no product remains after filtering, or when TF-IDF fitting fails.

    Raises:
        ValueError: for an unknown ``incl_prod`` value, or when ``model_save`` is
            True without an ``output_path``.
    """
    # Fail fast: do not spend time on TF-IDF only to discover there is nowhere to save.
    if model_save and output_path is None:
        raise ValueError("output_path must be provided when model_save is set to True")

    def session_item_ids(sessions: pd.DataFrame) -> Set[str]:
        # explode() flattens the per-session lists in linear time; summing Python
        # lists is quadratic and returns 0 for an empty table.
        return set(sessions['prev_items'].str.split(',').explode().dropna().unique())

    # Get unique products based on the 'incl_prod' parameter
    def get_unique_products(
        products_train: pd.DataFrame,
        sessions_train: pd.DataFrame,
        incl_prod: str
    ) -> Set[str]:
        if incl_prod == 'all':
            return set(products_train['id'].unique()) | session_item_ids(sessions_train)
        if incl_prod == 'prod_only':
            return set(products_train['id'].unique())
        if incl_prod == 'sess_only':
            return session_item_ids(sessions_train)
        raise ValueError("Invalid value for 'incl_prod'. Choose from 'all', 'prod_only', or 'sess_only'.")

    # Combine features from hyperparams['feat_combine']
    def combine_features(row: pd.Series) -> str:
        parts = [
            str(row[feature])  # str() so non-text columns (e.g. price) can be combined too
            for feature in hyperparams['feat_combine']
            if feature in row and not pd.isnull(row[feature])
        ]
        return " ".join(parts).strip()

    timer_1 = time.time()
    core_item_set = get_unique_products(products_train, sessions_train, hyperparams['incl_prod'])
    timer_2 = time.time()
    print(f"Time to get unique products: {timer_2 - timer_1:.2f} seconds.")

    # NOTE(review): session items missing from the product table are only added for
    # 'sess_only'; with 'all' they are silently dropped by the isin() filter below.
    if hyperparams['incl_prod'] == 'sess_only':
        missing_product_ids = core_item_set.difference(products_train['id'].unique())
        missing_products = pd.DataFrame({'id': list(missing_product_ids)})
        # Unknown products get empty text so they still receive a (zero) feature vector.
        for column in ('title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc'):
            missing_products[column] = ''
        products_train = pd.concat([products_train, missing_products], ignore_index=True)

    # .copy() so the new column is written to our own frame, not to a view of the caller's.
    products_train = products_train[products_train['id'].isin(core_item_set)].copy()

    if len(products_train) == 0:
        print("Error: item_features is empty.")
        return {}

    timer_3 = time.time()
    # Compute item features
    products_train["combined_features"] = products_train.apply(combine_features, axis=1)
    item_features = products_train[["id", "combined_features"]]
    item_ids = item_features['id'].values
    timer_4 = time.time()
    print(f"Time to compute item features: {timer_4 - timer_3:.2f} seconds.")

    # Compute item-item similarity matrix using the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        min_df=hyperparams.get("min_df", 2),
        max_df=hyperparams.get("max_df", 0.8),
        ngram_range=hyperparams.get("ngram_range", (1, 3))
    )
    try:
        timer_5 = time.time()
        combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
        timer_6 = time.time()
        print(f"Time to compute feature vectors with TF-IDF: {timer_6 - timer_5:.2f} seconds.")
    except ValueError as err:
        print("Error: After pruning, no terms remain. Try a lower min_df or a higher max_df.")
        print(f"(TfidfVectorizer raised: {err})")  # surface the real cause as well
        return {}

    timer_7 = time.time()
    if not model_save:  # Only return actual item similarity if model_save is False (to help with debugging)
        item_similarity_dict = compute_item_similarity_incremental(combined_features_vectors, item_ids, top_n, batch_size)
    else:
        output_dir = os.path.dirname(output_path)
        if output_dir:  # a bare file name has no directory part; makedirs('') would raise
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w"):
            pass  # truncate: batches are appended to this file afterwards
        compute_item_similarity_incremental(combined_features_vectors, item_ids, top_n, batch_size, output_path)
        print(f"Item Similarity saved to {output_path}")
        item_similarity_dict = {}  # Return an empty dictionary when model_save is True
    timer_8 = time.time()
    print(f"Time to compute item similarity: {timer_8 - timer_7:.2f} seconds.")

    return item_similarity_dict

#### Helper Functions

View Recommendations:

In [None]:
# Function to quickly retrieve top n recommendations for a given product
def view_recs(item_similarity_dict: Dict[str, Dict[str, float]], prod_to_rec: str, n: int) -> pd.DataFrame:
    """Return the ``n`` best-scoring neighbours of ``prod_to_rec`` as a DataFrame.

    The frame has the columns ``related_products`` and ``score``, ordered from the
    highest to the lowest score. If the product is not a key of
    ``item_similarity_dict`` a message is printed and ``None`` is returned.
    """
    if prod_to_rec in item_similarity_dict:
        ranked_pairs = list(item_similarity_dict[prod_to_rec].items())
        ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
        return pd.DataFrame(ranked_pairs[:n], columns=["related_products", "score"])

    print(f'{prod_to_rec} not found in the training data')
    return None

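Load Item Similarity Matrix (from the saved file; despite the `.json` extension, this is the line-based `item:neighbour:score,...` text format written by `save_item_similarity`, not real JSON):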

In [None]:
def load_item_similarity_json(file_path: str) -> Dict[str, Dict[str, float]]:
    """Load item similarities written by ``save_item_similarity``.

    Despite the name, the file is not JSON: every line has the form
    ``item_id:neighbour:score,neighbour:score``. Blank lines are skipped and an
    item saved without neighbours (``item_id:``) is loaded with an empty dict.

    Args:
        file_path: path of the similarity file.

    Returns:
        ``{item_id: {neighbour_id: score}}`` (a ``defaultdict(dict)``).

    Raises:
        ValueError: if a line or a neighbour entry has no ``:`` separator, or a
            score is not a valid float; the message names the offending line.
    """
    item_similarity_dict = defaultdict(dict)

    with open(file_path, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank / trailing lines

            item_id, separator, similarities = line.partition(':')
            if not separator:
                raise ValueError(f"{file_path}, line {line_number}: expected 'item_id:...', got {line!r}")

            similarity_dict = {}
            for pair in similarities.split(','):
                if not pair:
                    continue  # item saved with no neighbours
                # Split on the LAST colon: the score never contains one.
                neighbour_id, separator, score = pair.rpartition(':')
                if not separator:
                    raise ValueError(f"{file_path}, line {line_number}: malformed entry {pair!r}")
                similarity_dict[neighbour_id] = float(score)
            item_similarity_dict[item_id] = similarity_dict

    return item_similarity_dict

#### Model Training

Model 1

In [None]:
# # Hyperparameters for Model 1
# feat_combine = ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc']
# ibcf1_hyperparams = {
#     'min_df': 5,                # Minimum number of documents a word must be present in to be kept
#     'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
#     'ngram_range': (1, 3)       # (min_n, max_n) the higher the n, the more computationally expensive
#     }

# # Specify product to get recommendations for
# prod_to_rec = 'B005HIMQPW'

# # Train model
# start_time = time.time()
# ibcf1_recos = item_based_cf_model_1(products_train, sessions_train, ibcf1_hyperparams, feat_combine)
# end_time = time.time()
# print(f'{end_time - start_time} seconds')

# # View recommendations
# ibcf1_recos_df = view_recs(ibcf1_recos, prod_to_rec, NUM_RECOMMENDATIONS)
# print(f'Recommendations for {prod_to_rec}:')
# ibcf1_recos_df

Model 2

In [None]:
# # Hyperparameters for Model 2
# ibcf2_hyperparams = {
#     'min_df': 5,                # Minimum number of documents a word must be present in to be kept
#     'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
#     'ngram_range': (1, 3),      # (min_n, max_n) the higher the n, the more computationally expensive
#     'feat_combine': ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc']      # Combine this list of features
# }

# # Specify product to get recommendations for
# prod_to_rec = 'B005HIMQPW'

# # Train model
# start_time = time.time()
# ibcf2_recos = item_based_cf_model_2(products_train, sessions_train, ibcf2_hyperparams, NUM_RECOMMENDATIONS)
# end_time = time.time()
# print(f'{end_time - start_time} seconds')

# # View recommendations
# ibcf2_recos_df = view_recs(ibcf2_recos, prod_to_rec, NUM_RECOMMENDATIONS)
# print(f'Recommendations for {prod_to_rec}:')
# ibcf2_recos_df

Model 11

In [None]:
# Hyperparameters for Model 11
ibcf11_hyperparams = {
    'min_df': 5,                        # Minimum number of documents a word must be present in to be kept
    'max_df': 0.8,                      # Maximum % of documents a word can be present in to be kept
    'ngram_range': (1, 3),              # (min_n, max_n) the higher the n, the more computationally expensive
    'feat_combine': ['title',           # Combine this list of features
                     'brand',
                     'color',
                     'size',
                     'model',
                     'material',
                     'author',
                     'desc'],
    'incl_prod': 'all'                  # Options: 'all', 'prod_only', 'sess_only'
}
batch_size = 500                        # Items per similarity batch
model_save = True                       # True: stream similarities to item_output, then reload them
# NOTE: despite the 'json' extension the file holds the line-based
# item:neighbour:score text format written by save_item_similarity.
file_ext = 'json'
item_output = output_path + 'ibcf11_item_similarity' + '.' + file_ext

# Train model
# NOTE(review): this trains on `products_train` / `sessions_train`, not on the DE slices
# (`p_train_de`, `s_train_de`) prepared above, and those two names are not in the
# clear_memory keep list. Confirm which inputs are intended.
start_time = time.time()
ibcf11_recos = item_based_cf_model_11(products_train, sessions_train, ibcf11_hyperparams, NUM_RECOMMENDATIONS, batch_size, model_save, item_output)
if model_save:
    # With model_save the function returns {} after writing to disk, so read the file back.
    ibcf11_recos = load_item_similarity_json(item_output)
end_time = time.time()
print(f'{end_time - start_time} seconds')

In [None]:
# Specify product to get recommendations for
prod_to_rec = 'B06XKPB3GT'

# View recommendations
# view_recs returns None (after printing a message) when prod_to_rec is not a key of
# ibcf11_recos, in which case nothing is displayed below.
ibcf11_recos_df = view_recs(ibcf11_recos, prod_to_rec, NUM_RECOMMENDATIONS)
print(f'Recommendations for {prod_to_rec}:')
ibcf11_recos_df

#### Evaluation: MRR

In [None]:
# Function to calculate the Mean Reciprocal Rank (MRR)
def calculate_mrr(item_similarity_dict: dict, test_data: pd.DataFrame, k: int, penalize_missing: bool = False) -> float:
    """Mean Reciprocal Rank of ``next_item`` among the neighbours of the last session item.

    For every row, the last entry of the comma-separated ``prev_items`` is looked up
    in ``item_similarity_dict``; its neighbours are ranked by score and the first
    ``k`` are searched for ``next_item``. A hit at rank r contributes 1/r, a miss 0.

    Args:
        item_similarity_dict: ``{item_id: {neighbour_id: score}}``.
        test_data: frame with ``prev_items`` and ``next_item`` columns.
        k: number of top-ranked neighbours considered per session.
        penalize_missing: sessions whose last item is not in
            ``item_similarity_dict`` are skipped by default, which leaves them out
            of the denominator and makes the score optimistic. Pass True to count
            them as a reciprocal rank of 0 instead.

    Returns:
        The mean reciprocal rank, or 0 when no session was scored.
    """
    reciprocal_ranks = []

    # Iterate the two needed columns directly; iterrows() builds a Series per row.
    for prev_items, next_item in zip(test_data["prev_items"], test_data["next_item"]):
        current_item = prev_items.split(',')[-1]  # last viewed item drives the prediction

        if current_item not in item_similarity_dict:
            if penalize_missing:
                reciprocal_ranks.append(0)
            continue

        top_k = sorted(
            item_similarity_dict[current_item].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )[:k]

        # Reciprocal rank of the first hit, 0 when next_item is not in the top k
        reciprocal_ranks.append(
            next((1 / rank for rank, (item, _) in enumerate(top_k, start=1) if item == next_item), 0)
        )

    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

In [None]:
# Calculate MRR for Model 11 (on generated test set)
# s_test_de is the held-out part of the DE sessions produced by split_sessions above.
# Sessions whose last item has no entry in ibcf11_recos are skipped by calculate_mrr,
# so they do not lower the reported score.
mrr_ibcf11 = calculate_mrr(ibcf11_recos, s_test_de, NUM_RECOMMENDATIONS)
print(f'MRR for Model 11 (variant 1): {mrr_ibcf11}')

#### Generate/Save Recommendations

In [None]:
# Function to generate recommendations
def generate_recommendations(
    sessions_test: pd.DataFrame,
    item_similarity_dict: Dict[str, Dict[str, float]],
    top_n: int
) -> pd.DataFrame:
    """Recommend up to ``top_n`` items per session by summing neighbour scores.

    Each item in the session's comma-separated ``prev_items`` contributes the
    scores of its neighbours; candidates are ranked by their summed score.
    Sessions with no known item get an empty list.

    Returns a frame with the columns ``session_id`` and ``next_item_prediction``.
    """
    def rank_candidates(prev_items_csv: str) -> List[str]:
        summed_scores = defaultdict(float)
        for seen_item in prev_items_csv.split(","):
            if seen_item not in item_similarity_dict:
                continue
            for candidate, weight in item_similarity_dict[seen_item].items():
                summed_scores[candidate] += weight
        # Stable sort on the summed score, best first; ties keep insertion order
        return sorted(summed_scores, key=summed_scores.get, reverse=True)[:top_n]

    per_session = [
        (session["session_id"], rank_candidates(session["prev_items"]))
        for _, session in sessions_test.iterrows()
    ]

    return pd.DataFrame(per_session, columns=["session_id", "next_item_prediction"])


# Function to save recommendations to parquet file
def save_recs_to_pqt(recommendations_df: pd.DataFrame, output_file: str) -> None:
    """Write the recommendations frame to ``output_file`` as gzip-compressed parquet."""
    pq.write_table(
        pa.Table.from_pandas(recommendations_df),
        output_file,
        compression='gzip',
    )

In [None]:
# Let's generate recommendations on sessions_test using Model 11
# Up to NUM_RECOMMENDATIONS items per session; sessions without any known item get an empty list.
recommendations_df = generate_recommendations(sessions_test, ibcf11_recos, NUM_RECOMMENDATIONS)
recommendations_df

In [None]:
# Let's save the recommendations to a parquet file
# output_file is defined in the housekeeping cell (output_path + task + '_predictions.parquet').
save_recs_to_pqt(recommendations_df, output_file)