# Task description
Creating a recommender system for ad recommendations on a marketplace.\
The goal is to improve the algorithm that will determine the most relevant recommendations 
for each user based on user and product attributes, as well as the history of interactions between them.

Initially, a combination of the ALS algorithm and an additional classification model (CatBoost) 
for re-ranking was tested to address this issue, as well as various special approaches 
to grouping and combining data to improve ranking quality.
However, it turned out that the standard BPR algorithm yielded significantly better results in this case. 

In [None]:
import random
from typing import List, Any
from tqdm import tqdm

import polars as pl
import numpy as np
import scipy.sparse as sp
import optuna
import implicit
import faiss
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct, SearchRequest

# Data loading

In [None]:
# Load every parquet input into a polars DataFrame.
# ad_features, user_features and behavioral_logs are not referenced again in the visible cells.
ad_features = pl.read_parquet('data/ad_features.parquet')
user_features = pl.read_parquet('data/user_features.parquet')
# Note: the variable is `behavioral_logs`, the file on disk is `behavior_logs.parquet`.
behavioral_logs = pl.read_parquet('data/behavior_logs.parquet')
# Interaction log used for the train/validation split and for the final model.
data = pl.read_parquet('data/train.parquet')
# Users to produce submission recommendations for (read through its 'user_id' column later on).
test = pl.read_parquet('data/test.parquet')

# Metrics

In [None]:
TOP_K = 10


def user_intersection(y_rel: List[Any], y_rec: List[Any], k: int = TOP_K) -> int:
    """
    Count the distinct relevant items found among the first k recommendations.

    :param y_rel: relevant items
    :param y_rec: recommended items, best first
    :param k: number of top recommended items
    :return: number of items in intersection of y_rel and y_rec (truncated to top-K)
    """
    return len(set(y_rec[:k]).intersection(y_rel))


def user_recall(y_rel: List[Any], y_rec: List[Any], k: int = TOP_K) -> float:
    """
    Share of relevant items recovered by the top-k recommendations.

    The denominator is capped at k, so a perfect top-k list scores 1.0 even
    when the user has more than k relevant items.

    :param y_rel: relevant items (duplicates are counted once)
    :param y_rec: recommended items, best first
    :param k: number of top recommended items
    :return: fraction of found relevant items; 0.0 when there is nothing relevant
    """
    denom = min(k, len(set(y_rel)))
    # No relevant items (or a non-positive k): nothing can be recalled.
    if denom <= 0:
        return 0.0
    return user_intersection(y_rel, y_rec, k) / denom


def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = TOP_K) -> float:
    """
    Binary-relevance NDCG@k for a single user.

    :param y_rel: relevant items (duplicates are counted once)
    :param y_rec: recommended items, best first
    :param k: number of top recommended items
    :return: ndcg metric for user recommendations; 0.0 when there is nothing relevant
    """
    # Set membership is O(1); it also makes the ideal list agree with
    # user_recall, which de-duplicates the relevant items as well.
    relevant = set(y_rel)
    n_ideal = min(k, len(relevant))
    if n_ideal <= 0:
        return 0.0

    dcg = sum(
        1.0 / np.log2(rank + 2)
        for rank, item in enumerate(y_rec[:k])
        if item in relevant
    )
    # Ideal ranking: every relevant item packed at the top of the list.
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(n_ideal))
    return float(dcg / idcg)

In [None]:
# Order the whole interaction log by time.
full_data = data.sort('time_stamp')

# Only referenced by the commented-out user filter below; the whole dataset is used as-is.
max_user_id = 100_000 #full_data['user'].max() + 1
#part_time = data['time_stamp'].quantile(0.3)
part_data = full_data # full_data.filter(pl.col('user') <= max_user_id)

# Time-based hold-out: rows at or before the 0.9 quantile of time_stamp form the
# training part, strictly later rows form the evaluation part.
timestamp_threshold = part_data['time_stamp'].quantile(0.9)
train_df = part_data.filter(pl.col('time_stamp') <= timestamp_threshold)
test_df = part_data.filter(pl.col('time_stamp') > timestamp_threshold)

# Common functions

In [None]:
RANDOM_STATE = 42
TOP_K = 10


def set_seed():
    '''
    Re-seed Python's `random` module and NumPy's global generator with RANDOM_STATE.
    '''
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(RANDOM_STATE)
    
    
def get_recommendations(user_embs: np.ndarray, item_embs: np.ndarray, k: int = TOP_K):
    '''
    Top-k inner-product search of item embeddings for every user embedding, using a flat faiss index.

    :param user_embs: 2-D array of query (user) vectors, one per row
    :param item_embs: 2-D array of item vectors with the same number of columns
    :param k: number of neighbours to return per user
    :return: whatever ``index.search`` returns for the queries (a scores/indices pair);
             indices are row positions in item_embs
    '''
    # faiss operates on C-contiguous float32 matrices; coerce once up front so
    # float64 or non-contiguous inputs are accepted too.
    item_embs = np.ascontiguousarray(item_embs, dtype=np.float32)
    user_embs = np.ascontiguousarray(user_embs, dtype=np.float32)

    # Flat (brute-force) inner-product index over all item vectors.
    index = faiss.IndexFlatIP(item_embs.shape[1])
    index.add(item_embs)

    return index.search(user_embs, k)

## Qdrant setting

In [None]:
def qdrant_recommend(user_embs, item_embs, dist=Distance.DOT, k=TOP_K):
    '''
    Helper function for creating a qdrant collection and performing approximate nearest neighbors searching.

    The collection "item_embs" on localhost:6333 is recreated on every call.

    :param user_embs: 2-D array of user vectors; one query is issued per row
    :param item_embs: 2-D array of item vectors; row 0 is not uploaded, row i (i >= 1) is stored under point id i
    :param dist: qdrant distance metric used by the collection
    :param k: number of items to retrieve per user
    :return: np.array with one row of retrieved point ids per user embedding
    '''
    collection = "item_embs"
    client = QdrantClient("localhost", port=6333)
    client.recreate_collection(
        collection_name=collection,
        # set the vector dimensionality and the distance metric
        vectors_config=VectorParams(size=item_embs.shape[1], distance=dist),
    )

    bs = 1_000
    n_items = item_embs.shape[0]

    # The collection is created in batches because otherwise a timeout exception occurs.
    # Batches are derived from the start offsets, so no empty trailing batch is sent,
    # and points are built per batch instead of all at once.
    print('Create collection')
    for start in tqdm(range(1, n_items, bs)):
        stop = min(start + bs, n_items)
        client.upsert(
            collection_name=collection,
            wait=True,
            points=[
                PointStruct(id=item_id, vector=item_embs[item_id].tolist())
                for item_id in range(start, stop)
            ],
        )

    recs_qdrant = []
    for u_emb in tqdm(user_embs):
        qd_points = client.query_points(
            collection_name=collection,
            query=u_emb,
            # honour the caller's k (previously the global TOP_K was always used)
            limit=k,
        ).points
        recs_qdrant.append([point.id for point in qd_points])

    return np.array(recs_qdrant)

# ALS

## Train/test preparation

In [None]:
# Per-user training history: rows are sorted by time_stamp before grouping, and each
# user gets the list of shown ads plus the matching click flags.
# The click-only filter is left disabled, so non-clicked impressions are kept too.
_history_columns = [
    pl.col('adgroup_id').alias('ad_history'),
    pl.col('clk').alias('cliks'),
]
df_train_grouped = train_df.sort(by='time_stamp').group_by('user').agg(_history_columns)

# Per-user ground truth: ads clicked (clk == 1) in the hold-out period.
_clicked_test = test_df.filter(pl.col('clk') == 1).sort(by='time_stamp')
df_test_grouped = _clicked_test.group_by('user').agg(pl.col('adgroup_id').alias('ad_rel'))

In [None]:
# Mean number of interactions per user in the training history, truncated to an int.
# NOTE: despite the name this is a mean, not a median; the name is kept unchanged.
# The native list-length expression replaces a per-row Python `len` call whose
# Int16 result type could not represent histories longer than 32767 items.
median_seq_len = int(df_train_grouped['ad_history'].list.len().mean())
print(f"average session length {median_seq_len}")

In [None]:
# Mean number of clicked (relevant) ads per hold-out user, truncated to an int.
# Uses the native list-length expression instead of a per-row Python `len` call.
int(df_test_grouped['ad_rel'].list.len().mean())

In [None]:
%%time
# Flatten the per-user lists into COO triplets: (user id, ad id, click flag).
rows, cols, values = [], [], []

history = df_train_grouped.select('user', 'ad_history', 'cliks')
for uid, ad_ids, clicks in history.iter_rows():
    rows += [uid] * len(ad_ids)
    cols += ad_ids
    values += clicks

# User x ad matrix; raw user and ad ids are used directly as row/column indices.
user_ad_data = sp.csr_matrix((values, (rows, cols)), dtype=np.float32)
user_ad_data

## Rough try

In [None]:
import os
# Ask OpenBLAS to use a single thread.
# NOTE(review): numpy is already imported earlier in this file; presumably OpenBLAS reads
# this variable when it is first loaded, so this may need to run before those imports - confirm.
os.environ['OPENBLAS_NUM_THREADS']='1'

In [None]:
%%time
# Baseline ALS fit on the user x ad training matrix.
set_seed()
als_model = implicit.als.AlternatingLeastSquares(
    # 0 is the documented "use all available cores" value; -1 is not a documented setting.
    num_threads=0,
    factors=48,
    iterations=15,
    random_state=RANDOM_STATE,
    regularization=1e-2,
    alpha=100,
)
als_model.fit(user_ad_data)

In [None]:
# Score the model's own top-TOP_K lists against the hold-out clicks.
test_users = df_test_grouped['user']
recs, _ = als_model.recommend(test_users, user_ad_data[test_users], TOP_K, filter_already_liked_items=False)

# recs rows line up with the rows of df_test_grouped (same user order).
ndcg_list, recall_list = [], []
for (_, y_rel), y_rec in zip(df_test_grouped.iter_rows(), recs):
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    recall_list.append(user_recall(y_rel, y_rec))

mean_ndcg, mean_recall = np.mean(ndcg_list), np.mean(recall_list)
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')

In [None]:
# Same evaluation as the preceding cell, run again on the current als_model.
test_users = df_test_grouped['user']
recs, _ = als_model.recommend(
    test_users,
    user_ad_data[test_users],
    TOP_K,
    filter_already_liked_items=False,
)

ndcg_list = []
recall_list = []
for row_idx, test_row in enumerate(df_test_grouped.rows()):
    if row_idx >= len(recs):
        break
    _, y_rel = test_row
    y_rec = recs[row_idx]
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    recall_list.append(user_recall(y_rel, y_rec))

mean_ndcg = np.mean(ndcg_list)
mean_recall = np.mean(recall_list)
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')

In [None]:
# Evaluate ALS embeddings through the qdrant nearest-neighbour search.
ndcg_list = []
recall_list = []
als_recs = qdrant_recommend(als_model.user_factors, als_model.item_factors)
n_scored_users = als_recs.shape[0]
for user_id, y_rel in df_test_grouped.rows():
    # Users without a row in the recommendation matrix score zero.
    # The check is `>=` because valid row indices end at n_scored_users - 1.
    if user_id >= n_scored_users:
        ndcg_list.append(0)
        recall_list.append(0)
        continue
    y_rec = als_recs[user_id]
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    recall_list.append(user_recall(y_rel, y_rec))
mean_recall = np.mean(recall_list)
mean_ndcg = np.mean(ndcg_list)
# The second figure is the mean of user_recall, so it is labelled as recall.
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')
    

## Hyperparams tuning

In [None]:
def objective(trial):
    '''
    Optuna objective for ALS: fit a model with sampled hyperparameters and score it
    through the qdrant search on the hold-out users.

    NOTE(review): the score is computed on df_test_grouped, the same split used for the
    reported metrics.

    :param trial: optuna trial used to sample factors, iterations, alpha and regularization
    :return: mean recall@TOP_K over the hold-out users (the value optuna maximizes)
    '''
    factors = trial.suggest_int('factors', 10, 50)
    iterations = trial.suggest_int('iterations', 15, 50)
    alpha = trial.suggest_float('alpha', 1, 5.0)
    regularization = trial.suggest_float('regularization', 1e-3, 1.0)

    print({
        'factors': factors,
        'iterations': iterations,
        'alpha': alpha,
        'regularization': regularization,
    })

    set_seed()
    als_model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        random_state=RANDOM_STATE,
        alpha=alpha,
        regularization=regularization,
    )
    als_model.fit(user_ad_data)

    # k must be passed by keyword: the third positional parameter of
    # qdrant_recommend is the distance metric, not the list length.
    als_recs = qdrant_recommend(
        als_model.user_factors,
        als_model.item_factors,
        k=TOP_K,
    )

    ndcg_list = []
    recall_list = []
    n_scored_users = als_recs.shape[0]
    for user_id, y_rel in df_test_grouped.rows():
        # Users without a row in the recommendation matrix score zero
        # (`>=`: valid row indices end at n_scored_users - 1).
        if user_id >= n_scored_users:
            ndcg_list.append(0)
            recall_list.append(0)
            continue
        y_rec = als_recs[user_id]
        ndcg_list.append(user_ndcg(y_rel, y_rec))
        recall_list.append(user_recall(y_rel, y_rec))
    mean_recall = np.mean(recall_list)
    mean_ndcg = np.mean(ndcg_list)
    # The second figure is the mean of user_recall, so it is labelled as recall.
    print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')

    return mean_recall

# Single-objective search: maximize the value returned by objective() over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

study.best_params

Trial 46 finished with value: 0.01292217625267029 and parameters: \
{'factors': 50, 'iterations': 23, 'alpha': 4.600328178480976, 'regularization': 0.12200529914142472}. \
Best is trial 46 with value: 0.01292217625267029.

# BPR

## Train/test preparation

In [None]:
# Rebuild the per-user frames for the BPR section.
# Training side: time-sorted impressions per user (ad ids + click flags); the
# click-only filter stays disabled.
_train_sorted = train_df.sort(by='time_stamp')
df_train_grouped = _train_sorted.group_by('user').agg(
    [
        pl.col('adgroup_id').alias('ad_history'),
        pl.col('clk').alias('cliks'),
    ]
)

# Hold-out side: only clicked ads (clk == 1) count as relevant.
_test_clicks = test_df.filter(pl.col('clk') == 1)
df_test_grouped = (
    _test_clicks
    .sort(by='time_stamp')
    .group_by('user')
    .agg(pl.col('adgroup_id').alias('ad_rel'))
)

In [None]:
# Mean number of interactions per user in the training history, truncated to an int.
# NOTE: despite the name this is a mean, not a median; the name is kept unchanged.
# The native list-length expression replaces a per-row Python `len` call whose
# Int16 result type could not represent histories longer than 32767 items.
median_seq_len = int(df_train_grouped['ad_history'].list.len().mean())
print(f"average session length {median_seq_len}")

In [None]:
# Mean number of clicked (relevant) ads per hold-out user, truncated to an int.
# Uses the native list-length expression instead of a per-row Python `len` call.
int(df_test_grouped['ad_rel'].list.len().mean())

In [None]:
%%time
# COO triplets for the user x ad matrix: one entry per (user, shown ad) with the click flag as value.
rows, cols, values = [], [], []

for uid, ad_ids, clicks in df_train_grouped.select('user', 'ad_history', 'cliks').iter_rows():
    n_shown = len(ad_ids)
    rows += [uid] * n_shown
    cols += ad_ids
    values += clicks

# Raw user and ad ids serve directly as row/column indices.
user_ad_data = sp.csr_matrix((values, (rows, cols)), dtype=np.float32)
user_ad_data

## BPR rough try

In [None]:
# Hand-picked BPR settings for a first fit on the training matrix.
# The trailing 0.0532 is presumably the score reached with them - confirm.
params = dict(
    factors=148,
    iterations=100,
    learning_rate=0.15929002533450304,
    regularization=0.0007903221600387114,
)  # 0.0532

bpr_model = implicit.bpr.BayesianPersonalizedRanking(**params, random_state=RANDOM_STATE)
bpr_model.fit(user_ad_data)

In [None]:
# Persist the fitted BPR model to disk so it can be reloaded without refitting.
bpr_model.save("bpr_model_test.npz")

In [None]:
# Reload the model saved above; `load` is called on a freshly constructed default instance.
# The direct form in the trailing comment is left disabled.
bpr_model_saved = implicit.bpr.BayesianPersonalizedRanking().load("bpr_model_test.npz") # implicit.bpr.BayesianPersonalizedRanking.load("bpr_model_test.npz")

In [None]:
# Evaluate BPR embeddings through the qdrant nearest-neighbour search.
ndcg_list = []
recall_list = []
bpr_recs = qdrant_recommend(bpr_model.user_factors, bpr_model.item_factors)
n_scored_users = bpr_recs.shape[0]

for user_id, y_rel in df_test_grouped.rows():
    # Users without a row in the recommendation matrix score zero.
    # The check is `>=` because valid row indices end at n_scored_users - 1.
    if user_id >= n_scored_users:
        ndcg_list.append(0)
        recall_list.append(0)
        continue
    y_rec = bpr_recs[user_id]
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    recall_list.append(user_recall(y_rel, y_rec))
mean_recall = np.mean(recall_list)
mean_ndcg = np.mean(ndcg_list)
# The second figure is the mean of user_recall, so it is labelled as recall.
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')
    

In [None]:
# Evaluate BPR with the model's own recommend() call, one hold-out user at a time.
ndcg_list = []
recall_list = []

for user_id, y_rel in tqdm(df_test_grouped.rows()):
    y_rec, _ = bpr_model.recommend(
        userid=user_id,
        user_items=user_ad_data[user_id],
        # Tie the list length to the metric cut-off instead of the library default.
        N=TOP_K,
        filter_already_liked_items=False,
    )
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    recall_list.append(user_recall(y_rel, y_rec))
mean_recall = np.mean(recall_list)
mean_ndcg = np.mean(ndcg_list)
# The second figure is the mean of user_recall, so it is labelled as recall.
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')
    

In [None]:
%%time
# Batch recommendations for all hold-out users from the reloaded model.
# N is not passed, so the library's default list length applies.
bpr_recs, _ = bpr_model_saved.recommend(userid=df_test_grouped['user'], user_items=user_ad_data[df_test_grouped['user']], filter_already_liked_items=False)

## Hyperparams tuning

In [None]:
def objective(trial):
    '''
    Optuna objective for BPR: fit a model with sampled hyperparameters and score its
    own top-TOP_K recommendations on the hold-out users.

    NOTE(review): the score is computed on df_test_grouped, the same split used for the
    reported metrics.

    :param trial: optuna trial used to sample factors, iterations, learning_rate and regularization
    :return: mean recall@TOP_K over the hold-out users (the value optuna maximizes)
    '''
    factors = trial.suggest_int('factors', 140, 220)
    iterations = trial.suggest_int('iterations', 70, 220)
    learning_rate = trial.suggest_float('learning_rate', 5e-2, 2e-1)
    regularization = trial.suggest_float('regularization', 1e-4, 2e-2)

    print({
        'factors': factors,
        'iterations': iterations,
        'learning_rate': learning_rate,
        'regularization': regularization,
    })

    set_seed()
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(
        factors=factors,
        iterations=iterations,
        random_state=RANDOM_STATE,
        learning_rate=learning_rate,
        regularization=regularization
    )
    bpr_model.fit(user_ad_data)

    test_users = df_test_grouped['user']
    bpr_recs, _ = bpr_model.recommend(
        userid=test_users,
        user_items=user_ad_data[test_users],
        # Tie the list length to the metric cut-off instead of the library default.
        N=TOP_K,
        filter_already_liked_items=False,
    )

    ndcg_list = []
    recall_list = []
    # bpr_recs rows follow the order of test_users, i.e. the row order of df_test_grouped.
    for y_rec, (_, y_rel) in zip(bpr_recs, df_test_grouped.rows()):
        ndcg_list.append(user_ndcg(y_rel, y_rec))
        recall_list.append(user_recall(y_rel, y_rec))
    mean_recall = np.mean(recall_list)
    mean_ndcg = np.mean(ndcg_list)
    # The second figure is the mean of user_recall, so it is labelled as recall.
    print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Recall@{TOP_K} = {mean_recall:.4f}')

    return mean_recall

# Single-objective search: maximize the value returned by objective() over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

study.best_params

## Submission

In [None]:
# Per-user history over the complete interaction log (no hold-out), sorted by
# time_stamp before grouping; the click-only filter stays disabled.
_full_sorted = full_data.sort(by='time_stamp')
df_full_grouped = _full_sorted.group_by('user').agg(
    [
        pl.col('adgroup_id').alias('ad_history'),
        pl.col('clk').alias('cliks'),
    ]
)

# COO triplets: (user id, ad id, click flag) for every impression.
rows, cols, values = [], [], []
for uid, ad_ids, clicks in df_full_grouped.select('user', 'ad_history', 'cliks').iter_rows():
    rows += [uid] * len(ad_ids)
    cols += ad_ids
    values += clicks

# User x ad matrix for the final model; raw ids are used as indices.
full_ad_data = sp.csr_matrix((values, (rows, cols)), dtype=np.float32)
full_ad_data


In [None]:
# BPR settings for the final fit on the full matrix.
# The trailing 0.0546 is presumably the score reached with them - confirm.
params = dict(
    factors=184,
    iterations=139,
    learning_rate=0.121437862923385,
    regularization=0.0034908722792852008,
)  # 0.0546

bpr_model = implicit.bpr.BayesianPersonalizedRanking(**params, random_state=RANDOM_STATE)
bpr_model.fit(full_ad_data)

In [None]:
# Suffix shared by the model file read here and the submission file written at the end.
bpr_model_suffix = '184_139_12_003'
# Rebinds bpr_model to a model loaded from disk, replacing any model fitted in the session.
# NOTE(review): no visible cell writes ./bpr_<suffix>.npz - presumably it was saved in an earlier run; confirm.
bpr_model = implicit.bpr.BayesianPersonalizedRanking().load(f"./bpr_{bpr_model_suffix}.npz")

In [None]:
# Recommendations for the submission users, looked up through the 'user_id' column of `test`
# (the training frames use a column named 'user'). N is not passed, so the library default length applies.
bpr_recs, _ = bpr_model.recommend(userid=test['user_id'], user_items=full_ad_data[test['user_id']], filter_already_liked_items=False)

In [None]:
# Keep a raw copy of the recommendation array on disk.
np.save("bpr_subm_recs.npy", bpr_recs)

In [None]:
# Display the recommendation array as the cell output.
bpr_recs

In [None]:
# Attach the recommendations to the submission frame as column "y_rec" and write it out.
_recs_column = pl.Series("y_rec", bpr_recs)
df_sub_bpr = test.with_columns(_recs_column)
df_sub_bpr.write_parquet(f"bpr_{bpr_model_suffix}.parquet")