In [None]:
import json
import random
from tqdm import tqdm
from typing import List, Any

import optuna
import pandas as pd
import polars as pl
import numpy as np
import scipy.sparse as sp
from gensim.models import Word2Vec

## Task description

The task is to create a recommendation system for a proprietary streaming service similar to Spotify.
The goal is to improve the algorithm that will determine the most relevant recommendations for each user based on their listening history.

### Data description

The dataset contains only the user's listening history, without any additional information such as artist description or additional information about the user.

## Dataset reading

In [None]:
# Load the listening history from a parquet file located relative to the working directory.
data = pl.read_parquet('train.parquet')
# Bare expression: rendered as the cell output.
data

## Metrics

In [None]:
TOP_K = 20


def user_hitrate(y_relevant: List[str], y_recs: List[str], k: int = TOP_K) -> int:
    """
    :param y_relevant: relevant items
    :param y_recs: recommended items, best first
    :param k: number of top recommended items to consider
    :return: 1 if at least one relevant item is among the first k recommendations, else 0
    """
    hits = set(y_relevant) & set(y_recs[:k])
    return 1 if hits else 0

def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items, best first
    :param k: number of top recommended items
    :return: ndcg metric for user recommendations; 0.0 when there are no relevant
        items or k is not positive (the ideal DCG would be zero)
    """
    # The ideal ranking places min(len(y_rel), k) relevant items at the top.
    n_ideal = min(len(y_rel), k)
    if n_ideal <= 0:
        # Avoid dividing by a zero ideal DCG.
        return 0.0

    dcg = sum(1. / np.log2(idx + 2) for idx, item in enumerate(y_rec[:k]) if item in y_rel)
    idcg = sum(1. / np.log2(idx + 2) for idx in range(n_ideal))
    return float(dcg / idcg)

## Data preparation

In [None]:
# Mapping the raw identifiers to consecutive integers for convenience in subsequent
# training of algorithms. maintain_order=True assigns indices in first-appearance
# order, so the same input file always produces the same mapping (plain unique()
# gives no ordering guarantee).
user_mapping = {raw_id: idx for idx, raw_id in enumerate(data['user_id'].unique(maintain_order=True))}
user_mapping_inverse = {idx: raw_id for raw_id, idx in user_mapping.items()}

artist_mapping = {raw_id: idx for idx, raw_id in enumerate(data['artist_id'].unique(maintain_order=True))}
artist_mapping_inverse = {idx: raw_id for raw_id, idx in artist_mapping.items()}

In [None]:
grouped_df_with_inds = (
    data
    # Replace the raw identifiers with their integer indices.
    .with_columns([
        pl.col('user_id').map_elements(user_mapping.get, return_dtype=pl.Int64),
        pl.col('artist_id').map_elements(artist_mapping.get, return_dtype=pl.Int64),
    ])
    # For each user, the last 3 objects are kept as test samples, and the rest are used for training.
    # Users with 3 or fewer rows therefore end up with an empty training list.
    # NOTE(review): the split assumes rows are stored in listening order - confirm.
    # maintain_order=True keeps users in first-appearance order; without it the group
    # order (and hence the order of training sequences) can differ between runs.
    .group_by('user_id', maintain_order=True)
    .agg([
        pl.implode('artist_id').map_elements(lambda x: x[:-3], return_dtype=pl.self_dtype()).alias('train_item_ids'),
        pl.implode('artist_id').map_elements(lambda x: x[-3:], return_dtype=pl.self_dtype()).alias('test_item_ids'),
    ])
)

In [None]:
# Median number of training items per user. The native list.len() is used instead of
# map_elements(len, return_dtype=pl.Int8): Int8 cannot represent histories longer
# than 127 items.
median_seq_len = int(grouped_df_with_inds['train_item_ids'].list.len().median())
print(f"средняя длина сессии {median_seq_len}")

## Baseline

As a simple baseline recommender system, the most popular artists from the training data are used.

In [None]:
# Preview the first rows of the per-user train/test split.
grouped_df_with_inds.head()

In [None]:
# Calculation of the top artists
# TOP_K + median_seq_len candidates are kept so that a user whose history overlaps
# the head of the list still has items left after filtering.

top_artists = (
    grouped_df_with_inds
    .select(pl.col('train_item_ids').alias('artist_id'))
    .explode('artist_id')
    # Exploding an empty training list yields a null row; drop it so that
    # "no artist" can never be counted as a popular artist.
    .drop_nulls('artist_id')
    .group_by('artist_id')
    .len()
    # Secondary key makes the ranking deterministic when play counts tie.
    .sort(['len', 'artist_id'], descending=[True, False])
    .head(TOP_K + median_seq_len)
)['artist_id'].to_list()

In [None]:
# Quality evaluation of the popularity baseline

ndcg_list = []
hitrate_list = []

for user_id, user_history, y_rel in grouped_df_with_inds.rows():
    # Set membership keeps the per-user filtering linear in the number of candidates.
    seen = set(user_history)
    y_rec = [artist_id for artist_id in top_artists if artist_id not in seen]

    # Pass k explicitly: user_ndcg defaults to k=10, while the report below is @TOP_K.
    ndcg_list.append(user_ndcg(y_rel, y_rec, k=TOP_K))
    hitrate_list.append(user_hitrate(y_rel, y_rec, k=TOP_K))

print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.5f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.5f}')

Remember that the recommendations file must contain the **original identifiers (strings)**, not the ones converted to integers!

# Word2Vec

## Main functions

In [None]:
SEED = 42


def set_seed(seed):
    """Seed both the standard-library and the NumPy global random generators."""
    random.seed(seed)
    np.random.seed(seed)

In [None]:
%%time
def evaluate_model(model, k: int = TOP_K):
    """
    Score a trained Word2Vec model on the held-out items of every user.

    :param model: trained model exposing ``predict_output_word(context, topn=...)``
    :param k: cut-off used for both metrics
    :return: tuple (mean NDCG@k, mean Hitrate@k) over all users
    """
    ndcg_scores = []
    hitrate_scores = []
    for _user_id, train_ids, y_rel in grouped_df_with_inds.rows():
        # Ask for enough candidates to still have k left after removing the history.
        model_preds = model.predict_output_word(train_ids, topn=k + len(train_ids))
        if model_preds is None:
            # No prediction for this user: count it as a miss for BOTH metrics so the
            # two averages are taken over the same set of users.
            ndcg_scores.append(0.0)
            hitrate_scores.append(0)
            continue

        seen = set(train_ids)
        y_rec = [item for item, _score in model_preds if item not in seen]
        ndcg_scores.append(user_ndcg(y_rel, y_rec, k=k))
        hitrate_scores.append(user_hitrate(y_rel, y_rec, k=k))
    return np.mean(ndcg_scores), np.mean(hitrate_scores)

## Training W2V

In [None]:
# Training with default parameters
# (no **params here: that name is only defined later, in the submission cell,
# so using it would fail on a fresh kernel)

set_seed(SEED)
# gensim draws its randomness from its own `seed` argument, so pass it explicitly.
model = Word2Vec(grouped_df_with_inds['train_item_ids'].to_list(), seed=SEED, workers=4, epochs=10)
mean_ndcg, mean_hitrate = evaluate_model(model)
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f} Hitrate@{TOP_K} = {mean_hitrate:.4f}')

### Searching for best hyperparameters

In [None]:
def objective(trial):
    """
    Optuna objective: train Word2Vec with the sampled hyperparameters and score it.

    :param trial: optuna trial used to sample the hyperparameters
    :return: mean hit rate reported by ``evaluate_model`` (the value being maximised)
    """
    sg = trial.suggest_categorical('sg', [0, 1])
    window = trial.suggest_int('window', 1, 10)
    ns_exponent = trial.suggest_float('ns_exponent', -3, 3)
    negative = trial.suggest_int('negative', 1, 20)
    min_count = trial.suggest_int('min_count', 0, 20)
    vector_size = trial.suggest_categorical('vector_size', [16, 32, 64, 128])

    print({
        'sg': sg,
        'window_len': window,
        'ns_exponent': ns_exponent,
        'negative': negative,
        'min_count': min_count,
        'vector_size': vector_size,
    })

    set_seed(SEED)
    model = Word2Vec(
        grouped_df_with_inds['train_item_ids'].to_list(),
        window=window,
        sg=sg,
        hs=0,
        min_count=min_count,
        vector_size=vector_size,
        negative=negative,
        ns_exponent=ns_exponent,
        seed=SEED,
        epochs=5,
    )

    mean_ndcg, mean_hitrate = evaluate_model(model)
    # The first value is NDCG (see evaluate_model), not MAP.
    print(f'NDCG@{TOP_K} = {mean_ndcg:.4f} Hitrate@{TOP_K} = {mean_hitrate:.4f}')
    return mean_hitrate
    
    
# Single-objective study; the sampler is seeded so that the sequence of suggested
# hyperparameters is repeatable across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)

study.best_params

In [None]:
# Persist the best hyperparameters found by the study as JSON
with open("optuna_best_params.json", 'w') as params_file:
    params_file.write(json.dumps(study.best_params))

# Submission preparation

In [None]:
# Train on every user's full history using the ORIGINAL identifiers, so that the
# recommendations written below need no reverse mapping.
submission_rows = []
df_subm = data.group_by('user_id', maintain_order=True).agg(pl.col('artist_id'))
params = study.best_params

# Popularity fallback. top_artists holds integer indices, so translate them back to
# the original identifiers before mixing them with the model output.
popular_artists = [
    artist_mapping_inverse[artist_idx]
    for artist_idx in top_artists
    if artist_idx is not None
]

set_seed(SEED)
model_subm = Word2Vec(df_subm['artist_id'].to_list(), **params, seed=SEED, workers=4, epochs=20)

for user_id, user_history in df_subm.rows():
    seen = set(user_history)
    # Request enough candidates to still have TOP_K left after removing the history.
    model_preds = model_subm.predict_output_word(
        user_history, topn=(TOP_K + len(seen))
    )

    y_rec = []
    if model_preds is not None:
        y_rec = [artist for artist, _score in model_preds if artist not in seen][:TOP_K]

    if len(y_rec) < TOP_K:
        # No (or too few) model predictions: top up with popular artists the user
        # has not listened to and that are not already recommended.
        taken = seen.union(y_rec)
        y_rec.extend(artist for artist in popular_artists if artist not in taken)
        y_rec = y_rec[:TOP_K]

    # Every user gets a row, including those served by the fallback alone.
    submission_rows.append((user_id, y_rec))

submission = pl.DataFrame(submission_rows, schema=('user_id', 'y_rec'), orient="row")
submission.write_parquet('submission.parquet')
submission