In [None]:
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import polars as pl

# Ranking cutoff: the metrics below only look at the first K recommendations.
K = 10
# Positional DCG discounts 1 / log2(rank + 1) for ranks 1..K.
ndcg_weights = 1.0 / np.log2(np.arange(2, K + 2))
# ndcg_idcg[n - 1] is the ideal DCG when n relevant items occupy the top n ranks.
ndcg_idcg = np.cumsum(ndcg_weights)


def evaluate(ref_path, pred_path, train_path):
    """Score a submission file against the reference and training data.

    Args:
        ref_path: Parquet file with the ground truth; must contain ``user_id``
            and ``item_id`` columns. NOTE(review): ``ndcg_per_user`` calls
            ``len()`` on each ``item_id`` value, so this presumably holds one
            row per user with a list of items -- confirm against the data.
        pred_path: Parquet file with the submission; ``user_id`` and a
            list-valued ``item_id`` column (cast to ``List(Int64)``).
        train_path: Parquet file with the training interactions, used for
            the coverage and novelty metrics.

    Returns:
        dict with keys ``'ndcg'``, ``'hitrate'``, ``'coverage'`` and
        ``'novelty'``. ``'ndcg'`` and ``'hitrate'`` are means over the rows
        of the reference frame and are ``None`` when that frame is empty.
    """
    submission = pl.read_parquet(pred_path)
    ref_df = pl.read_parquet(ref_path)
    train = pl.read_parquet(train_path)

    submission = (
        submission
        .select(
            pl.col("user_id").cast(pl.Int64),
            pl.col("item_id").cast(pl.List(pl.Int64)).alias("predicted"),
        )
        # Keep the first row of a duplicated user deterministically; the
        # default keep strategy may pick any of the duplicates.
        .unique(subset="user_id", keep="first", maintain_order=True)
        # A repeated item inside one list must not be counted twice.
        .with_columns(pl.col("predicted").list.unique(maintain_order=True))
    )

    ground_truth = ref_df.with_columns(
        # Same key dtype as the submission so the join cannot fail on a
        # dtype mismatch.
        pl.col("user_id").cast(pl.Int64),
        pl.col("item_id").alias("ground_truth"),
    )

    # Left join from the ground truth: users missing from the submission get
    # a null "predicted" and are scored 0 by the per-user metric functions.
    submission_with_gt = ground_truth.join(submission, on="user_id", how="left")

    pairs = pl.struct("predicted", "ground_truth")
    metrics_per_user = submission_with_gt.select(
        pl.col("user_id"),
        pairs.map_elements(ndcg_per_user, return_dtype=pl.Float64).alias("ndcg"),
        pairs.map_elements(hitrate_per_user, return_dtype=pl.Float64).alias("hitrate"),
    )
    mean_ndcg = metrics_per_user["ndcg"].mean()
    mean_hitrate = metrics_per_user["hitrate"].mean()

    coverage = compute_coverage(submission, train)
    novelty = compute_novelty(submission, train)

    return {'ndcg': mean_ndcg,
            'hitrate': mean_hitrate,
            'coverage': coverage,
            'novelty': novelty}


def ndcg_per_user(pl_struct: Mapping[str, Sequence[int]], k: "int | None" = None) -> float:
    """Binary-relevance NDCG@k for a single user.

    Args:
        pl_struct: Mapping with keys ``"predicted"`` (ranked item ids, or
            ``None`` when the user has no prediction) and ``"ground_truth"``
            (non-empty collection of relevant item ids).
        k: Ranking cutoff. ``None`` (default) uses the module-level ``K``
            together with the precomputed ``ndcg_weights`` / ``ndcg_idcg``.

    Returns:
        NDCG in ``[0, 1]`` as a builtin float; ``0.0`` when ``predicted`` is
        ``None``.

    Raises:
        AssertionError: if ``ground_truth`` is ``None`` or empty.
        ValueError: if an explicit ``k`` is smaller than 1.
    """
    predicted = pl_struct["predicted"]
    ground_truth = pl_struct["ground_truth"]

    if predicted is None:
        return 0.0

    assert ground_truth is not None
    assert len(ground_truth) > 0

    if k is None:
        cutoff, weights, ideal_dcg = K, ndcg_weights, ndcg_idcg
    else:
        if k < 1:
            raise ValueError("k must be a positive integer")
        cutoff = k
        weights = 1.0 / np.log2(np.arange(2, cutoff + 2))
        ideal_dcg = np.cumsum(weights)

    # Every item counts at most once on both sides; otherwise a repeated id
    # is scored as several hits and the result can exceed 1.
    ranked_np = np.asarray(list(dict.fromkeys(predicted))[:cutoff])
    relevant_np = np.unique(np.asarray(ground_truth))

    hits = np.isin(ranked_np, relevant_np)
    dcg = (hits * weights[:len(ranked_np)]).sum()
    # Ideal ranking: all relevant items (at most `cutoff`) placed first.
    idcg = ideal_dcg[min(len(relevant_np), cutoff) - 1]
    return float(dcg / idcg)


def hitrate_per_user(pl_struct: Mapping[str, Sequence[int]], k: "int | None" = None) -> float:
    """HitRate@k for a single user.

    Args:
        pl_struct: Mapping with keys ``"predicted"`` (ranked item ids, or
            ``None`` when the user has no prediction) and ``"ground_truth"``
            (non-empty collection of relevant item ids).
        k: Ranking cutoff. ``None`` (default) uses the module-level ``K``.

    Returns:
        ``1.0`` if at least one of the first ``k`` predicted items is in the
        ground truth, otherwise ``0.0`` (also when ``predicted`` is ``None``).
        Always a float, matching the annotation and the ``return_dtype``
        used by the caller.

    Raises:
        AssertionError: if ``ground_truth`` is ``None`` or empty.
        ValueError: if an explicit ``k`` is smaller than 1.
    """
    predicted = pl_struct["predicted"]
    ground_truth = pl_struct["ground_truth"]

    if predicted is None:
        return 0.0

    assert ground_truth is not None
    assert len(ground_truth) > 0

    if k is None:
        cutoff = K
    elif k < 1:
        raise ValueError("k must be a positive integer")
    else:
        cutoff = k

    top_predicted = set(predicted[:cutoff])
    return float(not top_predicted.isdisjoint(ground_truth))


def compute_coverage(submission, train):
    """Share of distinct training items that are recommended at least once.

    Args:
        submission: Frame with a list-valued ``predicted`` column.
        train: Frame with an ``item_id`` column.

    Returns:
        ``|recommended items that occur in train| / |distinct train items|``,
        or ``0.0`` when ``train`` has no items.
    """
    prediction_lists = submission.select('predicted').to_series().to_list()
    recommended_items = {
        item
        for items in prediction_lists
        if items is not None  # a null prediction list contributes nothing
        for item in items
    }

    catalog_items = train.select('item_id').unique().to_series().to_list()
    if not catalog_items:
        # Nothing to cover; avoid dividing by zero.
        return 0.0

    return len(recommended_items.intersection(catalog_items)) / len(catalog_items)


def compute_novelty(submission, train):
    """Average normalised self-information of the recommended items.

    For each train item, novelty is ``-log2(count / n) / log2(n)``, where
    ``count`` is the number of train rows with that item and ``n`` is the
    total number of train rows. The result is the sum of the novelties of
    all recommended items found in ``train`` divided by the total number of
    recommendations, so items absent from ``train`` add nothing to the
    numerator but still count in the denominator.

    Args:
        submission: polars frame with a list-valued ``predicted`` column.
        train: polars frame with an ``item_id`` column, one row per
            interaction.

    Returns:
        The novelty score as a float; ``0.0`` when there are no
        recommendations or when ``train`` has fewer than two rows (the
        normaliser ``log2(n)`` would be zero or undefined).
    """
    num_interactions = len(train)

    prediction_lists = submission.select('predicted').to_series().to_list()
    all_pred_items = [
        item
        for items in prediction_lists
        if items is not None  # a null prediction list contributes nothing
        for item in items
    ]
    num_recommendations = len(all_pred_items)

    if num_recommendations == 0 or num_interactions < 2:
        return 0.0

    item_probability = pl.col('count') / num_interactions
    item_stats = (
        train
        .group_by('item_id')
        # GroupBy.count() is deprecated; aggregate the group size explicitly.
        .agg(pl.len().alias('count'))
        .with_columns(
            (-(item_probability.log(2.0)) / np.log2(num_interactions)).alias('item_novelty')
        )
        .select('item_id', 'item_novelty')
        .to_pandas()
    )

    # How many times each item was recommended across all users.
    recs_items = pd.Series(all_pred_items).value_counts().reset_index()
    recs_items.columns = ['item_id', 'item_count']
    recs_items = recs_items.merge(item_stats, on='item_id', how='inner')

    weighted_novelty = (recs_items['item_count'] * recs_items['item_novelty']).sum()
    return float(weighted_novelty / num_recommendations)

In [None]:
# Input files for the evaluate() call in the following cell.
# The paths are relative to the current working directory.
prediction_path = "my_submission_smm.parquet"
ground_truth_path = "test_smm.parquet"
train_path = "train_smm.parquet"

In [None]:
evaluate(ground_truth_path, prediction_path, train_path)

  item_stats = train.group_by('item_id').count()


{'ndcg': None,
 'hitrate': None,
 'coverage': 5.969721572185873e-05,
 'novelty': 0.425445279591621}

In [None]:
! python3 train_predict.py

n-core filtering step 1
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2024-11-23 20:28:20.329857: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 20:28:20.358147: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 20:28:20.366682: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 20:28:20.392349: I tensorflow/core/platfor

In [None]:
! cp ../train_smm.parquet train_smm.parquet

In [None]:
%cd seqs
# ! ls

/content/drive/MyDrive/Makaki_RecSys/seqs


In [None]:
! zip -r seqs_als_submission.zip ALS.py  train_predict.py SASReq_model.py datasets.py models.py modules.py postprocess.py requirements.txt split.py

updating: ALS.py (deflated 73%)
updating: SASReq_model.py (deflated 66%)
updating: models.py (deflated 73%)
updating: modules.py (deflated 74%)
updating: postprocess.py (deflated 55%)
updating: requirements.txt (deflated 15%)
updating: split.py (deflated 69%)
updating: train_predict.py (deflated 70%)
updating: datasets.py (deflated 75%)


In [None]:
! rm -r seqs/__pycache__

In [None]:
import pandas as pd

In [None]:
s = pd.Series([1, 2, 3], index=[10, 11, 12])
s.reset_index().iloc[:1,:]

Unnamed: 0,index,0
0,10,1


In [1]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# %cd /content/drive/MyDrive/Makaki_RecSys


In [None]:
! tar -xvf seqs.tar

seqs/
seqs/__pycache__/
seqs/__pycache__/datasets.cpython-39.pyc
seqs/__pycache__/models.cpython-39.pyc
seqs/__pycache__/modules.cpython-39.pyc
seqs/__pycache__/postprocess.cpython-39.pyc
seqs/__pycache__/split.cpython-39.pyc
seqs/models.py
seqs/.ipynb_checkpoints/
seqs/.ipynb_checkpoints/SASRec_zvuk_final-checkpoint.ipynb
seqs/modules.py
seqs/postprocess.py
seqs/split.py
seqs/datasets.py
seqs/SASRec_zvuk_final.ipynb


In [None]:
import pandas as pd
sub = pd.read_parquet("submission_zvuk.parquet")
sub

In [None]:
sub

total 1.8G
 38K -rw------- 1 root root  38K Nov 23 14:34 metrics.ipynb
5.5K -rw------- 1 root root 5.4K Nov 23 14:32 top10_train_predict.py
4.0K drwx------ 2 root root 4.0K Nov 23 14:29 __pycache__
4.5K -rw------- 1 root root 4.2K Nov 23 14:09 raw_train_predict.py
149K -rw------- 1 root root 149K Nov 23 13:41 Makarov_Sasha.ipynb
1.5K -rw------- 1 root root 1.1K Nov 23 13:39 train_predict.py
 36K -rw------- 1 root root  36K Nov 23 13:09 sus_submission.zip
237M -rw------- 1 root root 237M Nov 23 12:47 SASRec_ranks-2.csv
130K -rw------- 1 root root 130K Nov 23 12:13 seqs.tar
4.0K drwx------ 3 root root 4.0K Nov 23 12:12 seqs
3.0K -rw------- 1 root root 2.8K Nov 23 12:12 feature_func.py
238M -rw------- 1 root root 238M Nov 23 11:50 SASRec_ranks.csv
1.5K -rw------- 1 root root 1.4K Nov 23 11:29 make_features.py
2.5K -rw------- 1 root root 2.4K Nov 23 11:28 split.py
2.0K -rw------- 1 root root 1.8K Nov 23 11:08 top10_submission.zip
 60M -rw------- 1 root root  60M Nov 23 11:07 submission_zvu

In [None]:
sub_smm = pd.read_parquet("my_submission_smm.parquet").iloc[:2, :]
sub_smm

Unnamed: 0,user_id,item_id
0,2353151,"[74610, 20776, 57107, 15432, 31984, 20166, 845..."
1,9824229,"[74610, 20776, 57107, 15432, 31984, 20166, 845..."


In [None]:
sus = pd.DataFrame({
    # 'index': [1, 2, 3, 4, 5],
    'user_id': [11, 11, 11, 12, 12],
    'item_id': [21, 22, 23, 24, 25],
    'prediction': [1, 1, 2, 6, 5]
})


def SusRec_to_answer_df(pred: pd.DataFrame, k=10) -> pd.DataFrame:
    """Collapse scored (user, item) rows into one top-k item list per user.

    Args:
        pred: Frame with columns ``user_id``, ``item_id`` and ``prediction``
            (the score; higher is better).
        k: Number of items to keep per user.

    Returns:
        Frame with columns ``user_id`` and ``item_id``; one row per user in
        order of first appearance in ``pred``. ``item_id`` holds the list of
        that user's items sorted by descending ``prediction``, truncated to
        ``k``. Items with equal scores keep their original relative order.
    """
    rows = []
    # One pass over the groups instead of re-filtering the frame per user.
    for uid, user_rows in pred.groupby('user_id', sort=False):
        # A stable sort makes the order of tied scores deterministic.
        ranked = user_rows.sort_values('prediction', ascending=False, kind='stable')
        rows.append({'user_id': uid, 'item_id': ranked['item_id'].to_list()[:k]})
    return pd.DataFrame(rows, columns=['user_id', 'item_id'])

SusRec_to_answer_df(sus, k=2)


Unnamed: 0,user_id,item_id
0,11,"[23, 21]"
1,12,"[24, 25]"


In [None]:
# Post-process recommendations: for every user drop the items listed for that
# user in test_smm.parquet, keep the first N_RECS remaining items, and pad
# short lists from `top_items`.
# NOTE(review): `recs` and `top_items` are not defined in this cell; they
# must already exist in the kernel (`recs` is built in a later cell of this
# notebook), so this cell does not run on a fresh kernel as ordered.
N_RECS = 10

result_df = []
test_smm = pd.read_parquet("test_smm.parquet")
smm_set = test_smm.groupby('user_id')['item_id'].apply(set).reset_index()

# Build the per-user lookups once instead of re-filtering both frames for
# every user (and, for the exclusion set, for every candidate item).
excluded_by_user = dict(zip(smm_set['user_id'], smm_set['item_id']))
first_recs_per_user = recs.drop_duplicates('user_id').set_index('user_id')['item_id']

for user_id, user_items in first_recs_per_user.items():
    # Users absent from test_smm simply have nothing to exclude.
    excluded = excluded_by_user.get(user_id, set())

    pred_items = []
    for item_id in user_items:
        if item_id not in excluded:
            pred_items.append(item_id)
        if len(pred_items) == N_RECS:
            break

    # Pad from the fallback list without repeating an item already chosen.
    # As before, the exclusion set is not applied to padded items. If
    # `top_items` runs out, the list stays shorter than N_RECS instead of
    # raising IndexError.
    for item_id in top_items:
        if len(pred_items) >= N_RECS:
            break
        if item_id not in pred_items:
            pred_items.append(item_id)

    # NOTE(review): the key is 'item_ids' here, while the submission files
    # read elsewhere in this notebook use 'item_id' -- confirm before export.
    result_df.append({'user_id': user_id, 'item_ids': pred_items})

In [None]:
# Turn the per-user recommendations into a two-column frame: one row per
# entry of `user_ids`, with that user's recommended items as a numpy array.
# NOTE(review): `recs`, `user_ids`, `cfg_data` and `Path` come from cells
# that are not shown here; `recs` must still be the object with `.tolist()`,
# so re-running this cell after it has rebound `recs` fails.
recs = (
    pd.Series(list(map(np.array, recs.tolist())), index=user_ids)
    .reset_index()
    .set_axis(["user_id", "item_id"], axis=1)
)

# Write the submission next to the other data files.
prediction_path = Path(cfg_data["data_dir"]) / "submission_smm.parquet"
recs.to_parquet(prediction_path)


In [None]:
import EASE
import pandas as pd
from scipy.sparse import csr_matrix

# Work on the first 10,000 rows of the training interactions only.
train = pd.read_parquet("train_smm.parquet").iloc[:10000, :]


In [None]:

# pd.factorize(train['user_id'])
# Scratch check of Series.factorize(): `codes` holds, for every element, the
# position of its value in `uniques`, and `uniques` lists the distinct values
# in order of first appearance.
s = pd.Series([1, 5, 3, 3, 5, 7, 9])
codes, uniques = s.factorize()
codes, uniques


(array([0, 1, 2, 2, 1, 3, 4]), Index([1, 5, 3, 7, 9], dtype='int64'))

In [None]:
# Map raw user / item ids to dense 0-based codes so they can be used as row
# and column indices of a sparse matrix (the id look-up tables are dropped).
codes_uids, _ = pd.factorize(train["user_id"])
codes_itemids, _ = pd.factorize(train["item_id"])

n_users = train["user_id"].nunique()
n_items = train["item_id"].nunique()

# User x item matrix whose entries are the `rating` values.
smm_train_csr = csr_matrix(
    (train["rating"], (codes_uids, codes_itemids)),
    shape=(n_users, n_items),
)

# Fit EASE on the interaction matrix with L2 regularisation 0.02.
ease = EASE.EASE(l2=0.02)
ease.fit(smm_train_csr)

Constructing G...
Density of G: 0.1588%
Inverting G...


In [None]:
ease.B.shape, n_users, n_items

((8344, 8344), 2469, 8344)

In [None]:
! git clone https://github.com/matospiso/recsys24-abs-ease.git

Cloning into 'recsys24-abs-ease'...
remote: Enumerating objects: 273, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 273 (delta 46), reused 37 (delta 27), pack-reused 200 (from 1)[K
Receiving objects: 100% (273/273), 21.42 MiB | 14.22 MiB/s, done.
Resolving deltas: 100% (149/149), done.
