In [None]:
!pip install recbole
!pip install torch==2.5.1

In [None]:
from typing import Literal
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import os
import torch

In [None]:
class YambdaDataset:
    """Accessor for the parquet files of the ``yandex/yambda`` dataset.

    An instance fixes a layout (``dataset_type``) and a size
    (``dataset_size``). Both are combined into the ``data_dir`` handed to
    ``load_dataset`` when an interaction table is requested; the embeddings
    and the album/artist mappings are requested with an empty ``data_dir``.
    """

    # Event names accepted by `interaction`.
    INTERACTIONS = frozenset({
        "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes",
    })

    def __init__(
        self,
        dataset_type: Literal["flat", "sequential"] = "flat",
        dataset_size: Literal["50m", "500m", "5b"] = "50m"
    ):
        """Store the requested layout and size after validating them.

        Raises:
            AssertionError: if either argument is not one of the allowed
                literals.
        """
        assert dataset_type in {"flat", "sequential"}
        assert dataset_size in {"50m", "500m", "5b"}
        self.dataset_type, self.dataset_size = dataset_type, dataset_size

    def interaction(self, event_type: Literal[
        "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes"
    ]) -> Dataset:
        """Load the table of one event type for this instance's layout/size.

        Args:
            event_type: one of the names in ``INTERACTIONS``.

        Returns:
            The ``train`` split of ``<dataset_type>/<dataset_size>/<event_type>.parquet``.

        Raises:
            AssertionError: if ``event_type`` is not in ``INTERACTIONS``.
        """
        assert event_type in YambdaDataset.INTERACTIONS
        data_dir = f"{self.dataset_type}/{self.dataset_size}"
        return self._download(data_dir, event_type)

    def audio_embeddings(self) -> Dataset:
        """Load ``embeddings.parquet`` (requested with an empty ``data_dir``)."""
        return self._download(data_dir="", file="embeddings")

    def album_item_mapping(self) -> Dataset:
        """Load ``album_item_mapping.parquet`` (requested with an empty ``data_dir``)."""
        return self._download(data_dir="", file="album_item_mapping")

    def artist_item_mapping(self) -> Dataset:
        """Load ``artist_item_mapping.parquet`` (requested with an empty ``data_dir``)."""
        return self._download(data_dir="", file="artist_item_mapping")

    @staticmethod
    def _download(data_dir: str, file: str) -> Dataset:
        """Load ``<file>.parquet`` from ``data_dir`` of ``yandex/yambda``.

        Args:
            data_dir: directory inside the dataset repository.
            file: file name without the ``.parquet`` extension.

        Returns:
            The ``train`` split of the loaded data.

        Raises:
            AssertionError: if ``load_dataset`` does not return a ``DatasetDict``.
        """
        splits = load_dataset(
            "yandex/yambda",
            data_dir=data_dir,
            data_files=f"{file}.parquet",
        )
        # load_dataset hands back a DatasetDict; its only split is "train".
        assert isinstance(splits, DatasetDict)
        return splits["train"]

In [None]:
# Flat layout, "50m" size; fetch the two event tables used below.
dataset = YambdaDataset(dataset_type="flat", dataset_size="50m")
listens = dataset.interaction(event_type="listens")
likes = dataset.interaction(event_type="likes")

In [None]:
# Convert both loaded tables to pandas for the column selection below.
df_listens, df_likes = listens.to_pandas(), likes.to_pandas()

In [None]:
# Likes and listens are both used as positive feedback: stack them (likes
# first) into one frame restricted to the three shared columns.
interaction_columns = ['uid', 'item_id', 'timestamp']
positive_interactions = pd.concat(
    [frame[interaction_columns] for frame in (df_likes, df_listens)]
)

In [None]:
# Give each column a "<name>:<type>" header before the frame is written out.
# NOTE(review): presumably the header format RecBole expects in .inter files — confirm.
typed_column_names = {
    'uid': 'user_id:token',
    'item_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}
# Renamed in place, so the frame is not rebound to a new object.
positive_interactions.rename(columns=typed_column_names, inplace=True)


In [None]:
# Show the combined, renamed interactions frame as this cell's output.
positive_interactions

In [None]:
# Keep a random 10% of the rows, with a fixed seed for the draw.
# NOTE: the result is bound back to the same name, so re-running this cell
# alone subsamples the already-reduced frame again.
positive_interactions = positive_interactions.sample(
    frac=0.1,
    random_state=42,
)

In [None]:
# Write the interactions to <data_root>/<dataset_name>/<dataset_name>.inter
# as a tab-separated file without the index column.
dataset_name = "yambda_positive"
data_root = "/kaggle/working/data"

dataset_path = os.path.join(data_root, dataset_name)
inter_path = os.path.join(dataset_path, f"{dataset_name}.inter")

# The target directory may already exist from a previous run.
os.makedirs(dataset_path, exist_ok=True)
positive_interactions.to_csv(inter_path, sep="\t", index=False)

In [None]:
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.general_recommender import BPR
from recbole.trainer import Trainer

In [None]:
from recbole.model.general_recommender import (
    BPR, NeuMF, Pop
)

# Lookup from the model name used in the config to the model class to instantiate.
MODEL_REGISTRY = dict(
    Pop=Pop,
    BPR=BPR,
    NeuMF=NeuMF,
)

In [None]:
# Names of the models to benchmark, in run order.
models_to_test = [
    "Pop",
    "BPR",
    "NeuMF",
]


In [None]:
# Train and evaluate every model in `models_to_test` on the exported dataset,
# collecting one metrics dict per model in `results`.
from recbole.utils import init_seed

results = []

for model_name in models_to_test:
    print(f"\n=== Running {model_name} ===")
    config = Config(
        model=model_name,
        dataset=dataset_name,
        config_dict={
            "data_path": data_root,
            "epochs": 1,
            "train_batch_size": 2048,
        }
    )

    # Seed the RNGs from the config before any data splitting, sampling or
    # weight initialisation happens, so repeated runs give the same metrics.
    init_seed(config["seed"], config["reproducibility"])

    # Deliberately not named `dataset`: that name holds the YambdaDataset
    # instance created earlier, and overwriting it would break re-running
    # the loading cell after this one.
    recbole_dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, recbole_dataset)

    model_class = MODEL_REGISTRY[model_name]
    model = model_class(config, train_data.dataset).to(config['device'])
    trainer = Trainer(config, model)

    trainer.fit(train_data, valid_data, show_progress=True)
    test_result = trainer.evaluate(test_data)

    # Tag the metrics with the model they belong to.
    test_result["model"] = model_name
    results.append(test_result)

In [None]:
# Tabulate the per-model metrics, persist them as CSV (no index column),
# and echo the table.
df_results = pd.DataFrame(data=results)
df_results.to_csv(path_or_buf="recbole_results.csv", index=False)
print(df_results)