In [10]:
from datetime import datetime

import matplotlib.pyplot  as plt
import numpy as np
import pandas as pd
import seaborn as sns
from replay.metrics import Experiment, HitRate, NDCG, Coverage, OfflineMetrics
from replay.preprocessing.filters import MinCountFilter, LowRatingFilter
from replay.splitters.time_splitter import TimeSplitter
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import os
if "HOME" not in os.environ and "USERPROFILE" in os.environ:
    os.environ["HOME"] = os.environ["USERPROFILE"]
    os.environ["HADOOP_HOME"] = r"C:\hadoop"

In [11]:
import pyspark
print(pyspark.__version__)

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()
print(spark.version)

3.4.4
3.4.1


In [12]:
data = pd.read_csv('interaction_data.csv')
itemdata = pd.read_csv('edadata.csv').drop(columns=['artist_name', 'track_name'])
itemdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23857 entries, 0 to 23856
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genre             23857 non-null  int64  
 1   popularity        23857 non-null  int64  
 2   acousticness      23857 non-null  float64
 3   danceability      23857 non-null  float64
 4   duration_ms       23857 non-null  int64  
 5   energy            23857 non-null  float64
 6   instrumentalness  23857 non-null  float64
 7   key               23857 non-null  int64  
 8   liveness          23857 non-null  float64
 9   loudness          23857 non-null  float64
 10  mode              23857 non-null  int64  
 11  speechiness       23857 non-null  float64
 12  tempo             23857 non-null  int64  
 13  time_signature    23857 non-null  int64  
 14  valence           23857 non-null  float64
 15  item_id           23857 non-null  int64  
 16  genre_name        23857 non-null  object

In [13]:
data['timestamp'] = pd.to_datetime(data['timestamp']).astype('int64') // 10**9

In [14]:
# Определяем временной порог как 80-й перцентиль
time_threshold = int(data['timestamp'].quantile(0.8))

train, test = TimeSplitter(
    time_threshold=time_threshold,
    drop_cold_users=True,
    drop_cold_items=True,
    query_column='user_id'
).split(data)

In [15]:
print('Юзеров в train = {}'.format(train.user_id.nunique()))
print('Айтемов в train = {}'.format(train.item_id.nunique()))

Юзеров в train = 10000
Айтемов в train = 23855


In [16]:
train

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,221806,3,1746372237
1,0,154981,3,1748931290
2,0,225744,1,1751332661
3,0,166571,3,1748075302
5,0,222646,4,1752226060
...,...,...,...,...
400059,9999,166393,3,1743809966
400060,9999,223553,4,1747153576
400061,9999,166976,2,1730043251
400062,9999,222818,3,1735458091


In [None]:
import pandas as pd
from replay.data.dataset import Dataset, FeatureSchema, FeatureInfo, FeatureHint, FeatureType
from replay.models import ItemKNN
from replay.utils.spark_utils import convert2spark
data_frame = train.drop(columns=['timestamp'])
interactions = convert2spark(data_frame)
feature_schema = FeatureSchema(
    [
        FeatureInfo(
            column="user_id",
            feature_type=FeatureType.NUMERICAL,
            feature_hint=FeatureHint.QUERY_ID,
        ),
        FeatureInfo(
            column="item_id",
            feature_type=FeatureType.NUMERICAL,
            feature_hint=FeatureHint.ITEM_ID,
        ),
        FeatureInfo(
            column="rating",
            feature_type=FeatureType.NUMERICAL,
            feature_hint=FeatureHint.RATING,
        ),
    ]
)
dataset = Dataset(feature_schema, interactions)
model = ItemKNN(use_rating = True, num_neighbours=20)
model.fit(dataset)
model.predict(dataset, k=2, queries=[1,2,3,4], items=[1,2,3]).toPandas().sort_values(["user_id","rating","item_id"],
ascending=[True,False,True]).reset_index(drop=True)



02-Nov-25 23:59:52, replay, INFO: ItemKNN model can't predict cold items, they will be ignored


Unnamed: 0,user_id,item_id,rating


In [18]:
from replay.metrics import HitRate, NDCG, Coverage, MAP, MRR
from replay.metrics.base_metric import OfflineMetrics

# Выбираем метрики (k — глубина рекомендаций)
metrics = [
    HitRate(k=10),
    NDCG(k=10),
    MAP(k=10),
    MRR(k=10),
    Coverage(k=10)  # доля уникальных айтемов, рекомендованных хотя бы раз
]

# Создаём эксперимент
experiment = OfflineMetrics(
    test_data=test,
    recommendations=model,
    metrics=metrics,
    train_data=train  # обязательно для filter_seen!
)

ImportError: cannot import name 'OfflineMetrics' from 'replay.metrics.base_metric' (c:\Users\mypc\Code\venv\Lib\site-packages\replay\metrics\base_metric.py)

In [None]:
recs = model.predict(
    users=test["100"].unique(),  # или любые user_id
    k=10,
    items_to_predict=None,           # можно ограничить пул айтемов
    filter_seen_items=True           # не рекомендовать уже взаимодействовавшие
)

print(recs.head())
# Вывод: user_id | item_id | relevance

# Дальше не работает

In [None]:
from src.r_itemkNN import ItemKNN as r_ItemkNN

model = r_ItemkNN(k_neighbours=10, filter_seen=True)
model.fit(train)

# Предсказать для всех юзеров из test
pred = model.predict(users=test['user_id'].unique(), k=10)

# Оценить метрики через replay
from replay.metrics import NDCG, HitRate, Coverage
metrics = [HitRate(topk=10), NDCG(topk=10), Coverage(topk=10)]
result = OfflineMetrics(metrics, query_column='user_id')(pred, test, train)
print(result)

{'HitRate@10': 0.0035175879396984926, 'NDCG@10': 0.0007118423070190877, 'Coverage@10': 0.46639717028802424}


In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Фильтрация: оставляем пользователей с ≥20 взаимодействий
from replay.preprocessing.filters import MinCountFilter
data = MinCountFilter(num_entries=20).transform(data)

# Разбиение по времени
from replay.splitters.time_splitter import TimeSplitter
train, test = TimeSplitter(
    time_threshold=0.2,
    drop_cold_users=True,
    drop_cold_items=True,
    query_column='user_id'
).split(data)

# Оставляем только "позитивные" взаимодействия в тесте (оценка ≥4)
from replay.preprocessing.filters import LowRatingFilter
test = LowRatingFilter(value=4).transform(test)
test = test[test['user_id'].isin(train['user_id'].unique())]

# --- Используем встроенную модель ItemKNN из RePlay ---
from replay.models import ItemKNN

# Инициализация модели: косинусное сходство ≈ use_rating=False + нормализация по популярности
model = ItemKNN(
    num_neighbours=10,
    use_rating=False,      # работаем с бинарными взаимодействиями
    weighting=None,        # или 'tf_idf', 'bm25' — но не 'cosine'
    shrink=0               # регуляризация (по умолчанию 0)
)

# Обучение модели
model.fit(train)

# Предсказание: топ-10 рекомендаций для пользователей из теста
recs = model.predict(
    k=10,
    users=test["user_id"].unique(),
    items=train["item_id"].unique()
)

# --- Оценка качества ---
from replay.metrics import HitRate, NDCG, Coverage, OfflineMetrics

metrics = [HitRate(topk=10), NDCG(topk=10), Coverage(topk=10)]
evaluator = OfflineMetrics(metrics, query_column='user_id')
results = evaluator(recommendations=recs, ground_truth=test, train=train)

print("Результаты ItemKNN (RePlay):")
print(results)