In [67]:
from datetime import datetime

import matplotlib.pyplot  as plt
import numpy as np
import pandas as pd
import seaborn as sns
from replay.metrics import HitRate, NDCG, Coverage, OfflineMetrics
from replay.preprocessing.filters import MinCountFilter, LowRatingFilter
from replay.splitters.time_splitter import TimeSplitter
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import os
if "HOME" not in os.environ and "USERPROFILE" in os.environ:
    os.environ["HOME"] = os.environ["USERPROFILE"]
    os.environ["HADOOP_HOME"] = r"C:\hadoop"

In [68]:
data = pd.read_csv('interaction_data.csv')
itemdata = pd.read_csv('edadata.csv').drop(columns=['artist_name', 'track_name'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178649 entries, 0 to 178648
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    178649 non-null  int64 
 1   item_id    178649 non-null  int64 
 2   rating     178649 non-null  int64 
 3   timestamp  178649 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.5+ MB


In [69]:
data_relevant = data[data['rating'] >= 3].copy()
data_relevant['rating'] = 1
data_relevant.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,2518,1,2025-10-24 20:30:48.409183159
1,0,1740,1,2025-04-20 11:03:28.476727650
2,0,58,1,2025-07-08 13:56:59.123155307
3,0,985,1,2025-03-28 01:39:52.441293532
4,0,1016,1,2025-03-17 20:40:58.400774464


In [70]:
train, test = TimeSplitter(time_threshold=0.2,
                           drop_cold_users=True,
                           drop_cold_items=True,
                           query_column='user_id').split(data_relevant)

In [71]:
all_items = set(itemdata['item_id'])
interacted_items = set(data['item_id'])
missing = all_items - interacted_items
print(f"Не покрыто треков: {len(missing)}")

Не покрыто треков: 0


In [72]:
print('Юзеров в train = {}'.format(train.user_id.nunique()))
print('Айтемов в train = {}'.format(train.item_id.nunique()))

Юзеров в train = 4500
Айтемов в train = 4294


In [73]:
from src.r_itemkNN import ItemKNN

model = ItemKNN()
model.fit(train)

<src.r_itemkNN.ItemKNN at 0x1a693b0ff10>

In [74]:
predict = model.predict(test.user_id.unique())

In [75]:
metrics = [HitRate(topk=10), NDCG(topk=10), Coverage(topk=10)]
OfflineMetrics(metrics, query_column='user_id')(predict, test, train)

{'HitRate@10': 0.20315976858032933,
 'NDCG@10': 0.043188696407590516,
 'Coverage@10': 0.9748486259897532}

In [None]:
from src.r_slim import Slim

model = Slim()
model.fit(train)

In [None]:
metrics = [HitRate(topk=10), NDCG(topk=10), Coverage(topk=10)]
OfflineMetrics(metrics, query_column='user_id')(predict, test, train)

In [None]:
sns.heatmap(model.X @ model.W, cmap='Greys');