In [58]:
from datetime import datetime

import matplotlib.pyplot  as plt
import numpy as np
import pandas as pd
import seaborn as sns
from replay.metrics import HitRate, NDCG, Coverage, OfflineMetrics
from replay.preprocessing.filters import MinCountFilter, LowRatingFilter
from replay.splitters.time_splitter import TimeSplitter
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')

import os
# When HOME is missing but USERPROFILE is present, mirror USERPROFILE into HOME.
# NOTE(review): presumably a Windows workaround for tools that look up HOME — confirm.
if "HOME" not in os.environ and "USERPROFILE" in os.environ:
    os.environ["HOME"] = os.environ["USERPROFILE"]
    # Supply the fallback Hadoop location only when none is configured, so a
    # HADOOP_HOME that is already set is not clobbered by this machine-specific path.
    os.environ.setdefault("HADOOP_HOME", r"C:\hadoop")

In [59]:
# Interaction log: one row per (user_id, item_id, rating, timestamp).
data = pd.read_csv('interaction_data.csv')

# Item table, with the two name columns removed.
itemdata = pd.read_csv('edadata.csv')
itemdata = itemdata.drop(columns=['artist_name', 'track_name'])

# Print column dtypes and non-null counts of the interaction log.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178286 entries, 0 to 178285
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    178286 non-null  int64 
 1   item_id    178286 non-null  int64 
 2   rating     178286 non-null  int64 
 3   timestamp  178286 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.4+ MB


In [60]:
# Keep only interactions rated 3 or higher and binarise them:
# every surviving row gets rating = 1.
is_relevant = data['rating'].ge(3)
data_relevant = data.loc[is_relevant].assign(rating=1)
data_relevant.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,2027,1,2025-05-13 11:18:57.004464816
1,0,35,1,2025-08-08 15:04:50.048190407
2,0,4076,1,2025-06-06 21:45:56.973056212
3,0,2518,1,2025-07-14 00:07:31.081573550
4,0,1585,1,2025-08-11 23:05:33.987562101


In [61]:
# Time-based train/test split of the binarised interactions; users and items
# that are cold in the test part are dropped.
# NOTE(review): time_threshold=0.2 presumably sends the most recent 20% of
# interactions to test — confirm against the TimeSplitter documentation.
splitter = TimeSplitter(
    time_threshold=0.2,
    drop_cold_users=True,
    drop_cold_items=True,
    query_column='user_id',
)
train, test = splitter.split(data_relevant)

In [62]:
# Sanity check: count item ids present in the item table that never occur
# in the raw interaction log.
all_items = set(itemdata['item_id'])
interacted_items = set(data['item_id'])
missing = all_items.difference(interacted_items)
n_missing = len(missing)
print(f"Не покрыто треков: {n_missing}")

Не покрыто треков: 0


In [63]:
# Report how many distinct users and items remain in the training part.
n_train_users = train['user_id'].nunique()
n_train_items = train['item_id'].nunique()
print('Юзеров в train = {}'.format(n_train_users))
print('Айтемов в train = {}'.format(n_train_items))

Юзеров в train = 4500
Айтемов в train = 4294


In [64]:
from src.r_itemkNN import ItemKNN

# Fit the project's ItemKNN recommender on the training interactions.
# NOTE(review): `model` is rebound to a different recommender in a later cell,
# so anything that reads `model` depends on which cell ran last.
model = ItemKNN()
# Left as the final expression so the value returned by fit() is displayed.
model.fit(train)

<src.r_itemkNN.ItemKNN at 0x1a693d20e50>

In [65]:
# Ask the fitted model for recommendations for every distinct user in test.
test_users = test['user_id'].unique()
predict = model.predict(test_users)

In [66]:
# Evaluate the recommendations with HitRate, NDCG and Coverage, each at top-10.
metrics = [metric_cls(topk=10) for metric_cls in (HitRate, NDCG, Coverage)]
evaluate = OfflineMetrics(metrics, query_column='user_id')
evaluate(predict, test, train)

{'HitRate@10': 0.20475767007558915,
 'NDCG@10': 0.04214237136356075,
 'Coverage@10': 0.9734513274336283}

In [None]:
from src.r_slim import Slim

# Rebind `model` to the project's Slim recommender and fit it on the same
# training interactions; the previously fitted ItemKNN instance is no longer
# reachable through `model` after this cell.
model = Slim()
model.fit(train)

In [None]:
# Score the Slim model on the held-out users. Predictions are regenerated
# here because `predict` still holds the ItemKNN recommendations; evaluating
# it would only repeat the ItemKNN metrics.
# NOTE(review): assumes Slim.predict accepts the same array of user ids as
# ItemKNN.predict — confirm against src/r_slim.
predict_slim = model.predict(test.user_id.unique())

# Same metric set as for ItemKNN so the two models are directly comparable.
metrics = [HitRate(topk=10), NDCG(topk=10), Coverage(topk=10)]
OfflineMetrics(metrics, query_column='user_id')(predict_slim, test, train)

In [None]:
# Greyscale heatmap of the matrix product of the fitted model's X and W
# attributes; the trailing semicolon hides the Axes repr.
xw_product = model.X @ model.W
sns.heatmap(xw_product, cmap='Greys');