In [11]:
import surprise
from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate

In [2]:
import pandas as pd
import numpy as np

import scipy.stats as ss
from sklearn.metrics import ndcg_score

In [3]:
# Machine-specific absolute locations of the homework CSV files
# (a /mnt/c/... mount, as the `wsl_` prefix suggests).
wsl_train_path, wsl_test_path = (
    "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/train.csv",
    "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/test.csv",
)
# Read both files into DataFrames, train first and then test.
train_data, test_data = map(pd.read_csv, (wsl_train_path, wsl_test_path))

In [4]:
# Map every distinct raw `user` value seen in train to a dense 0-based index.
user_id_map = {
    raw_user: position
    for position, raw_user in enumerate(train_data['user'].unique())
}

In [5]:
# Attach the dense integer `user_id` to both frames, train first and then test.
# NOTE(review): a test user missing from `user_id_map` maps to NaN, which `int`
# cannot convert, so this presumably relies on every test user appearing in
# train. Confirm.
for frame in (train_data, test_data):
    frame["user_id"] = frame["user"].map(user_id_map).map(int)

In [21]:
# Wrap the (user_id, track, time) triples as a Surprise dataset; `time` is used
# as the rating. NOTE(review): rating_scale=(0, 1) presumably matches the range
# of `time`. Confirm against the data.
rating_columns = ['user_id', 'track', 'time']
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(train_data[rating_columns], reader)

In [24]:
# SVD++ with 100 latent factors, trained for 20 epochs; verbose=True prints a
# line per epoch. random_state pins the random factor initialisation so that
# "Restart & Run All" reproduces the same model and the same scores.
model = SVDpp(
    n_factors=100,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
    verbose=True,
)

In [25]:
# Train on every rating in `data` (no hold-out split).
full_trainset = data.build_full_trainset()
model.fit(full_trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fc6c9272790>

In [None]:
# predictions = model.predict(svd, test, usercol="userID", itemcol="itemID")
# predictions.head()

In [27]:
# Score every (user_id, track) pair of the test set with the fitted model.
# The two columns are iterated in lockstep instead of using DataFrame.iterrows(),
# which builds a Series per row (slow) and may coerce dtypes across columns.
predictions = [
    model.predict(uid, iid).est
    for uid, iid in zip(test_data['user_id'], test_data['track'])
]

In [29]:
# Share of test predictions that are not exactly zero.
nonzero_count = np.count_nonzero(np.asarray(predictions))
nonzero_count / len(predictions)

0.989779197902585

In [30]:
# Store the scores next to the test pairs and write the submission columns to CSV.
submission_columns = ["user", "track", "score"]
test_data['score'] = predictions
test_data[submission_columns].to_csv("./data/svd/test_svd_1.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.accuracy import ndcg_at_k

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Prepare the data for Surprise; `time` is used as the rating.
# NOTE(review): rating_scale=(0, 1) presumably matches the range of `time`. Confirm.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(train_df[['user', 'track', 'time']], reader)

# Hold out 20% of the interactions for validation (seeded split)
trainset, validset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Initialise and train SVD++. The model is seeded as well, so that both the
# split and the factor initialisation (and hence the validation metric) are
# reproducible across runs.
model = SVDpp(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

# Helper that produces recommendations for a single user
def get_recommendations(model, user_id, n=10):
    """Return the ids of the `n` highest-scored tracks the user has no row for.

    Candidates are all tracks in the module-level `train_df` minus the tracks
    that appear in `train_df` rows of `user_id`. Each candidate is scored with
    `model.predict(user_id, track)` and ranked by the prediction's `est`,
    highest first.

    Args:
        model: object exposing `predict(user, track)` whose result has `.est`
            and `.iid` attributes.
        user_id: value compared against `train_df['user']`.
        n: number of track ids to return.

    Returns:
        List of `iid` values of the top-`n` predictions.
    """
    listened = train_df.loc[train_df['user'] == user_id, 'track'].unique()
    catalogue = train_df['track'].unique()
    candidates = np.setdiff1d(catalogue, listened)

    # sorted() is stable, so ties keep the candidate order.
    ranked = sorted(
        (model.predict(user_id, track) for track in candidates),
        key=lambda prediction: prediction.est,
        reverse=True,
    )
    return [prediction.iid for prediction in ranked[:n]]

# Generate recommendations for every user that occurs in the test set
test_users = test_df['user'].unique()
recommendations = {
    user: get_recommendations(model, user)
    for user in test_users
}

# Prepare the rows of the submission file
results = []
for _, row in test_df.iterrows():
    user, track = row['user'], row['track']
    user_recs = recommendations[user]
    # 1-based position inside the user's recommendation list; tracks that were
    # not recommended get the position right after the end of the list.
    position = user_recs.index(track) + 1 if track in user_recs else len(user_recs) + 1
    results.append((user, track, position))

submission_df = pd.DataFrame(results, columns=['user', 'track', 'rank'])
submission_df.to_csv('submission.csv', index=False)

# NDCG evaluation on the validation set
def calculate_ndcg(predictions, k=10):
    """Return the mean per-user NDCG@k of a list of predictions.

    For every user the items are ordered by estimated rating (highest first);
    the DCG of the true ratings in that order, cut at `k`, is divided by the DCG
    of the true ratings in ideal (descending) order, also cut at `k`.

    Args:
        predictions: iterable of 5-tuples `(uid, iid, true_r, est, details)`.
        k: cut-off rank, must be at least 1.

    Returns:
        Mean NDCG@k over users as a float. Returns 0.0 when `predictions` is
        empty; a user whose ideal DCG is not positive contributes 0.0.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    # Local import: `defaultdict` is not imported at the top of this cell.
    from collections import defaultdict

    if k < 1:
        raise ValueError("k must be a positive integer")

    # NOTE(review): NDCG is computed here directly with NumPy, so this function
    # does not need the cell-level `from surprise.accuracy import ndcg_at_k`.

    def _dcg_at_k(relevances):
        """Discounted cumulative gain of the first `k` relevance values."""
        gains = np.asarray(relevances[:k], dtype=float)
        if gains.size == 0:
            return 0.0
        # The item at 1-based rank i is discounted by log2(i + 1).
        discounts = np.log2(np.arange(2, gains.size + 2))
        return float(np.sum(gains / discounts))

    # Group (true rating, estimate) pairs by user.
    user_est = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est[uid].append((true_r, est))

    ndcg_scores = []
    for user_ratings in user_est.values():
        by_estimate = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)
        predicted_order = [true_r for true_r, _ in by_estimate]
        ideal_order = sorted(predicted_order, reverse=True)

        ideal_dcg = _dcg_at_k(ideal_order)
        if ideal_dcg > 0:
            ndcg_scores.append(_dcg_at_k(predicted_order) / ideal_dcg)
        else:
            # No positive relevance for this user: NDCG is undefined, count as 0.
            ndcg_scores.append(0.0)

    if not ndcg_scores:
        return 0.0
    return float(np.mean(ndcg_scores))

# Score the held-out validation interactions and report the mean NDCG@10.
# NOTE(review): the name `ndcg_score` rebinds the `ndcg_score` function imported
# from sklearn.metrics at the top of the notebook.
predictions = model.test(validset)
ndcg_score = calculate_ndcg(predictions, k=10)
print(f"NDCG@10 на валидационной выборке: {ndcg_score}")


In [None]:
from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split

# Prepare the data
# NOTE(review): `df` and `max_listening_time` are not defined in the visible
# cells; they are presumably expected to exist before this cell runs.
reader = Reader(rating_scale=(0, max_listening_time))
data = Dataset.load_from_df(df[['user_id', 'track_id', 'listening_time']], reader)

# Split into train and test sets (seeded so the split is reproducible)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialise and train the model (seeded so the factor initialisation is reproducible)
model = SVDpp(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

# Predict ratings for the held-out (user, item) pairs
predictions = model.test(testset)
