In [19]:
import pandas as pd
import numpy as np

import scipy.stats as ss
from sklearn.metrics import ndcg_score
import optuna

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [20]:
wsl_train_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/train.csv"
wsl_test_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/test.csv"
train_data = pd.read_csv(wsl_train_path)
test_data = pd.read_csv(wsl_test_path)

In [21]:
user_id_map = {id: i for i, id in enumerate(train_data['user'].unique())}

train_data["user_id"] = train_data["user"].map(user_id_map).map(int)
test_data["user_id"] = test_data["user"].map(user_id_map).map(int)

In [22]:
user_item_matrix = csr_matrix((train_data['time'], (train_data['user_id'], train_data['track'])))

In [23]:
train_data["validation"] = np.random.random(len(train_data)) >= 0.8

train_train_data = train_data[~train_data["validation"]]
train_val_data = train_data[train_data["validation"]]

In [24]:
user_item_matrix_train = csr_matrix((train_train_data['time'], (train_train_data['user_id'], train_train_data['track'])))
user_item_matrix_val = csr_matrix((train_val_data['time'], (train_val_data['user_id'], train_val_data['track'])))

In [30]:
def ndcg_at_k(model, user_item_matrix, user_ids, k=10):
    ndcg_scores = []
    for user_id in user_ids:
        if user_id >= model.user_factors.shape[0]:
            continue
        
        # Получаем рекомендации
        recommendations = model.recommend(user_id, user_item_matrix[user_id], N=k, filter_already_liked_items=False)
        recommended_items = recommendations[0]  # Первый элемент - это список рекомендованных айтемов
        
        # Релевантные элементы (например, те, с которыми пользователь взаимодействовал)
        relevant_items = set(user_item_matrix[user_id].indices)
        
        # DCG
        dcg = 0
        for i, item in enumerate(recommended_items):
            if item in relevant_items:
                dcg += 1 / np.log2(i + 2)  # i+2 из-за 1-based индексации
        
        # IDCG
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant_items), k)))
        
        # NDCG
        ndcg = dcg / idcg if idcg > 0 else 0
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)


In [33]:
import optuna
import numpy as np
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

def fit_model(
    factors=100,
    regularization=0.01,
    iterations=50,
    alpha=1.0,
    calculate_training_loss=True,
    num_threads=7
):
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        # alpha=alpha,
        use_native=True,
        calculate_training_loss=calculate_training_loss,
        num_threads=num_threads
    )
    model.fit(user_item_matrix_train, show_progress=False)
    
    val_users = train_val_data['user_id'].unique()
    ndcg_at_10 = ndcg_at_k(model, user_item_matrix_val, val_users, k=10)
    
    return ndcg_at_10
    # 
    # scores = []
    # for _, row in train_val_data.iterrows():
    #     user_id = row['user_id']
    #     track = row['track']
    #     if user_id < model.user_factors.shape[0] and track < model.item_factors.shape[0]:
    #         score = np.dot(model.user_factors[user_id], model.item_factors[track])
    #     else:
    #         score = 0
    #     scores.append(score)
    # 
    # # Здесь можно вычислить NDCG или другую метрику на основе scores
    # # Для примера, просто возвращаем среднее значение scores
    # return np.mean(scores)

def objective(trial):
    factors = trial.suggest_int("factors", 50, 200)
    regularization = trial.suggest_float("regularization", 1e-6, 1e-1, log=True)
    iterations = trial.suggest_int("iterations", 10, 100)
    alpha = trial.suggest_float("alpha", 1e-3, 1e2, log=True)
    
    score = fit_model(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        alpha=alpha,
        calculate_training_loss=True,
        num_threads=7
    )
    
    return score

def objective_2(trial):
    factors_options = [50, 100, 150, 200]
    regularization_options = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    iterations_options = [10, 25, 50, 75, 100]
    alpha_options = [1e-3, 1e-2, 1e-1, 1, 10, 100]

    factors = trial.suggest_categorical("factors", factors_options)
    regularization = trial.suggest_categorical("regularization", regularization_options)
    iterations = trial.suggest_categorical("iterations", iterations_options)
    alpha = trial.suggest_categorical("alpha", alpha_options)
    
    score = fit_model(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        alpha=alpha,
        calculate_training_loss=True,
        num_threads=7
    )
    
    return score


In [34]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=60)

[I 2025-03-19 16:52:26,399] A new study created in memory with name: no-name-3e146211-5189-4c69-ac4b-ed71ff9b28fa
  regularization = trial.suggest_loguniform("regularization", 1e-6, 1e-1)
  alpha = trial.suggest_loguniform("alpha", 1e-3, 1e2)
[I 2025-03-19 16:53:20,253] Trial 0 finished with value: 0.06199358046136507 and parameters: {'factors': 157, 'regularization': 1.0898561756631564e-06, 'iterations': 93, 'alpha': 59.696293376532275}. Best is trial 0 with value: 0.06199358046136507.
[I 2025-03-19 16:53:56,225] Trial 1 finished with value: 0.063667575394459 and parameters: {'factors': 170, 'regularization': 0.002604369881762499, 'iterations': 45, 'alpha': 0.031764782641867176}. Best is trial 1 with value: 0.063667575394459.
[I 2025-03-19 16:54:44,011] Trial 2 finished with value: 0.06163486035627629 and parameters: {'factors': 153, 'regularization': 0.00012057607878305998, 'iterations': 72, 'alpha': 29.6094718617813}. Best is trial 1 with value: 0.063667575394459.
[I 2025-03-19 16:5

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
best_params 

In [None]:
{'factors': 200, 'regularization': 0.019156096075596457, 'iterations': 23, 'alpha': 1.0421270389498198}

In [9]:
# model = AlternatingLeastSquares(factors=100, regularization=0.01, iterations=50, calculate_training_loss=True)
model = AlternatingLeastSquares(**best_params)
model.fit(user_item_matrix)

100%|██████████| 50/50 [00:14<00:00,  3.54it/s, loss=0.000636]


In [15]:
predictions = []
for _, row in test_data.iterrows():
    user_id = row['user_id']
    track = row['track']
    if user_id < model.user_factors.shape[0] and track < model.item_factors.shape[0]:
        user_factor = model.user_factors[user_id]
        item_factor = model.item_factors[track]
        score = np.dot(user_factor, item_factor)
    else:
        score = 0
    predictions.append(score)

In [17]:
test_data['score'] = predictions
test_data[["user", "track", "score"]].to_csv("./data/als/test_als_2.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# Загрузка данных
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Создание словарей для маппинга пользователей и треков
user_ids = train_data['user'].unique()
track_ids = train_data['track'].unique()
user_map = {id: i for i, id in enumerate(user_ids)}
track_map = {id: i for i, id in enumerate(track_ids)}

# Преобразование данных
train_data['user_id'] = train_data['user'].map(user_map)
train_data['track_id'] = train_data['track'].map(track_map)
test_data['user_id'] = test_data['user'].map(user_map)
test_data['track_id'] = test_data['track'].map(track_map)

# Создание разреженной матрицы взаимодействий
user_item_matrix = csr_matrix((train_data['time'], (train_data['user_id'], train_data['track_id'])))

# Инициализация и обучение модели
model = AlternatingLeastSquares(factors=100, regularization=0.01, iterations=50, calculate_training_loss=True)
model.fit(user_item_matrix)

# Функция для получения рекомендаций
def get_recommendations(model, user_id, n=10):
    user_items = user_item_matrix[user_map[user_id]].toarray()
    recommendations = model.recommend(user_map[user_id], user_items, N=n, filter_already_liked_items=False)
    return [track_ids[track_id] for track_id, _ in recommendations]

# Генерация рекомендаций для тестовых пользователей
test_users = test_data['user'].unique()
recommendations = {}

for user in test_users:
    recommendations[user] = get_recommendations(model, user)

# Подготовка результатов для отправки
results = []
for _, row in test_data.iterrows():
    user = row['user']
    track = row['track']
    if track in recommendations[user]:
        results.append((user, track, recommendations[user].index(track) + 1))
    else:
        results.append((user, track, len(recommendations[user]) + 1))

submission_df = pd.DataFrame(results, columns=['user', 'track', 'rank'])
submission_df.to_csv('submission.csv', index=False)
