In [17]:
from pathlib import Path

import lightfm
import numpy as np
import pandas as pd
import torch
from lightfm import data as ld
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch import nn
from torch.utils.data import Dataset, DataLoader
from xgboost import XGBRanker

In [2]:
# Locations of the homework CSV files (absolute WSL paths into the Windows drive).
wsl_train_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/train.csv"
wsl_test_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/test.csv"

# Read the train split first, then the test split.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (wsl_train_path, wsl_test_path)
)

In [3]:
# Dense 0-based index for every distinct train user, in order of first appearance.
unique_users = train_data['user'].unique()
user_id_map = dict(zip(unique_users, range(len(unique_users))))

# Apply the train-derived mapping to both splits (train first, then test).
# A user missing from the mapping maps to NaN, and the int() conversion then raises.
for frame in (train_data, test_data):
    frame["user_id"] = frame["user"].map(user_id_map).map(int)

In [4]:
# Sorted list of the distinct dense user indices present in the train split.
users_id = sorted(set(train_data["user_id"].tolist()))

# Largest track id seen in either split.
train_max_track = train_data['track'].max()
test_max_track = test_data['track'].max()
max_track_id = max(train_max_track, test_max_track)

In [5]:
# Rows whose "time" value exceeds 0.7 are treated as positive interactions.
is_positive = train_data["time"] > 0.7
positives = train_data.loc[is_positive].copy()

In [6]:
# Register every user index and every track id in [0, max_track_id] with LightFM.
dataset = ld.Dataset()
dataset.fit(users=users_id, items=range(max_track_id + 1))

# Build the interaction matrix from the positive (user_id, track) pairs only.
positive_pairs = positives[['user_id', 'track']].itertuples(index=False, name=None)
all_interactions, _ = dataset.build_interactions(positive_pairs)

In [7]:
# WARP-loss LightFM model used to produce a collaborative-filtering score feature.
# random_state pins the random initialisation/sampling so repeated runs start from
# the same state. NOTE(review): training with num_threads > 1 may still not be
# bit-for-bit reproducible - confirm against the LightFM documentation.
model = lightfm.LightFM(
    no_components=300,
    loss='warp',
    learning_rate=0.01,
    max_sampled=90,
    user_alpha=0.0001,
    item_alpha=0.0001,
    random_state=42,
)

In [8]:
# Train LightFM on the positive interactions: 400 epochs on 8 threads, with a progress bar.
model.fit(
    all_interactions,
    epochs=400,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 400/400 [15:17<00:00,  2.29s/it]


<lightfm.lightfm.LightFM at 0x7f35ea138100>

In [9]:
# Score every (user_id, track) row of both splits with the fitted LightFM model
# and keep the result as an extra feature column (train first, then test).
for frame in (train_data, test_data):
    frame_users = frame["user_id"].values
    frame_tracks = frame["track"].values
    frame["lightfm_score"] = model.predict(frame_users, frame_tracks)

In [10]:
# Short aliases for the two frames (same objects, no copy is made).
tr_d, tes_d = train_data, test_data

In [34]:
class NCFDataset(Dataset):
    """Row-wise dataset over an interaction frame for the NCF model.

    Args:
        data: DataFrame with columns 'user_id', 'track' and 'lightfm_score'.
            When a 'time' column is present, binary targets are derived from it.
        threshold: a row is labelled 1.0 when its 'time' value is strictly
            greater than this value, otherwise 0.0. Defaults to 0.7.

    Each item is a dict with keys 'user', 'track', 'lightfm_score' and, only
    when targets are available, 'target'.
    """

    def __init__(self, data, threshold=0.7):
        self.users = data['user_id'].values
        self.tracks = data['track'].values
        self.lightfm_scores = data['lightfm_score'].values
        # Frames without a 'time' column yield items without a 'target' key.
        if 'time' in data.columns:
            self.targets = (data['time'] > threshold).astype(float).values
        else:
            self.targets = None

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the features (and target, if available) of row ``idx`` as a dict."""
        item = {
            'user': self.users[idx],
            'track': self.tracks[idx],
            'lightfm_score': self.lightfm_scores[idx]
        }
        if self.targets is not None:
            item['target'] = self.targets[idx]
        return item

In [35]:
class NeuralCollaborativeFiltering(nn.Module):
    """MLP over concatenated user/track embeddings plus a LightFM score.

    Args:
        num_users: number of rows in the user embedding table.
        num_tracks: number of rows in the track embedding table.
        embedding_dim: size of each user and track embedding vector.
    """

    def __init__(self, num_users, num_tracks, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.track_embedding = nn.Embedding(num_tracks, embedding_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 1, 128),  # +1 for lightfm_score
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, user_ids, track_ids, lightfm_scores):
        """Return one sigmoid output per row as a 1-D tensor of shape (batch,).

        Args:
            user_ids: 1-D integer tensor of user indices.
            track_ids: 1-D integer tensor of track indices.
            lightfm_scores: 1-D float tensor, one LightFM score per row.
        """
        user_embeds = self.user_embedding(user_ids)
        track_embeds = self.track_embedding(track_ids)
        combined = torch.cat([user_embeds, track_embeds, lightfm_scores.unsqueeze(1)], dim=-1)
        # Drop only the trailing feature axis. A bare squeeze() would also remove
        # the batch axis when the batch holds a single row, yielding a 0-d tensor.
        return self.fc_layers(combined).squeeze(-1)

In [None]:
# Number of rows per mini-batch for both the train and test loaders.
BATCH_SIZE = 2024

In [36]:
# Wrap both frames as torch datasets.
train_dataset = NCFDataset(train_data)
test_dataset = NCFDataset(test_data)

# Training batches are reshuffled every epoch; test batches keep the frame's row
# order so predictions can be written back positionally.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [37]:
# Embedding table sizes: one row per mapped user, one row per track id in [0, max_track_id].
num_tracks = max_track_id + 1
num_users = len(user_id_map)

# NCF network, binary cross-entropy on its sigmoid outputs, and an Adam optimiser.
model_ncf = NeuralCollaborativeFiltering(num_users, num_tracks)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model_ncf.parameters(), lr=0.001)

In [38]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Move the network's parameters to that device (the call's result is the cell output).
model_ncf.to(device)

NeuralCollaborativeFiltering(
  (user_embedding): Embedding(10000, 64)
  (track_embedding): Embedding(50000, 64)
  (fc_layers): Sequential(
    (0): Linear(in_features=129, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [40]:
# Train the NCF network for 4 passes over the shuffled training loader.
for epoch in range(4):
    model_ncf.train()
    total_loss = 0
    for curr_batch, batch in enumerate(train_loader, start=1):
        # Move the batch onto the training device; scores and labels go through .float().
        user_ids, track_ids = batch['user'].to(device), batch['track'].to(device)
        lightfm_scores = batch['lightfm_score'].to(device).float()
        targets = batch['target'].to(device).float()

        # One optimisation step on this mini-batch.
        optimizer.zero_grad()
        predictions = model_ncf(user_ids, track_ids, lightfm_scores)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Lightweight progress indicator: print every 100th batch number.
        if not curr_batch % 100:
            print(curr_batch)

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

100
200
300
400
500
Epoch 1, Loss: 0.015961803596094934
100
200
300
400
500
Epoch 2, Loss: 0.005440759777233018
100
200
300
400
500
Epoch 3, Loss: 0.0034952991956656446
100
200
300
400
500
Epoch 4, Loss: 0.0024841065957407543


In [41]:
# Put the network into evaluation mode before scoring the test split.
model_ncf.eval()
# Fresh accumulator for the per-row test predictions.
predictions = list()

In [42]:
# Score the test split batch by batch, in loader order.
# Reset the accumulator and the evaluation mode here so this cell can be re-run on
# its own without appending a second copy of the predictions.
predictions = []
model_ncf.eval()
with torch.no_grad():
    for batch in test_loader:
        user_ids = batch['user'].to(device)
        track_ids = batch['track'].to(device)
        # Same .float() cast as the training loop, so the score dtype fed to the
        # network does not depend on how the column happens to be stored.
        lightfm_scores = batch['lightfm_score'].to(device).float()

        # reshape(-1) guarantees a 1-D result even if the network returns a 0-d
        # tensor for a single-row batch, so extend() can always iterate it.
        preds = model_ncf(user_ids, track_ids, lightfm_scores).reshape(-1).cpu().numpy()
        predictions.extend(preds)

In [43]:
# Store the collected NCF predictions on the test frame as the "score" column
# (in-place column assignment; values are matched to rows by position).
test_data["score"] = predictions

In [44]:
# Write the submission file with the original user ids, track ids and NCF scores.
output_path = Path("./data/neural/test_ncf_1.csv")
# to_csv does not create missing directories, so make sure the target folder exists;
# otherwise a fresh checkout fails only after the whole pipeline has run.
output_path.parent.mkdir(parents=True, exist_ok=True)
test_data[["user", "track", "score"]].to_csv(output_path, index=False)