In [1]:
import csv
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import time  

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)
print("Available GPUs:", torch.cuda.device_count())

def read_csv_to_list_rating(file_path):
    """Parse a ratings CSV file into a list of four-element rows.

    The first line of the file is treated as a header and discarded. For every
    remaining line, columns 0, 1 and 3 are converted with ``int`` and column 2
    with ``float``.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of ``[int, int, float, int]`` lists, one per data line, in file
        order.
    """
    parsed = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the header line
        for record in rows:
            parsed.append(
                [int(record[0]), int(record[1]), float(record[2]), int(record[3])]
            )
    return parsed

def read_csv_to_list_movies(file_path):
    """Parse a movies CSV file into a list padded so that ids line up with positions.

    The first line of the file is treated as a header and discarded. Every data
    line becomes ``[int(first column), second column, last column]``. The
    result starts with a ``[0, "", ""]`` placeholder, and further placeholders
    are inserted before a row whenever its id is larger than the current list
    length, so that with strictly increasing ids the row for id ``n`` ends up
    at position ``n``.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        The padded list of three-element lists.
    """
    table = [[0, "", ""]]  # position 0 is always a placeholder
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the header line
        for record in rows:
            movie_id = int(record[0])
            # Fill the gap (if any) between the current end of the list and
            # this id with fresh placeholder rows; a non-positive gap adds none.
            gap = movie_id - len(table)
            table.extend([0, "", ""] for _ in range(gap))
            table.append([movie_id, record[1], record[-1]])
    return table

# Plain-Python views of both CSV files, used only for the size summary below.
matrixMovies = read_csv_to_list_movies('movies.csv')
matrixrating = read_csv_to_list_rating('ratings.csv')

# Each total is the id stored in the last row of the list, plus one.
totalRaters = matrixrating[-1][0] + 1
totalMovies = matrixMovies[-1][0] + 1
print("Total users:", totalRaters)
print("Total movies:", totalMovies)

# DataFrame views: the header line is skipped and explicit column names are
# supplied; the timestamp column is dropped straight away.
ratings = pd.read_csv(
    'ratings.csv',
    sep=',',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    skiprows=1,
).drop(columns=['timestamp'])

movies = pd.read_csv(
    'movies.csv',
    sep=',',
    names=['movieId', 'title', 'genres'],
    skiprows=1,
)

# Encode the raw ids as consecutive integer labels and keep both encoders so
# the labels can be mapped back to the raw ids later.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
ratings = ratings.assign(
    user=user_encoder.fit_transform(ratings['userId']),
    movie=item_encoder.fit_transform(ratings['movieId']),
)

num_users = ratings['user'].nunique()
num_items = ratings['movie'].nunique()


Using device: cuda
Available GPUs: 4
Total users: 200949
Total movies: 292758


In [2]:

class MovieLensDataset(Dataset):
    """Dataset yielding ``(user, item, label)`` tensor triples.

    The ``'user'``, ``'movie'`` and ``'rating'`` columns of the supplied frame
    are copied into tensors once, at construction time; indexing afterwards
    only slices those tensors.
    """

    def __init__(self, ratings):
        """Build the three tensors from ``ratings``.

        Args:
            ratings: Table providing ``'user'``, ``'movie'`` and ``'rating'``
                columns, each exposing a ``.values`` array.
        """
        def as_tensor(column, dtype):
            return torch.tensor(ratings[column].values, dtype=dtype)

        self.users = as_tensor('user', torch.long)
        self.items = as_tensor('movie', torch.long)
        self.labels = as_tensor('rating', torch.float)

    def __len__(self):
        """Return the number of stored samples."""
        return self.users.size(0)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` entries at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

# Wrap the encoded ratings table and serve it in shuffled batches of 2048.
dataset = MovieLensDataset(ratings=ratings)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2048)

class NeuralCollaborativeFiltering(nn.Module):
    """Embedding + MLP scorer for (user, item) pairs.

    A user embedding and an item embedding are concatenated and passed through
    a stack of ``Linear``/``ReLU`` layers (128 -> 64 -> 32) ending in a single
    output unit with no activation.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        """Create the embedding tables and the MLP.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Input is the two embeddings side by side, hence 2 * embedding_dim.
        widths = [embedding_dim * 2, 128, 64, 32]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Final layer emits a raw score: BCEWithLogitsLoss expects raw scores.
        layers.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, user_ids, item_ids):
        """Score each (user, item) pair; the trailing output dimension is 1."""
        pair = torch.cat(
            (self.user_embedding(user_ids), self.item_embedding(item_ids)),
            dim=-1,
        )
        return self.mlp(pair)

def main(num_epochs=10, user_index=1, top_k=5):
    """Train the NCF model on binarized ratings, then print top recommendations.

    Ratings are turned into binary labels (1.0 when the rating is >= 4, else
    0.0) and the model is trained with ``BCEWithLogitsLoss``. After training,
    every encoded movie index is scored for one user and the highest-scoring
    movies are printed with their titles.

    Relies on the module-level ``device``, ``dataloader``, ``num_users``,
    ``num_items``, ``item_encoder`` and ``movies`` objects.

    Args:
        num_epochs: Number of passes over ``dataloader``.
        user_index: Encoded user index (the ``user_encoder`` label, not the
            raw ``userId``) to produce recommendations for.
        top_k: Number of movies to print; capped at ``num_items``.

    Raises:
        ValueError: If ``user_index`` is outside ``[0, num_users)``.
    """
    if not 0 <= user_index < num_users:
        raise ValueError(
            f"user_index must be in [0, {num_users}), got {user_index}"
        )

    model = NeuralCollaborativeFiltering(num_users, num_items)
    # Wrap in DataParallel only when several GPUs are really present, and
    # derive the device list from the machine instead of assuming four GPUs.
    gpu_count = torch.cuda.device_count()
    if device.type == "cuda" and gpu_count > 1:
        model = nn.DataParallel(model, device_ids=list(range(gpu_count)))
    model = model.to(device)

    # Mixed precision only makes sense on CUDA here; on CPU both autocast and
    # the gradient scaler are switched off and become pass-throughs.
    use_amp = device.type == "cuda"

    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
    scaler = torch.amp.GradScaler(enabled=use_amp)

    for epoch in range(num_epochs):
        epoch_start = time.time()
        model.train()
        running_loss = 0.0
        for user_ids, item_ids, labels in dataloader:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            # Binarize: a rating of 4 or more counts as a positive example.
            labels = (labels.to(device) >= 4).float()

            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                # squeeze(-1) drops only the trailing size-1 output dimension,
                # so a final batch holding a single sample keeps shape (1,)
                # and still matches the shape of ``labels``.
                outputs = model(user_ids, item_ids).squeeze(-1)
                loss = loss_fn(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
        epoch_time = time.time() - epoch_start
        num_batches = max(len(dataloader), 1)  # avoid dividing by zero on an empty loader
        print(f"Epoch {epoch+1}: Loss = {running_loss / num_batches:.4f} | Time: {epoch_time:.2f} sec")

    model.eval()
    # Score every encoded movie index for the chosen user in one forward pass.
    # NOTE(review): candidates include movies this user has already rated;
    # nothing here filters them out.
    candidate_movie_ids = torch.arange(num_items, device=device)
    user_ids = torch.full_like(candidate_movie_ids, user_index)

    with torch.no_grad():
        predictions = model(user_ids, candidate_movie_ids).squeeze(-1)
        scores = torch.sigmoid(predictions)

    k = min(top_k, num_items)
    topk = torch.topk(scores, k)
    top_movie_indices = topk.indices.cpu().numpy()
    # Map encoded movie indices back to the raw movieId values.
    top_movie_ids = item_encoder.inverse_transform(top_movie_indices)

    print(f"\nTop {k} Recommended Movies:")
    for movieId in top_movie_ids:
        matches = movies.loc[movies['movieId'] == movieId, 'title']
        # A rated movie may be absent from the movies table; do not crash on it.
        title = matches.iloc[0] if len(matches) else "<title not found>"
        print(f"MovieId {movieId}: {title}")

if __name__ == "__main__":
    main()


Epoch 1: Loss = 0.5630 | Time: 406.20 sec
Epoch 2: Loss = 0.5287 | Time: 406.53 sec
Epoch 3: Loss = 0.5196 | Time: 406.26 sec
Epoch 4: Loss = 0.5149 | Time: 402.95 sec
Epoch 5: Loss = 0.5110 | Time: 405.36 sec
Epoch 6: Loss = 0.5064 | Time: 405.90 sec
Epoch 7: Loss = 0.5020 | Time: 401.91 sec
Epoch 8: Loss = 0.4980 | Time: 405.29 sec
Epoch 9: Loss = 0.4930 | Time: 404.88 sec
Epoch 10: Loss = 0.4861 | Time: 401.32 sec

Top 5 Recommended Movies:
MovieId 135492: Kaaka Muttai (2015)
MovieId 106212: Root of All Evil? (2006)
MovieId 206162: Our Planet: Behind The Scenes (2019)
MovieId 59547: Ron Clark Story, The (2006)
MovieId 219561: Daniel Sloss: X (2019)
