In [1]:
from sklearn.preprocessing import LabelEncoder
import csv
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import time  


In [2]:

def read_csv_to_list_rating(file_path):
    """Load a ratings CSV into a list of typed rows.

    The first record is returned as the header. Every following non-empty
    record is converted to ``[int, int, float, int]`` from its first four
    columns.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A ``(header, matrix)`` tuple: the header fields as a list of strings
        and the list of converted rows.

    Raises:
        StopIteration: If the file contains no header record.
        ValueError: If a field cannot be converted to its numeric type.
        IndexError: If a non-empty record has fewer than four columns.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)  # First record is the header
        matrix = []
        for values in reader:
            if not values:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); skip them instead of failing on values[0].
                continue
            matrix.append(
                [int(values[0]), int(values[1]), float(values[2]), int(values[3])]
            )
        return header, matrix

def read_csv_to_list_movies(file_path):
    """Load a movies CSV into a list that can be indexed by movie id.

    The first record is returned as the header. Each following non-empty
    record becomes ``[int(first column), second column, last column]``.
    Placeholder rows ``[0, "", ""]`` are inserted at position 0 and for every
    id that is skipped, so that (for ids appearing in increasing order) the
    row of movie ``k`` sits at ``matrix[k]``.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A ``(header, matrix)`` tuple: the header fields as a list of strings
        and the padded list of rows.

    Raises:
        StopIteration: If the file contains no header record.
        ValueError: If the first column of a record is not an integer.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)  # First record is the header
        matrix = [[0, "", ""]]  # Placeholder so that real ids start at index 1
        for values in reader:
            if not values:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on values[0].
                continue
            movie_id = int(values[0])
            # Pad the gap left by ids that do not appear in the file.
            while len(matrix) < movie_id:
                matrix.append([0, "", ""])
            matrix.append([movie_id, str(values[1]), str(values[-1])])
        return header, matrix

    

In [3]:
# Parse both CSV files into plain Python lists.
headerMovies, matrixMovies = read_csv_to_list_movies('movies.csv')
headerRating, matrixrating = read_csv_to_list_rating('ratings.csv')

# Id stored in the LAST row of each list, plus one. This is an upper bound on
# the ids only if the rows are ordered by id.
# NOTE(review): ratings.csv is presumably sorted by userId - confirm.
totalRaters = 1 + int(matrixrating[-1][0])
totalMovies = 1 + int(matrixMovies[-1][0])
print(totalRaters, totalMovies, sep='\n')

200949
292758


In [4]:
# Read ratings.csv with explicit column names/dtypes (the file's own header
# line is skipped) and discard the timestamp column right away.
ratings = pd.read_csv(
    'ratings.csv',
    sep=',',
    skiprows=1,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    dtype={'userId': int, 'movieId': int, 'rating': float, 'timestamp': int},
).drop(columns='timestamp')

In [5]:
# Read movies.csv with explicit column names/dtypes; the file's own header
# line is skipped because the names are supplied here.
movies = pd.read_csv(
    'movies.csv',
    sep=',',
    skiprows=1,
    names=['movieId', 'title', 'genres'],
    dtype={'movieId': int, 'title': str, 'genres': str},
)


In [6]:
# Map raw userId values onto consecutive integer indices for the embedding.
user_encoder = LabelEncoder()
ratings['user'] = user_encoder.fit_transform(ratings['userId'])

# Same for movieId; item_encoder is kept so indices can be decoded later.
item_encoder = LabelEncoder()
ratings['movie'] = item_encoder.fit_transform(ratings['movieId'])

# Number of distinct encoded users / items (sizes of the embedding tables).
num_users = ratings['user'].nunique()
num_items = ratings['movie'].nunique()

In [7]:


class MovieLensDataset(Dataset):
    """Dataset of (user index, item index, rating) triples held as tensors.

    Args:
        ratings: Table exposing ``'user'``, ``'movie'`` and ``'rating'``
            columns (accessed as ``ratings[name].values``).
    """

    def __init__(self, ratings):
        def as_tensor(column, dtype):
            # torch.tensor copies the column into a new tensor of `dtype`.
            return torch.tensor(ratings[column].values, dtype=dtype)

        self.users = as_tensor('user', torch.long)
        self.items = as_tensor('movie', torch.long)
        self.labels = as_tensor('rating', torch.float)

    def __len__(self):
        """Number of rating rows."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` tensors stored at ``idx``."""
        sample = (self.users[idx], self.items[idx], self.labels[idx])
        return sample

In [8]:

# Wrap the ratings table and serve it in shuffled mini-batches of 2048 rows.
dataset = MovieLensDataset(ratings)
dataloader = DataLoader(dataset=dataset, batch_size=2048, shuffle=True)


In [9]:
import torch.nn as nn

class NeuralCollaborativeFiltering(nn.Module):
    """Embedding + MLP scorer for (user, item) pairs.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Layer widths: concatenated embeddings -> 128 -> 64 -> 32 -> 1.
        widths = (embedding_dim * 2, 128, 64, 32)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], 1))  # Single score per pair
        self.mlp = nn.Sequential(*stack)

    def forward(self, user_ids, item_ids):
        """Return the raw score for each pair; no sigmoid is applied here."""
        pair = torch.cat(
            (self.user_embedding(user_ids), self.item_embedding(item_ids)),
            dim=-1,
        )
        return self.mlp(pair)



In [10]:
# Build the network with the default embedding size, then move it to the GPU.
model = NeuralCollaborativeFiltering(num_users=num_users, num_items=num_items)
model = model.cuda()

In [11]:
# Binary cross-entropy computed on raw logits (the model applies no sigmoid).
loss_fn = nn.BCEWithLogitsLoss()

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-2,
)


In [12]:
# Gradient scaler used together with torch.autocast in the training loop.
scaler = torch.amp.GradScaler('cuda')


In [13]:
num_epochs = 10

# Mixed-precision training on the GPU. Each epoch prints the mean batch loss
# and the wall-clock time it took.
for epoch in range(num_epochs):
    epoch_start = time.time()
    model.train()
    running_loss = 0.0
    for user_ids, item_ids, labels in dataloader:
        user_ids, item_ids, labels = user_ids.cuda(), item_ids.cuda(), labels.cuda()
        labels = (labels >= 4).float()  # Treat ratings >= 4 as "positive"

        optimizer.zero_grad()
        with torch.autocast(device_type='cuda'):
            # Drop only the trailing size-1 dimension. A bare squeeze() would
            # also collapse a final batch of size 1 to a 0-d tensor, which no
            # longer matches the shape of `labels` and makes the loss raise.
            outputs = model(user_ids, item_ids).squeeze(-1)
            loss = loss_fn(outputs, labels)

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and update its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}: Loss = {running_loss / len(dataloader):.4f} | Time: {epoch_time:.2f} sec")


Epoch 1: Loss = 0.5631 | Time: 289.61 sec


KeyboardInterrupt: 

In [15]:
# Inference only from here on.
model.eval()

# Score every encoded item index for one user. Note that 1 is an ENCODED user
# index (the value stored in ratings['user']), not a raw userId.
candidate_movie_ids = torch.arange(num_items).cuda()
user_id = torch.tensor([1]).cuda()

# Repeat the single user index so it lines up with the candidate items.
user_ids = user_id.expand_as(candidate_movie_ids)

with torch.no_grad():
    predictions = model(user_ids, candidate_movie_ids).squeeze()
    scores = predictions.sigmoid()  # The model returns logits; squash them here

# Five best-scoring items. The printed values are encoded item indices and
# still have to go through item_encoder.inverse_transform to become movieIds.
topk = scores.topk(5)
print("Top 5 recommended movie IDs:", topk.indices.cpu().numpy())


Top 5 recommended movie IDs: [59308 71289 51288 44956 36933]


In [16]:

# Load movie metadata (uses the header line of movies.csv for column names).
movies = pd.read_csv('movies.csv')

# Encoded item indices of the top-scoring movies, as produced by topk.
top_movie_indices = topk.indices.cpu().numpy()

# Decode back to the original movieId values with the encoder fitted earlier.
top_movie_ids = item_encoder.inverse_transform(top_movie_indices)

# Metadata rows of the recommended movies (row order follows movies.csv, not
# the ranking).
recommended_movies = movies[movies['movieId'].isin(top_movie_ids)]

# movieId -> title lookup built once from the filtered rows, instead of
# scanning the whole frame for every id. drop_duplicates keeps the first row
# for an id, matching the previous `.values[0]` choice.
title_by_id = (
    recommended_movies
    .drop_duplicates(subset='movieId')
    .set_index('movieId')['title']
)

# Print in ranking order. An id that has ratings but no metadata row gets a
# placeholder title instead of raising IndexError.
for movieId in top_movie_ids:
    title = title_by_id.get(movieId, "<title not found>")
    print(f"MovieId {movieId}: {title}")


MovieId 203847: Kumbalangi Nights (2019)
MovieId 242210: James Acaster: Cold Lasagne Hate Myself 1999 (2020)
MovieId 186363: The China Hustle (2018)
MovieId 172587: Vacations in Prostokvashino (1980)
MovieId 153828: Waiter, Scarper! (1981)


In [17]:
# CPU variant: the next cell repeats the same pipeline with device = "cpu".

In [2]:
from sklearn.preprocessing import LabelEncoder
import csv
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import time
# Select the device that the model and every batch in this cell are moved to
# via .to(device).
device = torch.device("cpu")  # Change to "cuda" for GPU later

# Read ratings
def read_csv_to_list_rating(file_path):
    """Load a ratings CSV into a list of typed rows.

    The first record is returned as the header. Every following non-empty
    record is converted to ``[int, int, float, int]`` from its first four
    columns.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A ``(header, matrix)`` tuple: the header fields as a list of strings
        and the list of converted rows.

    Raises:
        StopIteration: If the file contains no header record.
        ValueError: If a field cannot be converted to its numeric type.
        IndexError: If a non-empty record has fewer than four columns.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)
        matrix = []
        for values in reader:
            if not values:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); skip them instead of failing on values[0].
                continue
            matrix.append(
                [int(values[0]), int(values[1]), float(values[2]), int(values[3])]
            )
        return header, matrix

# Read movies
def read_csv_to_list_movies(file_path):
    """Load a movies CSV into a list that can be indexed by movie id.

    The first record is returned as the header. Each following non-empty
    record becomes ``[int(first column), second column, last column]``.
    Placeholder rows ``[0, "", ""]`` are inserted at position 0 and for every
    id that is skipped, so that (for ids appearing in increasing order) the
    row of movie ``k`` sits at ``matrix[k]``.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A ``(header, matrix)`` tuple: the header fields as a list of strings
        and the padded list of rows.

    Raises:
        StopIteration: If the file contains no header record.
        ValueError: If the first column of a record is not an integer.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)
        matrix = [[0, "", ""]]  # Placeholder so that real ids start at index 1
        for values in reader:
            if not values:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on values[0].
                continue
            movie_id = int(values[0])
            # Pad the gap left by ids that do not appear in the file.
            while len(matrix) < movie_id:
                matrix.append([0, "", ""])
            matrix.append([movie_id, str(values[1]), str(values[-1])])
        return header, matrix

# Parse both CSV files into plain Python lists.
headerMovies, matrixMovies = read_csv_to_list_movies('movies.csv')
headerRating, matrixrating = read_csv_to_list_rating('ratings.csv')

# Id stored in the LAST row of each list, plus one. This is an upper bound on
# the ids only if the rows are ordered by id.
# NOTE(review): ratings.csv is presumably sorted by userId - confirm.
totalRaters = 1 + int(matrixrating[-1][0])
totalMovies = 1 + int(matrixMovies[-1][0])
print(totalRaters, totalMovies, sep='\n')

# Ratings table: explicit names/dtypes, file header skipped, timestamp dropped.
ratings = pd.read_csv(
    'ratings.csv',
    sep=',',
    skiprows=1,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    dtype={'userId': int, 'movieId': int, 'rating': float, 'timestamp': int},
).drop(columns='timestamp')

# Movie metadata table: explicit names/dtypes, file header skipped.
movies = pd.read_csv(
    'movies.csv',
    sep=',',
    skiprows=1,
    names=['movieId', 'title', 'genres'],
    dtype={'movieId': int, 'title': str, 'genres': str},
)

# Encode users and items as consecutive integer indices for the embeddings.
user_encoder = LabelEncoder()
ratings['user'] = user_encoder.fit_transform(ratings['userId'])

item_encoder = LabelEncoder()
ratings['movie'] = item_encoder.fit_transform(ratings['movieId'])

# Number of distinct encoded users / items (sizes of the embedding tables).
num_users = ratings['user'].nunique()
num_items = ratings['movie'].nunique()

# Dataset class
class MovieLensDataset(Dataset):
    """Dataset of (user index, item index, rating) triples held as tensors.

    Args:
        ratings: Table exposing ``'user'``, ``'movie'`` and ``'rating'``
            columns (accessed as ``ratings[name].values``).
    """

    def __init__(self, ratings):
        def as_tensor(column, dtype):
            # torch.tensor copies the column into a new tensor of `dtype`.
            return torch.tensor(ratings[column].values, dtype=dtype)

        self.users = as_tensor('user', torch.long)
        self.items = as_tensor('movie', torch.long)
        self.labels = as_tensor('rating', torch.float)

    def __len__(self):
        """Number of rating rows."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` tensors stored at ``idx``."""
        sample = (self.users[idx], self.items[idx], self.labels[idx])
        return sample

# Wrap the ratings table and serve it in shuffled mini-batches of 1024 rows.
dataset = MovieLensDataset(ratings)
dataloader = DataLoader(dataset=dataset, batch_size=1024, shuffle=True)

# Model class
class NeuralCollaborativeFiltering(nn.Module):
    """Embedding + MLP scorer for (user, item) pairs.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Layer widths: concatenated embeddings -> 128 -> 64 -> 32 -> 1.
        widths = (embedding_dim * 2, 128, 64, 32)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, user_ids, item_ids):
        """Return the raw score for each pair; no sigmoid is applied here."""
        pair = torch.cat(
            (self.user_embedding(user_ids), self.item_embedding(item_ids)),
            dim=-1,
        )
        return self.mlp(pair)

# Build the network with the default embedding size on the selected device.
model = NeuralCollaborativeFiltering(num_users=num_users, num_items=num_items)
model = model.to(device)

# Binary cross-entropy computed on raw logits (the model applies no sigmoid).
loss_fn = nn.BCEWithLogitsLoss()

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-2,
)

# Training loop: full-precision training on `device`. Each epoch prints the
# mean batch loss and the wall-clock time it took.
num_epochs = 10
for epoch in range(num_epochs):
    print("Start")
    model.train()
    running_loss = 0.0
    epoch_start = time.time()
    for user_ids, item_ids, labels in dataloader:
        user_ids = user_ids.to(device)
        item_ids = item_ids.to(device)
        # Ratings >= 4 are the positive class for the binary loss.
        labels = (labels >= 4).float().to(device)

        optimizer.zero_grad()
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse a final batch of size 1 to a 0-d tensor, which no longer
        # matches the shape of `labels` and makes the loss raise.
        outputs = model(user_ids, item_ids).squeeze(-1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}: Loss = {running_loss / len(dataloader):.4f} | Time: {epoch_time:.2f} sec")

# Evaluation: score every encoded item index for one user. Note that 1 is an
# ENCODED user index (the value stored in ratings['user']), not a raw userId.
model.eval()
user_id = torch.tensor([1]).to(device)
candidate_movie_ids = torch.arange(num_items).to(device)
user_ids = user_id.expand_as(candidate_movie_ids)

with torch.no_grad():
    predictions = model(user_ids, candidate_movie_ids).squeeze()
    scores = torch.sigmoid(predictions)  # The model returns logits

# Five best-scoring items; the printed values are encoded item indices.
topk = torch.topk(scores, 5)
print("Top 5 recommended movie IDs:", topk.indices.cpu().numpy())

# Decode the indices back to the original movieId values.
top_movie_indices = topk.indices.cpu().numpy()
top_movie_ids = item_encoder.inverse_transform(top_movie_indices)

# movieId -> title lookup built once, instead of scanning the whole frame for
# every id. drop_duplicates keeps the first row for an id, matching the
# previous `.values[0]` choice.
title_by_id = (
    movies[movies['movieId'].isin(top_movie_ids)]
    .drop_duplicates(subset='movieId')
    .set_index('movieId')['title']
)

# Print recommended titles in ranking order. An id that has ratings but no
# metadata row gets a placeholder title instead of raising IndexError.
for movieId in top_movie_ids:
    title = title_by_id.get(movieId, "<title not found>")
    print(f"MovieId {movieId}: {title}")


200949
292758
Start
Epoch 1: Loss = 0.5585 | Time: 671.93 sec
Start


KeyboardInterrupt: 