In [1]:
import kaggle # Kaggle API for downloading datasets
import os # os for file operations
import zipfile # Extracting zip files
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder # Data handling libraries
import torch # Deep learning library
from torch.utils.data import Dataset, DataLoader # PyTorch utilities for data loading
import torch.nn as nn
from sklearn.model_selection import train_test_split



In [2]:
# Set the Kaggle API key environmental variable before running the command
# Downloads dataset from Kaggle 

os.system('kaggle datasets download -d arashnic/book-recommendation-dataset')

# Unzip the downloaded dataset

with zipfile.ZipFile('book-recommendation-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('book-recommendation-dataset')

# Remove the zip file after extraction
os.remove('book-recommendation-dataset.zip')


Dataset URL: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset
License(s): CC0-1.0
Downloading book-recommendation-dataset.zip to /Users/alexcory/RecSys



100%|██████████| 24.3M/24.3M [00:00<00:00, 767MB/s]


In [3]:

# Load the dataset into a pandas DataFrame
raw_books = pd.read_csv('book-recommendation-dataset/Books.csv')
raw_users = pd.read_csv('book-recommendation-dataset/Users.csv')
raw_ratings = pd.read_csv('book-recommendation-dataset/Ratings.csv')


  raw_books = pd.read_csv('book-recommendation-dataset/Books.csv')


In [4]:
raw_books.head() 
raw_users.head()
raw_ratings.head()  

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
ratings = raw_ratings.copy()
ratings = ratings[ratings['Book-Rating'] > 0] # Filter out ratings with a rating of 0 - Not read
ratings['Book-Rating'] = ratings['Book-Rating'] / 10.0 # Normalize ratings to a scale of 0 to 1

train_df, test_df = train_test_split(
    ratings,
    test_size=0.2,
    random_state=5
)

In [6]:
user_enc = LabelEncoder()
item_enc = LabelEncoder()

train_df['user'] = user_enc.fit_transform(train_df['User-ID'])
train_df['item'] = item_enc.fit_transform(train_df['ISBN'])

test_df = test_df[
    test_df['User-ID'].isin(user_enc.classes_) & 
    test_df['ISBN'].isin(item_enc.classes_)
]
test_df['user'] = user_enc.transform(test_df['User-ID'])
test_df['item'] = item_enc.transform(test_df['ISBN'])


num_users = len(user_enc.classes_)
num_items = len(item_enc.classes_)

In [7]:
class RatingsDataset(Dataset):
    def __init__(self, ratings):
        self.users = torch.tensor(ratings['user'].values, dtype=torch.long)
        self.items = torch.tensor(ratings['item'].values, dtype=torch.long)
        self.ratings = torch.tensor(ratings['Book-Rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_dataset = RatingsDataset(train_df)
test_dataset = RatingsDataset(test_df)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)


In [8]:

class NeuralCF(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=64):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)

        self.fc_layers = nn.Sequential(
            nn.Linear(embed_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        user_vec = self.user_embed(user)
        item_vec = self.item_embed(item)
        x = torch.cat([user_vec, item_vec], dim=1)
        return self.fc_layers(x).squeeze()

In [9]:
model = NeuralCF(num_users, num_items)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

best_loss = float('inf')
for epoch in range(10):
    # Training phase
    model.train()
    train_loss = 0
    for user, item, rating in train_loader:
        optimizer.zero_grad()
        pred = model(user, item)
        loss = criterion(pred, rating)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(rating)
    
    avg_train_loss = train_loss / len(train_loader.dataset)
    
    # Evaluation phase
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for user, item, rating in test_loader:
            pred = model(user, item)
            loss = criterion(pred, rating)
            test_loss += loss.item() * len(rating)
    
    avg_test_loss = test_loss / len(test_loader.dataset)
    
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")
    
    if avg_test_loss < best_loss:
        best_loss = avg_test_loss
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Saved new best model with test loss: {best_loss:.4f}")


Epoch 1 | Train Loss: 0.0392 | Test Loss: 0.0309
Saved new best model with test loss: 0.0309
Epoch 2 | Train Loss: 0.0312 | Test Loss: 0.0293
Saved new best model with test loss: 0.0293
Epoch 3 | Train Loss: 0.0290 | Test Loss: 0.0300
Epoch 4 | Train Loss: 0.0272 | Test Loss: 0.0282
Saved new best model with test loss: 0.0282
Epoch 5 | Train Loss: 0.0254 | Test Loss: 0.0276
Saved new best model with test loss: 0.0276
Epoch 6 | Train Loss: 0.0237 | Test Loss: 0.0276
Saved new best model with test loss: 0.0276
Epoch 7 | Train Loss: 0.0219 | Test Loss: 0.0277
Epoch 8 | Train Loss: 0.0202 | Test Loss: 0.0285
Epoch 9 | Train Loss: 0.0186 | Test Loss: 0.0287
Epoch 10 | Train Loss: 0.0172 | Test Loss: 0.0291


In [10]:
def recommend_for_user(user_id, top_n=10):
    model.load_state_dict(torch.load("best_model.pt"))
    model.eval()
    
    # Handle unknown users
    if user_id not in user_enc.classes_:
        return f"User {user_id} not seen during training"
    
    user_idx = user_enc.transform([user_id])[0]
    item_indices = torch.arange(num_items)
    user_tensor = torch.full_like(item_indices, user_idx)

    with torch.no_grad():
        preds = model(user_tensor, item_indices)

    top_items = torch.topk(preds, top_n).indices
    recommended_book_ids = item_enc.inverse_transform(top_items.numpy())
    return recommended_book_ids