In [45]:
import pandas as pd
import numpy as np

books = pd.read_csv(
    'BX-Books.csv',
    sep=';',
    encoding='latin-1',
    engine='python',
    on_bad_lines='skip'
)

users = pd.read_csv(
    'BX-Users.csv',
    sep=';',
    encoding='latin-1',
    engine='python',
    on_bad_lines='skip'
)

ratings = pd.read_csv(
    'BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
    engine='python',
    on_bad_lines='skip'
)


In [46]:
books.drop(columns=['Image-URL-S' , 'Image-URL-M'] , inplace=True)

books.rename(columns={'Book-Title':'title',
                      'Book-Author':'author',
                      'Year-Of-Publication':'year',
                      'Publisher':'publisher',
                      'Image-URL-L':'image-url',
                      'ISBN':'isbn'} , inplace=True)



In [47]:
users.rename(columns={'User-ID':'user_id','Location':'location','Age':'age'} , inplace=True)

In [48]:
ratings.rename(columns={'User-ID':'user_id','ISBN':'isbn','Book-Rating':'rating'} , inplace=True)

In [49]:
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [50]:
ratings = ratings[ratings['rating'] > 0]
ratings.head()


Unnamed: 0,user_id,isbn,rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [51]:
# Filter users
valid_users = ratings['user_id'].value_counts()
valid_users = valid_users[valid_users >= 5].index
ratings = ratings[ratings['user_id'].isin(valid_users)]

# Filter books
valid_items = ratings['isbn'].value_counts()
valid_items = valid_items[valid_items >= 5].index
ratings = ratings[ratings['isbn'].isin(valid_items)]

ratings.shape


(141081, 3)

In [52]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

ratings['user'] = user_encoder.fit_transform(ratings['user_id'])
ratings['item'] = item_encoder.fit_transform(ratings['isbn'])

num_users = ratings['user'].nunique()
num_items = ratings['item'].nunique()

num_users, num_items


(13030, 11234)

In [53]:
ratings = ratings.sort_values(['user', 'rating'])


In [54]:
# last interaction → test
test_df = ratings.groupby('user').tail(1)

# rest → train
train_df = ratings.drop(test_df.index)

train_df.shape, test_df.shape


((128051, 5), (13030, 5))

In [55]:
import torch

user_tensor = torch.LongTensor(train_df['user'].values)
item_tensor = torch.LongTensor(train_df['item'].values)


In [56]:
from collections import defaultdict

train_interactions = defaultdict(set)

for u, i in zip(train_df['user'], train_df['item']):
    train_interactions[u].add(i)


In [57]:
import numpy as np
from scipy.sparse import csr_matrix

num_users = ratings['user'].nunique()
num_items = ratings['item'].nunique()

# total nodes
num_nodes = num_users + num_items

# shift item IDs so they don't overlap with users
item_ids_shifted = train_df['item'].values + num_users
user_ids = train_df['user'].values

# for bipartite graph: user ↔ item
rows = np.concatenate([user_ids, item_ids_shifted])
cols = np.concatenate([item_ids_shifted, user_ids])

data = np.ones(len(rows))

adj_matrix = csr_matrix((data, (rows, cols)), shape=(num_nodes, num_nodes))

adj_matrix


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 256102 stored elements and shape (24264, 24264)>

In [58]:
import torch
import scipy.sparse as sp

def normalize_adj_matrix(adj):
    # convert to COO format
    adj = adj.tocoo()
    
    # compute degree of each node
    rowsum = np.array(adj.sum(axis=1)).flatten()

    # d^{-1/2}
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.

    # diagonal matrix
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

    # normalized adjacency
    return d_mat_inv_sqrt @ adj @ d_mat_inv_sqrt

norm_adj_matrix = normalize_adj_matrix(adj_matrix)
norm_adj_matrix


  d_inv_sqrt = np.power(rowsum, -0.5)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 256102 stored elements and shape (24264, 24264)>

In [59]:
def convert_sp_mat_to_sp_tensor(sp_mat):
    sp_mat = sp_mat.tocoo()
    indices = torch.from_numpy(
        np.vstack((sp_mat.row, sp_mat.col)).astype(np.int64)
    )
    values = torch.from_numpy(sp_mat.data.astype(np.float32))
    shape = torch.Size(sp_mat.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

norm_adj_tensor = convert_sp_mat_to_sp_tensor(norm_adj_matrix)
norm_adj_tensor


tensor(indices=tensor([[    1,     1,     1,  ..., 24262, 24263, 24263],
                       [14517, 14525, 14842,  ...,  3531,  6320, 12906]]),
       values=tensor([0.1667, 0.1508, 0.2041,  ..., 0.4082, 0.1000, 0.3162]),
       size=(24264, 24264), nnz=256102, layout=torch.sparse_coo)

In [60]:
import torch
import torch.nn as nn

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, num_layers, norm_adj_matrix):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        
        # user + item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # initialize embeddings
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)

        # store adjacency
        self.norm_adj_matrix = norm_adj_matrix.coalesce()

    def propagate(self):
        """Perform LightGCN propagation"""
        all_embeddings = []
        
        # initial embeddings (layer 0)
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight
        
        emb = torch.cat([user_emb, item_emb])
        all_embeddings.append(emb)
        
        # message passing
        for layer in range(self.num_layers):
            emb = torch.sparse.mm(self.norm_adj_matrix, emb)
            all_embeddings.append(emb)
        
        # mean of all layers
        final_emb = torch.stack(all_embeddings, dim=1).mean(dim=1)
        
        # split back
        final_user_emb = final_emb[:self.num_users]
        final_item_emb = final_emb[self.num_users:]
        
        return final_user_emb, final_item_emb

    def forward(self, users, pos_items, neg_items):
        user_emb, item_emb = self.propagate()

        u = user_emb[users]
        pos = item_emb[pos_items]
        neg = item_emb[neg_items]
        
        # BPR scoring
        pos_score = torch.sum(u * pos, dim=1)
        neg_score = torch.sum(u * neg, dim=1)

        return pos_score, neg_score


In [61]:
def bpr_loss(pos_scores, neg_scores, lambda_reg, user_emb, pos_emb, neg_emb):
    mf_loss = -torch.mean(torch.log(torch.sigmoid(pos_scores - neg_scores)))
    
    reg_loss = lambda_reg * (
        user_emb.pow(2).mean() +
        pos_emb.pow(2).mean() +
        neg_emb.pow(2).mean()
    )
    
    return mf_loss + reg_loss


In [62]:
import numpy as np

def negative_sampling(user_ids, train_interactions, num_items):
    neg_items = []
    for u in user_ids:
        while True:
            neg = np.random.randint(0, num_items)
            if neg not in train_interactions[u]:
                neg_items.append(neg)
                break
    return torch.LongTensor(neg_items)


In [69]:
from torch.utils.data import DataLoader, TensorDataset

def train_lightgcn(model, train_df, train_interactions, epochs=15, batch_size=2048, lambda_reg=1e-4, lr=0.001):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    users = torch.LongTensor(train_df['user'].values)
    items = torch.LongTensor(train_df['item'].values)

    dataset = TensorDataset(users, items)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        
        for batch_users, batch_pos_items in loader:
            
            # sample negative items
            batch_neg_items = negative_sampling(batch_users.numpy(), train_interactions, model.num_items)

            # forward pass
            pos_scores, neg_scores = model(batch_users, batch_pos_items, batch_neg_items)

            # get final embeddings for loss regularization
            final_user_emb, final_item_emb = model.propagate()
            u_emb = final_user_emb[batch_users]
            pos_emb = final_item_emb[batch_pos_items]
            neg_emb = final_item_emb[batch_neg_items]

            # loss
            loss = bpr_loss(pos_scores, neg_scores, lambda_reg, u_emb, pos_emb, neg_emb)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs} | Loss = {total_loss:.4f}")


In [71]:
embedding_dim = 64
num_layers = 3

model = LightGCN(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    norm_adj_matrix=norm_adj_tensor
)

train_lightgcn(model, train_df, train_interactions, epochs=17)


Epoch 1/17 | Loss = 43.4322
Epoch 2/17 | Loss = 40.2789
Epoch 3/17 | Loss = 35.8292
Epoch 4/17 | Loss = 33.0690
Epoch 5/17 | Loss = 31.3936
Epoch 6/17 | Loss = 30.1939
Epoch 7/17 | Loss = 29.0846
Epoch 8/17 | Loss = 28.3388
Epoch 9/17 | Loss = 27.3250
Epoch 10/17 | Loss = 26.7936
Epoch 11/17 | Loss = 25.9306
Epoch 12/17 | Loss = 25.3387
Epoch 13/17 | Loss = 24.6165
Epoch 14/17 | Loss = 24.0829
Epoch 15/17 | Loss = 23.4760
Epoch 16/17 | Loss = 22.8542
Epoch 17/17 | Loss = 22.4615


In [72]:
from collections import defaultdict

test_interactions = defaultdict(set)
for u, i in zip(test_df['user'], test_df['item']):
    test_interactions[u].add(i)


In [73]:
user_emb, item_emb = model.propagate()
user_emb = user_emb.detach()
item_emb = item_emb.detach()


In [74]:
import numpy as np
import torch

def recall_at_k(model, user_emb, item_emb, test_interactions, train_interactions, k=10):

    recalls = []

    all_item_ids = torch.arange(item_emb.shape[0])

    for user in test_interactions:
        true_items = test_interactions[user]

        if len(true_items) == 0:
            continue

        # items this user interacted with in training
        train_items = train_interactions[user]

        # score all items
        scores = torch.matmul(user_emb[user], item_emb.T)

        # mask training interactions (don't recommend items user already read)
        scores[list(train_items)] = -1e9

        # top-k predicted items
        top_k_items = torch.topk(scores, k=k).indices.numpy()

        # count how many correct hits
        hits = sum([1 for item in true_items if item in top_k_items])

        recall = hits / len(true_items)
        recalls.append(recall)

    return np.mean(recalls)


In [75]:
recall10 = recall_at_k(model, user_emb, item_emb, test_interactions, train_interactions, k=10)
recall20 = recall_at_k(model, user_emb, item_emb, test_interactions, train_interactions, k=20)

print("Recall@10:", recall10)
print("Recall@20:", recall20)


Recall@10: 0.022563315425940138
Recall@20: 0.03960092095165004


### Improving the performance
Convert ratings → implicit (1/0)
Filter sparse users/items
Increase embedding size
Train for 100 epochs
Tune hyperparameters
Re-evaluate Recall@K

In [76]:
import pandas as pd

# df_ratings must have: user_id, isbn, rating
implicit_df = ratings.copy()

# convert to implicit feedback
implicit_df['rating'] = (implicit_df['rating'] >= 7).astype(int)

# keep only positive interactions
implicit_df = implicit_df[implicit_df['rating'] == 1]

print("Positive interactions:", len(implicit_df))


Positive interactions: 111435


In [77]:
# filter users
user_counts = implicit_df['user_id'].value_counts()
implicit_df = implicit_df[implicit_df['user_id'].isin(user_counts[user_counts >= 5].index)]

# filter items
item_counts = implicit_df['isbn'].value_counts()
implicit_df = implicit_df[implicit_df['isbn'].isin(item_counts[item_counts >= 5].index)]

print("After filtering:", implicit_df.shape)


After filtering: (81785, 5)


In [90]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

implicit_df["user_id_enc"] = user_encoder.fit_transform(implicit_df["user_id"])
implicit_df["isbn_enc"] = item_encoder.fit_transform(implicit_df["isbn"])

num_users = implicit_df["user_id_enc"].nunique()
num_items = implicit_df["isbn_enc"].nunique()

print("Users:", num_users, "Items:", num_items)


Users: 5720 Items: 6821


In [91]:
import numpy as np

# create new IDs
user_encoder = {u: i for i, u in enumerate(implicit_df['user_id'].unique())}
item_encoder = {i: j for j, i in enumerate(implicit_df['isbn'].unique())}

implicit_df['user_id_enc'] = implicit_df['user_id'].map(user_encoder)
implicit_df['isbn_enc'] = implicit_df['isbn'].map(item_encoder)

num_users = len(user_encoder)
num_items = len(item_encoder)

num_users, num_items


(5720, 6821)

In [92]:
train_df = implicit_df[["user_id_enc", "isbn_enc"]].copy()


In [93]:
train_interactions = (
    implicit_df.groupby("user_id_enc")["isbn_enc"]
    .apply(set)
    .to_dict()
)

# ensure all users exist
for u in range(num_users):
    if u not in train_interactions:
        train_interactions[u] = set()


In [94]:
from torch.utils.data import DataLoader, TensorDataset

def train_lightgcn(model, train_df, train_interactions, epochs=15, batch_size=2048, lambda_reg=1e-4, lr=0.001):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # FIXED COLUMN NAMES
    users = torch.LongTensor(train_df['user_id_enc'].values)
    items = torch.LongTensor(train_df['isbn_enc'].values)

    dataset = TensorDataset(users, items)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        
        for batch_users, batch_pos_items in loader:
            
            # sample negative items
            batch_neg_items = negative_sampling(
                batch_users.numpy(),
                train_interactions,
                model.num_items
            )

            # forward pass
            pos_scores, neg_scores = model(batch_users, batch_pos_items, batch_neg_items)

            # final embeddings
            final_user_emb, final_item_emb = model.propagate()
            u_emb = final_user_emb[batch_users]
            pos_emb = final_item_emb[batch_pos_items]
            neg_emb = final_item_emb[batch_neg_items]

            # BPR loss
            loss = bpr_loss(pos_scores, neg_scores, lambda_reg, u_emb, pos_emb, neg_emb)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs} | Loss = {total_loss:.4f}")


In [79]:
import scipy.sparse as sp

rows = implicit_df['user_id_enc'].values
cols = implicit_df['isbn_enc'].values

interaction_matrix = sp.coo_matrix(
    (np.ones(len(implicit_df)), (rows, cols)),
    shape=(num_users, num_items)
).tocsr()

interaction_matrix


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 81785 stored elements and shape (5720, 6821)>

## Retrainig the model again

In [87]:
train_interactions = (
    implicit_df.groupby("user_id_enc")["isbn_enc"]
    .apply(set)
    .to_dict()
)


In [88]:
max(train_interactions.keys()), num_users


(5719, 5720)

In [99]:
num_users = train_df['user_id_enc'].max() + 1
num_items = train_df['isbn_enc'].max() + 1

print("num_users:", num_users)
print("num_items:", num_items)
print("total nodes:", num_users + num_items)


num_users: 5720
num_items: 6821
total nodes: 12541


In [100]:
import scipy.sparse as sp
import numpy as np
import torch

def build_adj_matrix(train_df, num_users, num_items):
    rows = train_df['user_id_enc'].values
    cols = train_df['isbn_enc'].values + num_users  # item index shifted

    data = np.ones(len(train_df))

    # bipartite graph edges (user → item)
    adjacency = sp.coo_matrix(
        (data, (rows, cols)),
        shape=(num_users + num_items, num_users + num_items)
    )

    # symmetric edges (item → user)
    adjacency = adjacency + adjacency.T  

    return adjacency.tocsr()

adj_matrix = build_adj_matrix(train_df, num_users, num_items)


In [102]:
def normalize_adj(adj):
    rowsum = np.array(adj.sum(axis=1)).flatten()
    d_inv = np.power(rowsum, -0.5)
    d_inv[np.isinf(d_inv)] = 0.0
    d_mat = sp.diags(d_inv)
    return d_mat @ adj @ d_mat

norm_adj = normalize_adj(adj_matrix)


In [103]:
def convert_sparse_matrix_to_torch_sparse(matrix):
    matrix = matrix.tocoo()
    indices = torch.from_numpy(np.vstack((matrix.row, matrix.col)).astype(np.int64))
    values  = torch.from_numpy(matrix.data.astype(np.float32))
    shape   = torch.Size(matrix.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

norm_adj_tensor = convert_sparse_matrix_to_torch_sparse(norm_adj)


In [104]:
model = LightGCN(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=64,
    num_layers=3,
    norm_adj_matrix=norm_adj_tensor
)

train_lightgcn(
    model=model,
    train_df=train_df,
    train_interactions=train_interactions,
    batch_size=4096,
    epochs=100,
    lambda_reg=1e-4,
    lr=0.001
)


Epoch 1/100 | Loss = 13.8582
Epoch 2/100 | Loss = 13.8252
Epoch 3/100 | Loss = 13.6714
Epoch 4/100 | Loss = 13.3033
Epoch 5/100 | Loss = 12.7412
Epoch 6/100 | Loss = 12.1060
Epoch 7/100 | Loss = 11.4923
Epoch 8/100 | Loss = 10.9762
Epoch 9/100 | Loss = 10.5137
Epoch 10/100 | Loss = 10.1415
Epoch 11/100 | Loss = 9.8226
Epoch 12/100 | Loss = 9.5566
Epoch 13/100 | Loss = 9.3453
Epoch 14/100 | Loss = 9.1548
Epoch 15/100 | Loss = 8.9541
Epoch 16/100 | Loss = 8.8072
Epoch 17/100 | Loss = 8.6630
Epoch 18/100 | Loss = 8.4986
Epoch 19/100 | Loss = 8.3884
Epoch 20/100 | Loss = 8.2739
Epoch 21/100 | Loss = 8.1765
Epoch 22/100 | Loss = 8.0467
Epoch 23/100 | Loss = 7.8985
Epoch 24/100 | Loss = 7.9130
Epoch 25/100 | Loss = 7.7637
Epoch 26/100 | Loss = 7.6491
Epoch 27/100 | Loss = 7.5762
Epoch 28/100 | Loss = 7.4621
Epoch 29/100 | Loss = 7.4115
Epoch 30/100 | Loss = 7.3290
Epoch 31/100 | Loss = 7.1471
Epoch 32/100 | Loss = 7.1407
Epoch 33/100 | Loss = 7.0558
Epoch 34/100 | Loss = 6.9135
Epoch 35/100 

In [105]:
# ground truth interactions from test set (encoded IDs)
test_interactions = (
    test_df.groupby("user_id_enc")["isbn_enc"]
           .apply(set)
           .to_dict()
)


KeyError: 'user_id_enc'