In [3]:
# Install required libraries
!pip install pandas numpy torch torch-geometric scikit-learn sentence-transformers --quiet

In [4]:
import pandas as pd
import numpy as np

# Load MovieLens ratings and movies
ratings_df = pd.read_csv('/content/drive/MyDrive/Recommender System/ml-latest-small/ratings.csv')  # columns: userId, movieId, rating, timestamp
movies_df = pd.read_csv('/content/drive/MyDrive/Recommender System/ml-latest-small/movies.csv')    # columns: movieId, title, genres

# Load Wikipedia plots (ensure 'Title' and 'Plot' columns)
wiki_df = pd.read_csv('/content/drive/MyDrive/Recommender System/wiki_movie_plots_deduped.csv')

# Title normalization for matching
movies_df['title_lower'] = movies_df['title'].str.lower().str.replace(r'\([^)]*\)', '', regex=True).str.strip()
wiki_df['Title_lower'] = wiki_df['Title'].str.lower().str.strip()

# Merge MovieLens movies with Wikipedia plots
movie_plots = pd.merge(movies_df, wiki_df, left_on='title_lower', right_on='Title_lower', how='left')


Feature Engineering

In [5]:
from sentence_transformers import SentenceTransformer

plot_texts = movie_plots['Plot'].fillna('').tolist()
sbert = SentenceTransformer('all-MiniLM-L6-v2')
plot_embeddings = sbert.encode(plot_texts, show_progress_bar=True)  # shape: (num_movies, emb_dim)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

movie_plots['genres_list'] = movie_plots['genres'].fillna('').apply(lambda x: x.split('|') if x else [])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movie_plots['genres_list'])


In [7]:
movie_features = np.concatenate([plot_embeddings, genre_matrix], axis=1)


//Index Mapping//

In [8]:
user_id_map = {id_: idx for idx, id_ in enumerate(ratings_df['userId'].unique())}
movie_id_map = {id_: idx for idx, id_ in enumerate(movie_plots['movieId'].unique())}

ratings_df['user_idx'] = ratings_df['userId'].map(user_id_map)
ratings_df['movie_idx'] = ratings_df['movieId'].map(movie_id_map)
movie_plots['movie_idx'] = movie_plots['movieId'].map(movie_id_map)


In [27]:
data = HeteroData()
num_users = len(user_id_map)
num_movies = len(movie_id_map)
feature_dim = movie_features.shape[1]

# User nodes (dummy features)
data['user'].x = torch.zeros((num_users, feature_dim), dtype=torch.float)

# Movie nodes (features: plot+genre)
data['movie'].x = torch.tensor(movie_features, dtype=torch.float)

# User-movie edges
edge_index_user_movie = torch.tensor([
    ratings_df['user_idx'].values,
    ratings_df['movie_idx'].values
], dtype=torch.long)
data['user', 'rates', 'movie'].edge_index = edge_index_user_movie
data['movie', 'rev_rates', 'user'].edge_index = edge_index_user_movie.flip(0)


In [46]:
sim_matrix = cosine_similarity(movie_features)
k = 10

movie_src, movie_dst = [], []
for i in range(num_movies):
    top_k = np.argsort(sim_matrix[i])[-(k+1):-1]
    for j in top_k:
        movie_src.append(i)
        movie_dst.append(j)
edge_index_movie_movie = torch.tensor([movie_src, movie_dst], dtype=torch.long)
data['movie', 'similar', 'movie'].edge_index = edge_index_movie_movie

print(f"Number of movie-movie edges: {len(movie_src)}")


Number of movie-movie edges: 97420


In [47]:
num_edges = edge_index_user_movie.shape[1]
indices = np.arange(num_edges)
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
val_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=42)

def mask_edges(idx):
    return edge_index_user_movie[:, idx], torch.tensor(ratings_df.iloc[idx]['rating'].values, dtype=torch.float)

train_edges, train_ratings = mask_edges(train_idx)
val_edges, val_ratings = mask_edges(val_idx)
test_edges, test_ratings = mask_edges(test_idx)

# Assign only train edges to graph for training
data['user', 'rates', 'movie'].edge_index = train_edges
data['movie', 'rev_rates', 'user'].edge_index = train_edges.flip(0)


In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move node features to device
data['user'].x = data['user'].x.to(device)
data['movie'].x = data['movie'].x.to(device)

# Move all edge indices to device
for edge_type in data.edge_types:
    data[edge_type].edge_index = data[edge_type].edge_index.to(device)

# Move split tensors to device
train_users, train_movies = train_edges[0].to(device), train_edges[1].to(device)
val_users, val_movies = val_edges[0].to(device), val_edges[1].to(device)
test_users, test_movies = test_edges[0].to(device), test_edges[1].to(device)

train_ratings_t = train_ratings.to(device)
val_ratings_t = val_ratings.to(device)
test_ratings_t = test_ratings.to(device)

# Print to confirm all are on the same device
print("User node features device:", data['user'].x.device)
print("Movie node features device:", data['movie'].x.device)
for edge_type in data.edge_types:
    print(f"{edge_type} edge_index device:", data[edge_type].edge_index.device)
print("train_users device:", train_users.device)
print("train_movies device:", train_movies.device)
print("train_ratings_t device:", train_ratings_t.device)


User node features device: cuda:0
Movie node features device: cuda:0
('user', 'rates', 'movie') edge_index device: cuda:0
('movie', 'rev_rates', 'user') edge_index device: cuda:0
('movie', 'similar', 'movie') edge_index device: cuda:0
train_users device: cuda:0
train_movies device: cuda:0
train_ratings_t device: cuda:0


In [49]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, metadata):
        super().__init__()
        self.conv1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels)
            for edge_type in metadata[1]
        }, aggr='sum')
        self.conv2 = HeteroConv({
            edge_type: SAGEConv((hidden_channels, hidden_channels), hidden_channels)
            for edge_type in metadata[1]
        }, aggr='sum')
    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {k: F.relu(v) for k, v in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

class MoRGHModel(torch.nn.Module):
    def __init__(self, hidden_channels, metadata):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, metadata)
        self.decoder = nn.Sequential(
            nn.Linear(hidden_channels * 2, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1)
        )
    def forward(self, x_dict, edge_index_dict, user_movie_edges):
        embeddings = self.encoder(x_dict, edge_index_dict)
        user_emb = embeddings['user']
        movie_emb = embeddings['movie']
        users = user_movie_edges[0]
        movies = user_movie_edges[1]
        edge_emb = torch.cat([user_emb[users], movie_emb[movies]], dim=1)
        rating_pred = self.decoder(edge_emb).squeeze()
        return rating_pred

model = MoRGHModel(hidden_channels=64, metadata=data.metadata()).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()


In [61]:
for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()
    pred = model(data.x_dict, data.edge_index_dict, (train_users, train_movies))
    loss = loss_fn(pred, train_ratings_t)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_pred = model(data.x_dict, data.edge_index_dict, (val_users, val_movies))
        val_loss = loss_fn(val_pred, val_ratings_t)
    if epoch % 5 == 0:
        print(f'Epoch {epoch:03d}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')


Epoch 005, Train Loss: 0.8905, Val Loss: 0.9583
Epoch 010, Train Loss: 0.8888, Val Loss: 0.9586
Epoch 015, Train Loss: 0.8870, Val Loss: 0.9595
Epoch 020, Train Loss: 0.8848, Val Loss: 0.9543
Epoch 025, Train Loss: 0.8824, Val Loss: 0.9518
Epoch 030, Train Loss: 0.8994, Val Loss: 0.9899
Epoch 035, Train Loss: 0.8959, Val Loss: 0.9627
Epoch 040, Train Loss: 0.8786, Val Loss: 0.9464
Epoch 045, Train Loss: 0.8806, Val Loss: 0.9494
Epoch 050, Train Loss: 0.8734, Val Loss: 0.9439
Epoch 055, Train Loss: 0.8723, Val Loss: 0.9441
Epoch 060, Train Loss: 0.8692, Val Loss: 0.9430
Epoch 065, Train Loss: 0.8673, Val Loss: 0.9421
Epoch 070, Train Loss: 0.8659, Val Loss: 0.9447
Epoch 075, Train Loss: 0.8638, Val Loss: 0.9456
Epoch 080, Train Loss: 0.8633, Val Loss: 0.9382
Epoch 085, Train Loss: 0.8600, Val Loss: 0.9405
Epoch 090, Train Loss: 0.8592, Val Loss: 0.9506
Epoch 095, Train Loss: 0.9145, Val Loss: 0.9406
Epoch 100, Train Loss: 0.8626, Val Loss: 0.9497


In [62]:
model.eval()
with torch.no_grad():
    test_pred = model(data.x_dict, data.edge_index_dict, (test_users, test_movies))
    rmse = torch.sqrt(loss_fn(test_pred, test_ratings_t))
print(f'Test RMSE: {rmse.item():.4f}')


Test RMSE: 0.9842
