### Baseline Models

In [None]:
pip install implicit


Collecting implicit
  Using cached implicit-0.7.2.tar.gz (70 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10790362 sha256=59447919390f35cfb703d31961b877bcc825637ca1f80ae3de0ff833b2fcc1e5
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import os
from implicit.als import AlternatingLeastSquares

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load data


In [None]:
output_dir = '/content/drive/MyDrive/IMDB_processed_data/'
movies = pd.read_csv(os.path.join(output_dir,"movies.csv"))
people = pd.read_csv(os.path.join(output_dir,"people.csv"))
rels = pd.read_csv(os.path.join(output_dir,"actor_director_movies_edges.csv"))

display(movies.head())

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,region
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance",6.4,92761,HK
1,tt0054724,movie,I Eat Your Skin,Zombie,0,1971,\N,92,Horror,3.6,1792,IN
2,tt0061592,movie,Doomsday Machine,Doomsday Machine,0,1976,\N,83,Sci-Fi,2.6,1526,CA
3,tt0063142,movie,Isle of the Snake People,La muerte viviente,0,1971,\N,90,"Horror,Mystery",3.4,1135,CA
4,tt0064451,movie,A Touch of Zen,Xia nü,0,1971,\N,200,"Action,Adventure,Drama",7.5,7996,XWW


## Cast Predicition using Traditional Collaborative Filtering through Alternating Least Scales (ALS)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

movie_map = {mid: i for i, mid in enumerate(rels["tconst"].unique())}
person_map = {pid: i for i, pid in enumerate(rels["nconst"].unique())}

rels["movie_id"] = rels["tconst"].map(movie_map)
rels["person_id"] = rels["nconst"].map(person_map)

unique_movies = rels["tconst"].unique()
# Get a test-train split for the movies
train_movies, test_movies = train_test_split(unique_movies, test_size=0.2, random_state=42)

train_df = rels[rels["tconst"].isin(train_movies)]
test_df = rels[rels["tconst"].isin(test_movies)]

print(f"Train movies: {len(train_movies)}, Test movies: {len(test_movies)}")
print(f"Train edges: {len(train_df)}, Test edges: {len(test_df)}")


R_train = csr_matrix(
    (np.ones(len(train_df)), (train_df["movie_id"], train_df["person_id"]))
)

model = AlternatingLeastSquares(
    factors=64,
    regularization=0.1,
    iterations=20,
    random_state=42,
    use_gpu=False,
)

model.fit(R_train.T)

movie_factors = model.user_factors
person_factors = model.item_factors


def recall_at_k(model, train_df, test_df, k=10):
    recall_scores = []

    test_truth = test_df.groupby("movie_id")["person_id"].apply(set).to_dict()
    train_edges = train_df.groupby("movie_id")["person_id"].apply(set).to_dict()

    for movie_id, true_people in test_truth.items():
        if movie_id >= model.user_factors.shape[0]:
            continue

        scores = model.item_factors @ model.user_factors[movie_id]
        topk = np.argpartition(-scores, k)[:k]
        hits = len(set(topk) & true_people)
        recall_scores.append(hits / len(true_people))

    return np.mean(recall_scores) if recall_scores else 0.0

recall100 = recall_at_k(model, train_df, test_df, k=10)
print(f"Cast Recommendation Recall@10 (movie-level): {recall100:.3f}")

movie_title_map = dict(zip(movies["tconst"], movies["primaryTitle"]))
inv_movie_map = {v: k for k, v in movie_map.items()}
inv_person_map = {v: k for k, v in person_map.items()}


sample_test = test_df.sample(1, random_state=1).iloc[0]
movie_id = sample_test["movie_id"]
movie_tconst = inv_movie_map[movie_id]
movie_title = movie_title_map.get(movie_tconst, movie_tconst)

print(f"\nTop recommended people for movie: {movie_title}")

scores = person_factors @ movie_factors[movie_id]
topk = np.argpartition(-scores, 10)[:10]
recommended_people = [inv_person_map[i] for i in topk]
recommended_names = people[people["nconst"].isin(recommended_people)]["primaryName"].tolist()

print(recommended_names)


Train movies: 29760, Test movies: 7441
Train edges: 319952, Test edges: 79897




  0%|          | 0/20 [00:00<?, ?it/s]

Cast Recommendation Recall@10 (movie-level): 0.000

Top recommended people for movie: Cruel Intentions
['Leelee Sobieski', 'Pavlik Jansen op de Haar', 'Mike van Diem', 'Tamar van den Dop', 'Frankie J. Galasso', 'Sam Huntington', 'Fedja van Huêt', 'Victor Löw', 'Betty Schuurman', 'Alan Wade']


## Rating Prediction using Node2Vec + XGBoost

In [None]:

!pip uninstall -y torch-geometric torch-sparse torch-scatter torch-cluster torch-spline-conv pyg-lib
import torch
torch_version = str(torch.__version__)
scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
cluster_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
!pip install torch-scatter -f $scatter_src
!pip install torch-sparse -f $sparse_src
!pip install torch-cluster -f $cluster_src
!pip install torch-geometric

[0mLooking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_scatter-2.1.2%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt28cu126
Looking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_sparse-0.6.18%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt28cu126
Looking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-clust

In [None]:
pip install cupy-cuda12x pandas scikit-learn torch-cluster



## Train Node2Vec to get node embeddings

In [None]:
import torch
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import cupy as cp
import os


output_dir = '/content/drive/MyDrive/IMDB_processed_data/'
movies = pd.read_csv(os.path.join(output_dir,"movies.csv"))
people = pd.read_csv(os.path.join(output_dir,"people.csv"))
rels = pd.read_csv(os.path.join(output_dir,"actor_director_movies_edges.csv"))

display(movies.head())
# Filter and map IDs
all_movies = rels["tconst"].unique()
all_people = rels["nconst"].unique()

movie_to_idx = {m: i for i, m in enumerate(all_movies)}
person_to_idx = {p: i + len(all_movies) for i, p in enumerate(all_people)}

# Build edge list: (movie, person)
src = [movie_to_idx[m] for m in rels["tconst"]]
dst = [person_to_idx[p] for p in rels["nconst"]]
edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)

num_nodes = len(all_movies) + len(all_people)
print(f"Graph nodes: {num_nodes}, edges: {edge_index.shape[1]}")

data = Data(edge_index=edge_index, num_nodes=num_nodes)


device = "cuda" if torch.cuda.is_available() else "cpu"
model = Node2Vec(
    data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=5,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True
).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=2)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

print("Training Node2Vec on GPU...")
for epoch in range(1, 6):
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}: loss = {total_loss:.3f}")

embeddings = model.forward()
print("Node2Vec embeddings computed.")



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,region
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance",6.4,92761,HK
1,tt0054724,movie,I Eat Your Skin,Zombie,0,1971,\N,92,Horror,3.6,1792,IN
2,tt0061592,movie,Doomsday Machine,Doomsday Machine,0,1976,\N,83,Sci-Fi,2.6,1526,CA
3,tt0063142,movie,Isle of the Snake People,La muerte viviente,0,1971,\N,90,"Horror,Mystery",3.4,1135,CA
4,tt0064451,movie,A Touch of Zen,Xia nü,0,1971,\N,200,"Action,Adventure,Drama",7.5,7996,XWW


Graph nodes: 200635, edges: 799698
Training Node2Vec on GPU...
Epoch 1: loss = 8168.254
Epoch 2: loss = 2920.983
Epoch 3: loss = 1862.009
Epoch 4: loss = 1543.920
Epoch 5: loss = 1425.314
Node2Vec embeddings computed.


## Run XGBoost on embeddings obtained from node2Vec

In [None]:
import xgboost as xgb
import cupy as cp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


movie_emb_cp = embeddings[:len(all_movies)].detach().to('cuda')
person_emb_cp = embeddings[len(all_movies):].detach().to('cuda')



# Split movies for train/test
train_movies, test_movies = train_test_split(
    movies["tconst"].unique(), test_size=0.2, random_state=42
)

train_df = rels[rels["tconst"].isin(train_movies)].merge(
    movies[["tconst", "averageRating"]], on="tconst", how="left"
)
test_df = rels[rels["tconst"].isin(test_movies)].merge(
    movies[["tconst", "averageRating"]], on="tconst", how="left"
)

print(train_df.head())
def make_features_gpu_vectorized(df, movie_to_idx, person_to_idx, movie_emb_cp, person_emb_cp, all_movies):

    # Map movie and person IDs to indices
    rels = df.copy()
    rels["movie_idx"] = rels["tconst"].map(movie_to_idx)
    rels["person_idx"] = rels["nconst"].map(person_to_idx) - len(all_movies)
    rels = rels.dropna(subset=["movie_idx", "person_idx"])

    # Group by movie to collect all connected person embeddings
    movie_features = []
    movie_ids = []
    for movie_id, group in rels.groupby("tconst"):
        movie_idx = int(group["movie_idx"].iloc[0])
        person_indices = group["person_idx"].astype(int).to_numpy()

        # Average all person embeddings for that movie
        person_vecs = person_emb_cp[person_indices]
        person_mean = person_vecs.mean(dim=0)

        # Get the movie embedding
        movie_vec = movie_emb_cp[movie_idx]

        # Concatenate [movie embedding | mean(person embeddings)]
        combined = cp.concatenate([movie_vec, person_mean])
        movie_features.append(combined)
        movie_ids.append(movie_id)

    # Stack into one array
    X = cp.stack(movie_features)
    movie_ids = np.array(movie_ids)

    # Match ratings
    y = movies.set_index("tconst").loc[movie_ids, "averageRating"].to_numpy()
    y = cp.asarray(y)

    return X, y


X_train, y_train = make_features_gpu_vectorized(train_df, movie_to_idx, person_to_idx, movie_emb_cp, person_emb_cp, all_movies)
X_test, y_test = make_features_gpu_vectorized(test_df, movie_to_idx, person_to_idx, movie_emb_cp, person_emb_cp, all_movies)

X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    cp.asnumpy(X_train), cp.asnumpy(y_train), test_size=0.1, random_state=42
)

dtrain = xgb.DMatrix(X_train_sub, label=y_train_sub)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(cp.asnumpy(X_test), label=cp.asnumpy(y_test))



dtrain = xgb.DMatrix(X_train.get(), label=y_train.get())
dtest = xgb.DMatrix(X_test.get(), label=y_test.get())

params = {
    "tree_method": "hist",
    "predictor": "gpu_predictor",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": 4,
    "learning_rate": 0.05,
    "reg_alpha": 2.0,
    "reg_lambda": 4.0,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

print("Training XGBoost with early stopping...")
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train"), (dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=20
)

preds = bst.predict(dtest)
rmse = mean_squared_error(cp.asnumpy(y_test), preds)
print(f"Rating Prediction RMSE: {rmse:.3f}")




      tconst  ordering     nconst category job         characters  \
0  tt0035423         1  nm0000212  actress  \N     ["Kate McKay"]   
1  tt0035423         2  nm0413168    actor  \N        ["Leopold"]   
2  tt0035423         3  nm0000630    actor  \N  ["Stuart Besser"]   
3  tt0035423         4  nm0005227    actor  \N  ["Charlie McKay"]   
4  tt0035423         5  nm0005169  actress  \N          ["Darci"]   

   averageRating  
0            6.4  
1            6.4  
2            6.4  
3            6.4  
4            6.4  
Training XGBoost with early stopping...


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:1.16182	val-rmse:1.14740
[20]	train-rmse:1.09482	val-rmse:1.08088
[40]	train-rmse:1.06314	val-rmse:1.04822
[60]	train-rmse:1.04278	val-rmse:1.02817
[80]	train-rmse:1.02718	val-rmse:1.01272
[100]	train-rmse:1.01370	val-rmse:0.99921
[120]	train-rmse:1.00191	val-rmse:0.98704
[140]	train-rmse:0.99122	val-rmse:0.97537
[160]	train-rmse:0.98213	val-rmse:0.96648
[180]	train-rmse:0.97319	val-rmse:0.95703
[200]	train-rmse:0.96482	val-rmse:0.94855
[220]	train-rmse:0.95717	val-rmse:0.94131
[240]	train-rmse:0.95005	val-rmse:0.93327
[260]	train-rmse:0.94339	val-rmse:0.92625
[280]	train-rmse:0.93630	val-rmse:0.91914
[300]	train-rmse:0.92976	val-rmse:0.91317
[320]	train-rmse:0.92329	val-rmse:0.90658
[340]	train-rmse:0.91688	val-rmse:0.89958
[360]	train-rmse:0.91060	val-rmse:0.89344
[380]	train-rmse:0.90500	val-rmse:0.88733
[400]	train-rmse:0.89923	val-rmse:0.88165
[420]	train-rmse:0.89372	val-rmse:0.87567
[440]	train-rmse:0.88873	val-rmse:0.87048
[460]	train-rmse:0.88310	val-rmse:0.8643

In [None]:
import cupy as cp
from sklearn.metrics import mean_squared_error


mean_rating = cp.mean(y_train)  # y_train is your CuPy array of ratings
print(f"Mean training rating: {mean_rating:.3f}")


y_pred_mean = cp.full_like(y_test, mean_rating)



rmse_baseline = mean_squared_error(cp.asnumpy(y_test), cp.asnumpy(y_pred_mean))
print(f"Baseline RMSE (predict mean rating): {rmse_baseline:.3f}")


Mean training rating: 6.185
Baseline RMSE (predict mean rating): 1.354
