In [None]:
# Notebook setup: switch to the project directory, import dependencies, seed the RNG.
# NOTE(review): the %cd target below is tied to one machine's home-directory layout; the
# relative parquet path read by the loading cell resolves against this directory.
%cd ~/Documents/comp3610A3
import polars as pl
import numpy as np
import implicit
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from scipy.sparse import coo_matrix, csr_array
from amazon.models import normalize_scores  
import random
# Seeds only Python's stdlib `random` module (the one `random.sample` draws from in the
# demo cell); the split and the model receive their own random_state=42 arguments.
random.seed(42)

C:\Users\Lorenzo\Documents\comp3610A3


In [34]:
# Build one lazy query over the processed ratings file; nothing is read until .collect().
lf: pl.LazyFrame = (
    pl.scan_parquet("data/processed/amazon-2023.parquet")
    .select(["user_id", "asin", "rating"])
    # Keep only rows whose user_id occurs at least 5 times in the file.
    .filter(pl.len().over("user_id") >= 5)
    # Re-encode both string ids as categoricals, then drop the raw string columns.
    .with_columns(
        pl.col("user_id").cast(pl.Categorical).alias("user_cat"),
        pl.col("asin").cast(pl.Categorical).alias("item_cat"),
    )
    .drop(["user_id", "asin"])
    # Expose each categorical's underlying integer code as its own index column.
    .with_columns(
        pl.col("user_cat").to_physical().alias("user_idx"),
        pl.col("item_cat").to_physical().alias("item_idx"),
    )
)

# Execute the query and materialise the result in memory.
df: pl.DataFrame = lf.collect()

In [35]:
# Distinct categorical values (sorted), kept available under their original names.
unique_users, unique_items = df["user_cat"].unique().sort(), df["item_cat"].unique().sort()

# Code -> original-id lookups, built from the (code, label) pairs actually present in the
# frame. Pairing enumerate() positions of the sorted categories with the physical codes is
# only correct when the codes are contiguous from 0 AND the sort order equals code order;
# neither is guaranteed (it depends on the polars version / string-cache configuration),
# and when it fails the demo prints the wrong user ids and ASINs.
user_pairs = df.select("user_idx", pl.col("user_cat").cast(pl.String)).unique()
item_pairs = df.select("item_idx", pl.col("item_cat").cast(pl.String)).unique()
user_id_map = dict(zip(user_pairs["user_idx"].to_list(), user_pairs["user_cat"].to_list()))
item_id_map = dict(zip(item_pairs["item_idx"].to_list(), item_pairs["item_cat"].to_list()))

# Flat NumPy views of the three columns used for the interaction matrix.
ratings_np = df['rating'].cast(pl.Float32).to_numpy()
user_indices_np = df['user_idx'].to_numpy()
item_indices_np = df['item_idx'].to_numpy()

# Size the matrix by the largest code + 1 so every index is a valid row/column even if the
# codes have gaps (for contiguous 0-based codes this equals the number of distinct ids).
num_users, num_items = int(user_indices_np.max()) + 1, int(item_indices_np.max()) + 1

# Random 80/20 split over row positions; the fixed random_state makes it repeatable.
train_indices, test_indices = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

train_user_indices, train_item_indices = user_indices_np[train_indices], item_indices_np[train_indices]
train_ratings = ratings_np[train_indices]

test_user_indices, test_item_indices = user_indices_np[test_indices], item_indices_np[test_indices]
test_ratings = ratings_np[test_indices]

In [36]:
# Assemble the training interactions as a (num_users x num_items) matrix: rows are user
# codes, columns are item codes, values are the ratings. Built in COO form from the three
# parallel arrays, then converted to CSR.
# NOTE(review): SciPy documents that duplicate (row, col) entries are summed on COO -> CSR
# conversion — confirm the training split has no repeated (user, item) pairs.
# NOTE(review): coo_matrix(...).tocsr() yields a csr_matrix, so the csr_array annotation
# looks inaccurate — confirm which type is intended.
train_sparse_csr: csr_array = coo_matrix(
    (train_ratings, (train_user_indices, train_item_indices)),
    shape=(num_users, num_items),
).tocsr()

In [37]:
# Configure an ALS matrix-factorisation model from the `implicit` library and fit it on
# the training matrix. After fitting, the demo and evaluation cells read the learned
# embeddings from model.user_factors and model.item_factors.
model = implicit.als.AlternatingLeastSquares(
    factors=50,           # width of each user/item embedding
    regularization=0.01,
    iterations=15,        # matches the 15-step progress bar shown below the cell
    random_state=42,      # fixed seed for repeatable results
    use_gpu=False,        # stay on the CPU implementation
)
model.fit(train_sparse_csr)

  0%|          | 0/15 [00:00<?, ?it/s]

In [38]:
# Evaluate on the held-out pairs: a pair's score is the dot product of the user's and the
# item's latent-factor rows.
# NOTE(review): `implicit`'s ALS is an implicit-feedback model, so these scores are
# presumably not on the 1-5 rating scale; the recorded RMSE (~4.49) looks dominated by
# that scale mismatch — confirm whether a rating-scale comparison is really intended.

# Keep only pairs whose indices exist in the factor matrices. The SAME mask is applied to
# the true ratings so prediction i is always compared with rating i; slicing the ratings
# by count (test_ratings[:n]) would misalign every pair after the first dropped one.
scorable = (test_user_indices < model.user_factors.shape[0]) & (
    test_item_indices < model.item_factors.shape[0]
)
scored_users = test_user_indices[scorable]
scored_items = test_item_indices[scorable]
actual_ratings_test = test_ratings[scorable]

# Row-wise dot products, computed in batches so the gathered factor rows stay small.
batch_size = 262_144
predicted_scores = np.empty(len(scored_users), dtype=model.user_factors.dtype)
for start in range(0, len(scored_users), batch_size):
    rows = slice(start, start + batch_size)
    predicted_scores[rows] = np.einsum(
        "ij,ij->i",
        model.user_factors[scored_users[rows]],
        model.item_factors[scored_items[rows]],
    )

# Preserve the original name and list type for any later cell that reads it.
predicted_ratings_test: list[float] = predicted_scores.tolist()

if predicted_ratings_test:
    rmse: float = root_mean_squared_error(actual_ratings_test, predicted_scores)
    print(f"Root-mean-square error (RMSE) on test data = {rmse:.4f}")
else:
    print("Warning: No test ratings could be predicted.")
    rmse = float('nan')

Root-mean-square error (RMSE) on test data = 4.4926


In [39]:
# %% Demo: show the five best not-yet-seen items for three randomly drawn test users
unique_test_users: np.ndarray = np.unique(test_user_indices)
demo_users: list[int] = random.sample(list(unique_test_users), 3)

for user_idx in demo_users:
    # One score per item: this user's factor vector against every item's factor vector.
    scores = model.user_factors[user_idx].dot(model.item_factors.T)

    # Items this user already has in the training split are pushed to the bottom.
    train_rows_of_user = train_user_indices == user_idx
    seen = set(train_item_indices[train_rows_of_user])
    scores[list(seen)] = -np.inf  # mask seen

    # Rescale with the project helper (its exact output range is defined in amazon.models).
    scores: np.ndarray = normalize_scores(scores)

    # argpartition finds the five largest scores without a full sort; those five are then
    # ordered from highest to lowest.
    best_unordered = np.argpartition(scores, -5)[-5:]
    descending = np.argsort(-scores[best_unordered])
    top5_idx: np.ndarray = best_unordered[descending]

    print(f"\nUser {user_id_map[user_idx]} (idx={user_idx}):")
    for item_idx in top5_idx:
        print(f"  ASIN {item_id_map[item_idx]} — pred. rating {scores[item_idx]:.2f}")


User AGCYDOSYCLJAPBNO4T7CUKBSXRDA (idx=33889):
  ASIN B001IKCGLM — pred. rating 5.00
  ASIN B00RQPUV0Q — pred. rating 4.51
  ASIN B07FZ8S74R — pred. rating 4.12
  ASIN B01LXZDPDR — pred. rating 3.82
  ASIN B000P0LNRE — pred. rating 3.65

User AGLKX4WZSC5DTW2N657GC7XDF66Q (idx=7639):
  ASIN B005UEB96K — pred. rating 5.00
  ASIN B01415QHYW — pred. rating 3.76
  ASIN B00081J3N6 — pred. rating 3.66
  ASIN B009EDSWJA — pred. rating 3.34
  ASIN B00ECHYTBI — pred. rating 3.32

User AFVYFZDT62MIXIEGU4IAOAAOYWXQ (idx=83994):
  ASIN B00TSUGXKE — pred. rating 5.00
  ASIN B00IKM5N02 — pred. rating 4.61
  ASIN B00ZV9RDKK — pred. rating 4.54
  ASIN B01DFKC2SO — pred. rating 4.11
  ASIN B010Q57T02 — pred. rating 3.99
