<a href="https://colab.research.google.com/github/asalem2/E-Commerce-Product-Recommendation-System/blob/main/als.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install implicit

In [None]:
# Cell 1: Install and download the April 2020 file

# Install gdown and other dependencies
!pip install gdown implicit pandas scipy --quiet

# Use gdown to fetch the file into /content
import gdown

file_id = "1uAKPWKwSWJWDmnnNWw4EQExQfhbiYLD5"
url     = f"https://drive.google.com/uc?id={file_id}"
output  = "/content/2020-Apr.csv.gz"
gdown.download(url, output, quiet=False)


In [None]:
# Cell 2: Load 10 M rows, remap IDs, and build inv_item_map

import pandas as pd

# 1) Read only the first 10 M events; event_time is parsed into datetimes
df = pd.read_csv(
    "/content/2020-Apr.csv.gz",
    parse_dates=["event_time"],
    nrows=10_000_000,
)

# 2) Give every distinct user / product a dense zero-based index
#    (numbered in order of first appearance) for ALS
user_ids = df["user_id"].unique()
item_ids = df["product_id"].unique()
user_map = dict(zip(user_ids, range(len(user_ids))))
item_map = dict(zip(item_ids, range(len(item_ids))))
df["user_idx"] = df["user_id"].map(user_map)
df["item_idx"] = df["product_id"].map(item_map)

# 3) Reverse lookup (item index -> original product_id) so that
#    recommended indices can be translated back to product ids
inv_item_map = dict(zip(item_map.values(), item_map.keys()))


In [None]:
# Cell 2b: Build a fresh inverse lookup
# (item index -> product_id, i.e. item_map with keys and values swapped)
inv_item_map = dict(zip(item_map.values(), item_map.keys()))


In [None]:
# Cell 3: Build implicit-feedback confidence matrix

from scipy.sparse import coo_matrix

# Assign event-type confidence weights.
# Event types that are not in weight_map end up with confidence 0.0.
weight_map = {"view": 1.0, "cart": 3.0, "purchase": 5.0}
df["confidence"] = df["event_type"].map(weight_map).fillna(0.0)

# Build item×user sparse matrix.
# Repeated (item, user) pairs are summed when the COO matrix is converted
# to CSR, so several events on the same product accumulate confidence.
n_users = len(user_ids)
n_items = len(item_ids)
conf_mat = coo_matrix(
    (df["confidence"], (df["item_idx"], df["user_idx"])),
    shape=(n_items, n_users)
).tocsr()

# Zero-confidence events would otherwise stay in the matrix as explicitly
# stored entries; drop them so only real interactions are stored.
conf_mat.eliminate_zeros()


In [None]:
# Cell 4: Train the ALS model

from implicit.als import AlternatingLeastSquares

als = AlternatingLeastSquares(
    factors=50,
    regularization=0.01,
    iterations=15,
    use_gpu=False
)

# conf_mat is item×user, but fit() in implicit>=0.5 expects a user×item CSR
# matrix (rows = users, columns = items). Fitting on the untransposed matrix
# would silently swap the roles of users and items in the learned factors.
als.fit(conf_mat.T.tocsr())


In [None]:
# Cell 5: Refactored sampled Precision@10 evaluation

import random
import time
from collections import defaultdict
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares

def build_ground_truth(test_df):
    """Collect, for every user in ``test_df``, the set of items they touched.

    ``test_df`` only needs to support ``test_df["user_idx"]`` and
    ``test_df["item_idx"]`` as two equally long iterables.

    Returns a ``defaultdict(set)`` mapping user index -> set of item indices.
    """
    truth = defaultdict(set)
    pairs = zip(test_df["user_idx"], test_df["item_idx"])
    for user, item in pairs:
        truth[user].add(item)
    return truth

def sample_users(gt, k=100, seed=42):
    """Draw a reproducible random sample of user indices from ``gt``.

    Parameters
    ----------
    gt : mapping
        Ground truth keyed by user index; only its keys are used.
    k : int
        Requested sample size. When ``gt`` holds fewer than ``k`` users,
        all of them are returned (in random order) instead of raising.
    seed : int
        Seed for Python's global ``random`` module. Note that this reseeds
        the module-level generator, which also affects later ``random.*``
        calls in the notebook.

    Returns
    -------
    list
        Up to ``k`` distinct user indices.
    """
    population = list(gt.keys())
    random.seed(seed)
    # random.sample raises ValueError when asked for more elements than
    # exist, so cap the request at the population size.
    return random.sample(population, min(k, len(population)))

def precision_at_k(model, train_conf_mat, gt, users, K=10):
    """Mean Precision@K of ``model`` over ``users``.

    Parameters
    ----------
    model
        A fitted implicit (>=0.5) recommender. It must have been fitted on
        ``train_conf_mat`` so that row ``u`` of the matrix and user factor
        ``u`` of the model describe the same user.
    train_conf_mat : scipy sparse matrix, shape (n_users, n_items)
        User×item training confidences. Used to filter out items a user has
        already interacted with.
    gt : mapping
        user index -> set of held-out item indices.
    users : sequence
        User indices to evaluate.
    K : int
        Number of recommendations requested per user.

    Returns
    -------
    float
        Average over ``users`` of (hits in the top K) / K; 0.0 when ``users``
        is empty.

    Raises
    ------
    ValueError
        If the model's user factors do not have one row per row of
        ``train_conf_mat`` (typically a users/items orientation mix-up).
    """
    if len(users) == 0:
        return 0.0

    user_items = train_conf_mat.tocsr()

    # Fail loudly on an orientation mismatch instead of returning a
    # meaningless score.
    user_factors = getattr(model, "user_factors", None)
    if user_factors is not None and user_factors.shape[0] != user_items.shape[0]:
        raise ValueError(
            "model has %d user factors but train_conf_mat has %d rows; "
            "fit the model on the same user×item matrix that is passed here"
            % (user_factors.shape[0], user_items.shape[0])
        )

    prec_sum = 0.0
    for u in users:
        true_items = gt.get(u, ())
        # In implicit>=0.5 `userid` selects the user's latent factors and
        # `user_items` must hold exactly that user's row; passing a constant
        # userid would score every user with the same factors.
        rec_idxs, _ = model.recommend(
            userid=int(u),
            user_items=user_items[int(u)],          # a 1×n_items CSR
            N=K,
            filter_already_liked_items=True
        )
        hits = {int(i) for i in rec_idxs}.intersection(true_items)
        prec_sum += len(hits) / K
    return prec_sum / len(users)

# Temporal split: the last HOLDOUT_DAYS days of the loaded data are the test set
HOLDOUT_DAYS = 7
cutoff   = df["event_time"].max() - pd.Timedelta(days=HOLDOUT_DAYS)
train_df = df[df["event_time"] <= cutoff]
test_df  = df[df["event_time"]  > cutoff]

# If the loaded rows cover less than HOLDOUT_DAYS days, every row lands in
# the test set and the model would be trained on nothing.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Temporal split with a {HOLDOUT_DAYS}-day hold-out produced "
        f"{len(train_df)} training rows and {len(test_df)} test rows; "
        "load a longer time span or shorten the hold-out window"
    )

# user×item orientation (rows = users): this is what fit() and recommend()
# expect in implicit>=0.5. Later cells that call model.fit(train_conf_mat)
# and precision_at_k(model, train_conf_mat, ...) reuse this matrix as is.
train_conf_mat = coo_matrix(
    (train_df["confidence"],
     (train_df["user_idx"], train_df["item_idx"])),
    shape=(n_users, n_items)
).tocsr()

als_train = AlternatingLeastSquares(
    factors=50, regularization=0.01, iterations=15, use_gpu=False
)
als_train.fit(train_conf_mat)

gt           = build_ground_truth(test_df)
# NOTE: this rebinds the name `sample_users` from the helper function to the
# sampled list; the grid-search cell reads the list under this name.
sample_users = sample_users(gt, k=min(100, len(gt)), seed=42)

print(f"Sampling {len(sample_users)} users for Precision@10…")
start = time.perf_counter()
mean_prec = precision_at_k(als_train, train_conf_mat, gt, sample_users, K=10)
elapsed   = time.perf_counter() - start
print(f"Mean Precision@10 over sample: {mean_prec:.4f}")
print(f"Elapsed time: {elapsed:.1f}s ({elapsed/len(sample_users):.3f}s/user)")


In [None]:
# Cell 6: Hyperparameter grid search using 100-user sample

from implicit.als import AlternatingLeastSquares

# Reuse: train_conf_mat, gt, sample_users, precision_at_k
# Ensure sample_users already holds your 100-user list

# Hyperparameter values to try; every (factors, regularization) pair is fitted
param_grid = {
    "factors": [32, 64, 128],
    "regularization": [0.001, 0.01, 0.1],
}

# Start below any attainable precision so that the first configuration is
# always recorded. Starting at 0.0 with a strict ">" would leave best_params
# as None whenever every configuration scores 0.0, and the summary below
# would then fail on best_params[0].
best_prec = float("-inf")
best_params = None

print("Starting grid search over",
      len(param_grid["factors"]) * len(param_grid["regularization"]), "configs")

for f in param_grid["factors"]:
    for reg in param_grid["regularization"]:
        print(f"\nTraining ALS with factors={f}, regularization={reg}…")
        model = AlternatingLeastSquares(
            factors=f,
            regularization=reg,
            iterations=15,
            use_gpu=False
        )
        # Fit on the same matrix that precision_at_k evaluates against
        model.fit(train_conf_mat)
        prec = precision_at_k(model, train_conf_mat, gt, sample_users, K=10)
        print(f"→ Precision@10 = {prec:.4f}")
        # Strict ">" keeps the earliest configuration on ties
        if prec > best_prec:
            best_prec = prec
            best_params = (f, reg)

print("\nBest config:")
print(f"  factors   = {best_params[0]}")
print(f"  regularization = {best_params[1]}")
print(f"  Precision@10   = {best_prec:.4f}")


In [None]:
# Cell 7 ALS on full data & single‐user demo

from implicit.als import AlternatingLeastSquares
import random

# 1) Build the full user×item CSR once (conf_mat itself is item×user)
full_user_items = conf_mat.T.tocsr()

# 2) Retrain best model on the FULL 10 M interactions.
#    fit() in implicit>=0.5 expects rows = users and columns = items, so the
#    model is fitted on the transposed matrix; fitting on the item×user
#    matrix would swap users and items and make recommend() return user
#    indices instead of item indices.
final_als = AlternatingLeastSquares(
    factors=32,        # best from your grid search
    regularization=0.001,
    iterations=15,
    use_gpu=False
)
final_als.fit(full_user_items)

# 3) Pick a random user.
#    User indices are the dense range 0..n_users-1, so drawing from that
#    range is equivalent to choosing among the distinct user_idx values
#    without scanning the whole frame.
orig_user = random.randrange(full_user_items.shape[0])
print(f"Random user for user_idx = {orig_user}")

# 4) Slice out that one row so recommend() sees a 1×n_items CSR
user_vector = full_user_items[orig_user]

# 5) Ask for recommendations for THIS user: `userid` selects the user's
#    latent factors, and `user_items` must be that same user's row (it is
#    used to filter out items the user already interacted with).
item_idxs, scores = final_als.recommend(
    userid=orig_user,
    user_items=user_vector,
    N=10,
    filter_already_liked_items=True
)

# 6) Print out product_ids and scores, casting idx to int
for idx, score in zip(item_idxs, scores):
    pid = inv_item_map.get(int(idx))
    if pid is None:
        # defensive: skip an index that has no product mapping
        continue
    print(f"  product_id={pid}  score={score:.3f}")
