<a href="https://colab.research.google.com/github/asalem2/E-Commerce-Product-Recommendation-System/blob/main/svd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install library and dependencies
!pip install scikit-learn pandas scipy --quiet

import pandas as pd
import numpy as np
import random
import time
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD

# 1) Load & filter the first 10 M rows
t0 = time.time()
df = pd.read_csv(
    "2020-Apr.csv.gz",
    compression="gzip",
    usecols=["user_id","product_id","event_type"],
    nrows=10_000_000,
    low_memory=False
)
df = df[df["event_type"].isin(["view","cart","purchase"])].reset_index(drop=True)
print(f"Load+filter: {time.time()-t0:.1f}s, rows={len(df):,}")

Load+filter: 29.1s, rows=10,000,000


In [2]:
# 2) Down-sample to 1/6 for speed
df = df.sample(frac=1/6, random_state=42).reset_index(drop=True)
print(f"Sample size (1/6): {len(df):,}")


Sample size (1/6): 1,666,667


In [3]:
# 3) Encode to zero-based uidx/iidx
df["uidx"], users = pd.factorize(df["user_id"])
df["iidx"], items = pd.factorize(df["product_id"])
n_users, n_items = len(users), len(items)
print(f"Users: {n_users:,}, Items: {n_items:,}")


Users: 561,283, Items: 103,861


In [4]:
# 4) Leave-one-out split
np.random.seed(42)
df["rand"]  = np.random.rand(len(df))
df["count"] = df.groupby("uidx")["uidx"].transform("count")
holdouts   = df[df["count"]>1].groupby("uidx")["rand"].idxmax()
train_df   = df.drop(index=holdouts).copy()
test_df    = df.loc[holdouts].copy()
print(f"Train rows: {len(train_df):,}, Test rows: {len(test_df):,}")


Train rows: 1,368,460, Test rows: 298,207


In [5]:
# 5) Build user×item sparse matrix
rows = train_df["uidx"].to_numpy()
cols = train_df["iidx"].to_numpy()
data = np.ones(len(train_df), dtype=np.float32)
ui_mat = coo_matrix((data, (rows, cols)), shape=(n_users, n_items)).tocsr()


In [6]:
# 6) Fit Truncated SVD
n_components = 32
svd = TruncatedSVD(n_components=n_components, random_state=42)
t0 = time.time()
user_factors = svd.fit_transform(ui_mat)  # shape (n_users, 32)
item_factors = svd.components_.T          # shape (n_items, 32)
print(f"SVD fit ({n_components} comps): {time.time()-t0:.1f}s")


SVD fit (32 comps): 36.6s


In [7]:
# 7) Intersection-based sampling for evaluation
train_users = set(train_df["uidx"])
test_users  = set(test_df["uidx"])
valid_users = list(train_users & test_users)
sample_users = random.sample(valid_users, min(1000, len(valid_users)))
print(f"Evaluating on {len(sample_users)} users")


Evaluating on 1000 users


In [8]:
# 8) Compute Precision@10 / Recall@10
K = 10
hits = 0
for u in sample_users:
    scores = user_factors[u].dot(item_factors.T)
    seen   = set(ui_mat[u].indices)
    scores[list(seen)] = -np.inf
    top_k  = np.argpartition(-scores, K)[:K]
    true_i = int(test_df.loc[test_df["uidx"]==u, "iidx"].iloc[0])
    hits  += int(true_i in top_k)

precision = hits / (K * len(sample_users))
recall    = hits / len(sample_users)
print(f"SVD Precision@{K}: {precision:.4f}")
print(f"SVD Recall@{K}:    {recall:.4f}")


SVD Precision@10: 0.0043
SVD Recall@10:    0.0430
