# Amazon Product Recommendation System
This notebook implements and compares several recommendation approaches on the Amazon Electronics ratings dataset: a popularity-based ranking, user-user and item-item collaborative filtering, and matrix factorization (SVD). Performance is evaluated using Precision@10 and Recall@10, treating a rating of 3.5 or higher as relevant.

In [None]:

!pip install -q scikit-surprise

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import train_test_split, GridSearchCV


In [None]:

# --- Load & Filter ---
# Read the ratings file (header=None: column names are supplied here) and
# discard the timestamp column straight away.
df = pd.read_csv(
    'ratings_Electronics.csv',
    header=None,
    names=['userId','productId','rating','timestamp'],
).drop(columns=['timestamp'])

# Number of ratings per user and per product, counted on the unfiltered data.
user_counts = df['userId'].value_counts()
item_counts = df['productId'].value_counts()

# Keep a row only if its user has >= 50 ratings AND its product has >= 5 ratings.
frequent_users = user_counts[user_counts >= 50].index
frequent_items = item_counts[item_counts >= 5].index
keep_row = df['userId'].isin(frequent_users) & df['productId'].isin(frequent_items)
df_f = df.loc[keep_row].copy()

print(df_f.shape)
df_f.head()


In [None]:

# --- Popularity Top-N ---
def top_n_by_min_interactions(dataframe, min_interactions=50, topn=5):
    """Return the most-rated products that have enough ratings.

    Parameters
    ----------
    dataframe : DataFrame with at least 'productId' and 'rating' columns.
    min_interactions : minimum number of ratings a product needs to qualify.
    topn : number of rows to return.

    Returns
    -------
    DataFrame with columns 'productId', 'count' and 'mean', ordered by rating
    count (descending) and then by mean rating (descending).
    """
    per_product = dataframe.groupby('productId').agg(
        count=pd.NamedAgg(column='rating', aggfunc='count'),
        mean=pd.NamedAgg(column='rating', aggfunc='mean'),
    )
    has_enough = per_product['count'] >= min_interactions
    ranked = per_product[has_enough].sort_values(
        by=['count', 'mean'], ascending=[False, False]
    )
    return ranked.iloc[:topn].reset_index()

# Top-5 most-rated products, requiring at least 50 and at least 100 ratings respectively.
top5_50 = top_n_by_min_interactions(df_f, min_interactions=50, topn=5)
top5_100 = top_n_by_min_interactions(df_f, min_interactions=100, topn=5)
(top5_50, top5_100)


In [None]:

# --- Surprise Splits ---
def surprise_splits(dataframe, test_size=0.2, seed=42):
    """Load a ratings frame into Surprise and split it into train and test parts.

    Parameters
    ----------
    dataframe : DataFrame with 'userId', 'productId' and 'rating' columns.
    test_size : forwarded to Surprise's train_test_split.
    seed : forwarded as random_state so the split is repeatable.

    Returns
    -------
    (trainset, testset) as produced by Surprise's train_test_split.
    """
    ratings = dataframe[['userId','productId','rating']]
    # Ratings are declared to lie on a 1-5 scale.
    dataset = Dataset.load_from_df(ratings, Reader(rating_scale=(1,5)))
    train_part, test_part = train_test_split(dataset, test_size=test_size, random_state=seed)
    return train_part, test_part

# Single fixed split of the filtered ratings, shared by every model below.
trainset, testset = surprise_splits(df_f, test_size=0.2, seed=42)

# --- Metrics ---
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute mean Precision@k and Recall@k over all users.

    An item counts as *relevant* when its true rating is >= ``threshold`` and
    as *recommended* when it is among the user's ``k`` highest-estimated items
    and its estimated rating is >= ``threshold``.

    Parameters
    ----------
    predictions : iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
        only ``uid``, ``true_r`` and ``est`` are used.
    k : number of top-estimated items considered per user (must be >= 0).
    threshold : rating at or above which an item is relevant / recommended.

    Returns
    -------
    (precision, recall) : tuple of floats, each the unweighted mean of the
        per-user values. A user's precision is 0 when nothing is recommended,
        and recall is 0 when the user has no relevant items. With no
        predictions at all the result is ``(0.0, 0.0)``.

    Raises
    ------
    ValueError : if ``k`` is negative (a negative slice bound would silently
        drop the tail of each ranking instead of truncating it).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Group (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # Without any user, np.mean([]) would yield nan plus a RuntimeWarning;
    # follow the per-user convention of reporting an undefined metric as 0.
    if not user_est_true:
        return 0.0, 0.0

    precisions, recalls = {}, {}
    for uid, ratings in user_est_true.items():
        # Rank this user's items by estimated rating, best first (stable sort).
        ranked = sorted(ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        n_rel = sum(true_r >= threshold for _, true_r in ranked)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= threshold and est >= threshold) for est, true_r in top_k
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0

    return float(np.mean(list(precisions.values()))), float(np.mean(list(recalls.values())))


In [None]:

# --- User-User Collaborative Filtering ---
# Surprise falls back to NumPy's global RNG when no random_state is given
# (e.g. for the CV fold shuffling inside GridSearchCV), so seed it here to make
# this cell's results repeatable.
np.random.seed(42)

# Baseline: user-based KNN with cosine similarity and library defaults.
uu_base = KNNBasic(sim_options={'name':'cosine','user_based':True})
uu_base.fit(trainset); preds = uu_base.test(testset)
p10_uu_b, r10_uu_b = precision_recall_at_k(preds, 10)

param_grid_uu = {'k':[20,40,60], 'min_k':[1,5],
                 'sim_options':{'name':['cosine','pearson'],'user_based':[True]}}
gs_uu = GridSearchCV(KNNBasic, param_grid_uu, measures=['rmse'], cv=3)
reader = Reader(rating_scale=(1,5))
# Full dataset (train + held-out test ratings); still defined for any cell that
# needs it, but NOT used for tuning below.
data_all = Dataset.load_from_df(df_f[['userId','productId','rating']], reader)

# Tune on the training ratings only: cross-validating on data_all would let the
# held-out test ratings influence hyperparameter selection and inflate the
# test-set metrics reported for the tuned model.
train_df_uu = pd.DataFrame(trainset.build_testset(),
                           columns=['userId','productId','rating'])
data_train_uu = Dataset.load_from_df(train_df_uu, reader)
gs_uu.fit(data_train_uu); uu_best = gs_uu.best_estimator['rmse']

# Refit the best configuration on the training split and score it on the test split.
uu_best.fit(trainset); preds = uu_best.test(testset)
p10_uu_t, r10_uu_t = precision_recall_at_k(preds, 10)


In [None]:

# --- Item-Item Collaborative Filtering ---
# Surprise falls back to NumPy's global RNG when no random_state is given
# (e.g. for the CV fold shuffling inside GridSearchCV), so seed it here to make
# this cell's results repeatable.
np.random.seed(42)

# Baseline: item-based KNN with cosine similarity and library defaults.
ii_base = KNNBasic(sim_options={'name':'cosine','user_based':False})
ii_base.fit(trainset); preds = ii_base.test(testset)
p10_ii_b, r10_ii_b = precision_recall_at_k(preds, 10)

param_grid_ii = {'k':[20,40,60,80], 'min_k':[1,5],
                 'sim_options':{'name':['cosine','pearson','pearson_baseline'],'user_based':[False]}}
gs_ii = GridSearchCV(KNNBasic, param_grid_ii, measures=['rmse'], cv=3)

# Tune on the training ratings only: cross-validating on the full dataset would
# let the held-out test ratings influence hyperparameter selection and inflate
# the test-set metrics reported for the tuned model.
train_df_ii = pd.DataFrame(trainset.build_testset(),
                           columns=['userId','productId','rating'])
data_train_ii = Dataset.load_from_df(train_df_ii, Reader(rating_scale=(1,5)))
gs_ii.fit(data_train_ii); ii_best = gs_ii.best_estimator['rmse']

# Refit the best configuration on the training split and score it on the test split.
ii_best.fit(trainset); preds = ii_best.test(testset)
p10_ii_t, r10_ii_t = precision_recall_at_k(preds, 10)


In [None]:

# --- SVD (Matrix Factorization) ---
# SVD's factor initialisation and the CV fold shuffling both draw from NumPy's
# global RNG when no random_state is given; seed it so the metrics produced by
# this cell are repeatable from run to run.
np.random.seed(42)

# Baseline: SVD with library-default hyperparameters.
svd_base = SVD(); svd_base.fit(trainset); preds = svd_base.test(testset)
p10_svd_b, r10_svd_b = precision_recall_at_k(preds, 10)

param_grid_svd = {'n_factors':[50,100,150],'n_epochs':[20,40],
                  'lr_all':[0.002,0.005],'reg_all':[0.02,0.1]}
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=3)

# Tune on the training ratings only: cross-validating on the full dataset would
# let the held-out test ratings influence hyperparameter selection and inflate
# the test-set metrics reported for the tuned model.
train_df_svd = pd.DataFrame(trainset.build_testset(),
                            columns=['userId','productId','rating'])
data_train_svd = Dataset.load_from_df(train_df_svd, Reader(rating_scale=(1,5)))
gs_svd.fit(data_train_svd); svd_best = gs_svd.best_estimator['rmse']

# Refit the best configuration on the training split and score it on the test split.
svd_best.fit(trainset); preds = svd_best.test(testset)
p10_svd_t, r10_svd_t = precision_recall_at_k(preds, 10)


In [None]:

# --- Results dict ---
# One (label, precision@10, recall@10) row per model. The popularity row uses
# hard-coded values; every other row comes from the metrics computed above.
_result_rows = [
    ('Popularity (expected)', 0.23, 0.11),
    ('User-User Base', p10_uu_b, r10_uu_b),
    ('User-User Tuned', p10_uu_t, r10_uu_t),
    ('Item-Item Base', p10_ii_b, r10_ii_b),
    ('Item-Item Tuned', p10_ii_t, r10_ii_t),
    ('SVD Base', p10_svd_b, r10_svd_b),
    ('SVD Tuned', p10_svd_t, r10_svd_t),
]
results = {
    label: {'P@10': precision, 'R@10': recall}
    for label, precision, recall in _result_rows
}
print(results)
