In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import SVD, SVDpp
from collections import defaultdict
import joblib

sns.set(style="whitegrid")

In [2]:
# Step 1: Load & Filter

processed_data_path = "../data/processed/ratings_with_detailed_users_and_books.csv"

if not os.path.exists(processed_data_path):
    raise FileNotFoundError(f"File not found: {processed_data_path}")

df = pd.read_csv(processed_data_path)
print(f"Loaded dataset. Shape: {df.shape}")

# Define thresholds: any user and item must have at least 5 ratings
# Users with fewer than 5 ratings provide very little information about their preferences, making them hard to model
# Similarly, items with fewer than 5 ratings are difficult to compare against others
min_ratings_user = 5
min_ratings_item = 5

user_counts = df.groupby('user_id')['isbn'].count()
item_counts = df.groupby('isbn')['user_id'].count()

valid_users = user_counts[user_counts >= min_ratings_user].index
valid_items = item_counts[item_counts >= min_ratings_item].index

df_filtered = df[df['user_id'].isin(valid_users) & df['isbn'].isin(valid_items)]
print(f"After filtering: {df_filtered.shape}")

df_filtered.head()

Loaded dataset. Shape: (383842, 14)
After filtering: (145890, 14)


Unnamed: 0,user_id,isbn,book_rating,user_age,user_city,user_state,user_country,book_title,book_author,book_year_of_publication,book_publisher,book_image_url_s,book_image_url_m,book_image_url_l
4,276747,60517794,9,25,IOWA CITY,IOWA,USA,Little Altars Everywhere,Rebecca Wells,2003.0,HarperTorch,http://images.amazon.com/images/P/0060517794.0...,http://images.amazon.com/images/P/0060517794.0...,http://images.amazon.com/images/P/0060517794.0...
5,276747,671537458,9,25,IOWA CITY,IOWA,USA,Waiting to Exhale,Terry McMillan,1995.0,Pocket,http://images.amazon.com/images/P/0671537458.0...,http://images.amazon.com/images/P/0671537458.0...,http://images.amazon.com/images/P/0671537458.0...
6,276747,679776818,8,25,IOWA CITY,IOWA,USA,Birdsong: A Novel of Love and War,Sebastian Faulks,1997.0,Vintage Books USA,http://images.amazon.com/images/P/0679776818.0...,http://images.amazon.com/images/P/0679776818.0...,http://images.amazon.com/images/P/0679776818.0...
33,276813,8426449476,8,29,SITGES,BARCELONA,SPAIN,El Diaro De Bridget Jones,Helen Fielding,1996.0,Lumen Espana,http://images.amazon.com/images/P/8426449476.0...,http://images.amazon.com/images/P/8426449476.0...,http://images.amazon.com/images/P/8426449476.0...
39,276822,60096195,10,11,CALGARY,ALBERTA,CANADA,The Boy Next Door,Meggin Cabot,2002.0,Avon Trade,http://images.amazon.com/images/P/0060096195.0...,http://images.amazon.com/images/P/0060096195.0...,http://images.amazon.com/images/P/0060096195.0...


In [3]:
# Step 2: Convert to Surprise

reader = Reader(rating_scale=(1, 10)) # Tells Surprise that our ratings range from 1 (lowest) to 10 (highest)
df_for_surprise = df_filtered[['user_id', 'isbn', 'book_rating']].copy()
df_for_surprise.columns = ['userID', 'itemID', 'rating']

data = Dataset.load_from_df(df_for_surprise, reader)
print("Data ready for Surprise.")

Data ready for Surprise.


- Surprise is a Python toolkit for building and evaluating recommendation algorithms using (User, Item, Rating) data.
- Surprise uses its own trainset/testset format under the hood. We can’t just pass a DataFrame directly to Surprise, we have to do this conversion.
- By specifying the rating_scale, the library knows the range of valid ratings. It helps Surprise interpret how far off a prediction is.

In [4]:
# Step 3: Baseline SVD

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print(f"Trainset users: {trainset.n_users}, items: {trainset.n_items}, ratings: {trainset.n_ratings}")
print(f"Testset size: {len(testset)}")

svd_baseline = SVD(n_factors=50, random_state=42)
svd_baseline.fit(trainset)

predictions = svd_baseline.test(testset)
rmse_baseline = accuracy.rmse(predictions, verbose=True)
print(f"Baseline SVD RMSE: {rmse_baseline:.4f}")

Trainset users: 12097, items: 13724, ratings: 116712
Testset size: 29178
RMSE: 1.5781
Baseline SVD RMSE: 1.5781


- **SVD (Singular Value Decomposition) is** a mathematical method that can uncovers hidden patterns in user–book rating data
- **The reason we use SVD is** that it often yields better predictions with minimal inputs: just user, item, and rating
- **We define n_factors=50** to let the model capture 50 different "preference dimensions" for users and books
- **From the result, we get RMSE ≈ 1.57**, meaning the model is off by about 1.57 points on a 1–10 scale. Example: if a true rating is 8, it might guess around 6.4 or 9.6

In [5]:
# Step 4: Cross-Validation (Baseline)

cv_results = cross_validate(svd_baseline, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
avg_rmse = np.mean(cv_results['test_rmse'])
avg_mae = np.mean(cv_results['test_mae'])
print(f"3-Fold CV: RMSE={avg_rmse:.4f}, MAE={avg_mae:.4f}")

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.5738  1.5884  1.5958  1.5860  0.0091  
MAE (testset)     1.2165  1.2258  1.2291  1.2238  0.0053  
Fit time          0.90    0.88    1.00    0.92    0.05    
Test time         0.19    0.44    0.35    0.33    0.10    
3-Fold CV: RMSE=1.5860, MAE=1.2238


- We are doing a **Cross-validation** to split our data into multiple "folds" and repeatedly train/test the model on different splits.  
- **The reason we do cross-validation** is to get a more reliable measure of performance than a single train–test split can provide.  
- **We use 3 folds** so each fold acts as its own test set once, then we average results.  
- **From the output**, we see an **RMSE of about 1.5872** and **MAE of about 1.2233**, indicating how far our predictions deviate from real ratings on average (RMSE) and how large the typical absolute error is (MAE).

In [6]:
# Step 5: Hyperparameter Tuning (SVD)

param_grid_svd = {
    'n_factors': [10, 20, 50],
    'reg_all': [0.02, 0.1],
    'lr_all': [0.005, 0.01],
    'n_epochs': [10, 20] 
}

gs_svd = GridSearchCV(
    SVD,
    param_grid_svd,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=2
)

print("GridSearch for SVD (moderate grid)...")
gs_svd.fit(data)
print("SVD GridSearch done.\n")

print("Best RMSE (SVD):", gs_svd.best_score['rmse'])
print("Best Params (SVD):", gs_svd.best_params['rmse'])

GridSearch for SVD (moderate grid)...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 22 concurrent workers.
[Parallel(n_jobs=-1)]: Done  66 out of  72 | elapsed:   12.7s remaining:    1.1s


SVD GridSearch done.

Best RMSE (SVD): 1.5777325908336586
Best Params (SVD): {'n_factors': 10, 'reg_all': 0.1, 'lr_all': 0.01, 'n_epochs': 20}


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   13.7s finished


- We are doing **Hyperparameter Tuning for SVD** to trying different values (like `n_factors` or `n_epochs`) to find which combination yields the best performance.  
- **The reason we define a parameter grid** is so `GridSearchCV` can systematically test multiple settings such as `n_factors=[10, 20, 50]`, then tell us which ones minimize error.  
- **We found the best parameters** to be `'n_factors': 10, 'reg_all': 0.1, 'lr_all': 0.01, 'n_epochs': 20`.  
- **From the result, we get a best RMSE = 1.5789**, meaning on average our SVD model (with those parameters) is off by about 1.58 points on a 1–10 rating scale.

In [7]:
# Step 6: Train & Evaluate Best SVD

best_params_svd = gs_svd.best_params['rmse']
svd_best = SVD(
    n_factors=best_params_svd['n_factors'],
    reg_all=best_params_svd['reg_all'],
    lr_all=best_params_svd['lr_all'],
    n_epochs=best_params_svd['n_epochs'],
    random_state=42
)

# Retrain with best hyperparams
trainset_best_svd, testset_best_svd = train_test_split(data, test_size=0.2, random_state=42)
svd_best.fit(trainset_best_svd)

preds_best_svd = svd_best.test(testset_best_svd)
rmse_best_svd = accuracy.rmse(preds_best_svd, verbose=True)
print(f"SVD(Best) RMSE: {rmse_best_svd:.4f}")

RMSE: 1.5712
SVD(Best) RMSE: 1.5712


- Use the best parameters found in the previous SVD grid search and retrain to confirm how well those parameters work when fully trained on the new split.  
- **From the result, RMSE ≈ 1.5712**, which means it's off by about 1.57 points on a 1–10 rating scale, slightly better than the baseline.

In [8]:
# Step 7: Hyperparameter Tuning (SVD++)

param_grid_svdpp = {
    'n_factors': [10, 20],
    'reg_all': [0.02, 0.1],
    'lr_all': [0.005],
    'n_epochs': [10, 20]
}

gs_svdpp = GridSearchCV(
    SVDpp,
    param_grid_svdpp,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=2
)

print("GridSearch for SVD++ (small grid)...")
gs_svdpp.fit(data)
print("SVD++ GridSearch done.\n")

print("Best RMSE (SVD++):", gs_svdpp.best_score['rmse'])
print("Best Params (SVD++):", gs_svdpp.best_params['rmse'])

GridSearch for SVD++ (small grid)...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 22 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  24 | elapsed:   34.3s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:   48.7s remaining:    9.7s


SVD++ GridSearch done.

Best RMSE (SVD++): 1.5815827521457855
Best Params (SVD++): {'n_factors': 10, 'reg_all': 0.1, 'lr_all': 0.005, 'n_epochs': 20}


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   56.2s finished


- **SVD++ is** an advanced version of SVD that also leverages implicit feedback such as which books a user interacted with, even without explicit ratings
- **We use a smaller parameter grid** than for regular SVD, since SVD++ can be more computationally heavy.  
- **The best RMSE found** is about **1.5826**, with parameters `{'n_factors': 10, 'reg_all': 0.1, 'lr_all': 0.005, 'n_epochs': 20}`.  
- **From this result**, SVD++ is slightly above our best SVD model's RMSE

In [9]:
# Step 8: Train & Evaluate Best SVD++

best_params_svdpp = gs_svdpp.best_params['rmse']
svdpp_best = SVDpp(
    n_factors=best_params_svdpp['n_factors'],
    reg_all=best_params_svdpp['reg_all'],
    lr_all=best_params_svdpp['lr_all'],
    n_epochs=best_params_svdpp['n_epochs'],
    random_state=42
)

# Retrain with best hyperparams
trainset_best_svdpp, testset_best_svdpp = train_test_split(data, test_size=0.2, random_state=42)
svdpp_best.fit(trainset_best_svdpp)

preds_best_svdpp = svdpp_best.test(testset_best_svdpp)
rmse_best_svdpp = accuracy.rmse(preds_best_svdpp, verbose=True)
print(f"SVD++(Best) RMSE: {rmse_best_svdpp:.4f}")

RMSE: 1.5693
SVD++(Best) RMSE: 1.5693


- Use the best parameters found in the previous SVD++ grid search and retrain to confirm how well those parameters work when fully trained on the new split.  
- **From the result, RMSE ≈ 1.5693**, which means it's off by about 1.57 points on a 1–10 rating scale, slightly better than the baseline.

In [10]:
# Step 9: Compare Models (Ranking)

def precision_recall_at_k(predictions, k=5, threshold=7):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = ratings[:k]
        n_rel = sum(true_r >= threshold for (_, true_r) in ratings)
        n_rec_k = sum(true_r >= threshold for (_, true_r) in top_k)

        precisions[uid] = n_rec_k / k
        recalls[uid] = n_rec_k / n_rel if n_rel else 0

    avg_prec = sum(precisions.values()) / len(precisions)
    avg_rec = sum(recalls.values()) / len(recalls)
    return avg_prec, avg_rec

prec_svd, rec_svd = precision_recall_at_k(preds_best_svd, k=5, threshold=7)
prec_svdpp, rec_svdpp = precision_recall_at_k(preds_best_svdpp, k=5, threshold=7)

print("----- Model Comparison -----")
print(f"SVD  RMSE: {rmse_best_svd:.4f}, Precision@5: {prec_svd:.3f}, Recall@5: {rec_svd:.3f}")
print(f"SVD++ RMSE: {rmse_best_svdpp:.4f}, Precision@5: {prec_svdpp:.3f}, Recall@5: {rec_svdpp:.3f}")


----- Model Comparison -----
SVD  RMSE: 1.5712, Precision@5: 0.377, Recall@5: 0.815
SVD++ RMSE: 1.5693, Precision@5: 0.378, Recall@5: 0.816


- **We define relevant items as those with a true rating ≥ 7.** This lets us count how many "good" items show up in the top 5 recommendations.

- **Precision@5** measures the fraction of those 5 recommended items that are actually relevant. For example, if 2 of the recommended books had a rating ≥7, then Precision@5 = 2/5 = 0.4.

- **Recall@5** measures how many of all the user's relevant items we actually caught in our top 5. If a user has 5 relevant books in total, and our top-5 list contains 4 of them, Recall@5 = 4/5 = 0.8.

- **SVD** gives RMSE=1.5712, Precision@5=0.377, and Recall@5=0.815. That means:  
  - On a 1–10 scale, we're off by about 1.57 points in rating predictions, on average.  
  - Out of the top 5 items recommended, 37.7% are truly relevant (≥7 rating), covering 81.5% of all relevant items.

- **SVD++** is slightly better, with RMSE=1.5693, Precision@5=0.378, and Recall@5=0.816, showing marginally higher rating accuracy and capturing a slightly higher fraction of relevant items in its top 5.

In [11]:
# Step 10: Recommendation & Save Model

def recommend_books_for_user(
    model,
    df_full,
    user_id,
    top_n=5,
    exclude_rated=False
):
    """
    Recommend top_n books for a given user using a trained Surprise model.
    If the user is unknown, do a fallback based on global popularity.

    :param model:       Trained Surprise model.
    :param df_full:     The Pandas DataFrame with columns [user_id, isbn, book_rating, book_title...].
    :param user_id:     The user to recommend items for.
    :param top_n:       Number of recommendations to return.
    :param exclude_rated: If True, excludes items the user has already rated.
    :return: A list of tuples: (isbn, predicted_score, title).
    """
    try:
        model.trainset.to_inner_uid(str(user_id))

        all_isbns = df_full['isbn'].unique()

        if exclude_rated:
            already_rated = df_full[df_full['user_id'] == user_id]['isbn'].unique()
            candidate_isbns = [isbn for isbn in all_isbns if isbn not in already_rated]
        else:
            candidate_isbns = all_isbns

        # 2) Predict for candidate items only
        predictions = []
        for isbn in candidate_isbns:
            pred = model.predict(str(user_id), str(isbn), verbose=False)
            predictions.append((isbn, pred.est))

        # 3) Sort descending by predicted rating
        predictions.sort(key=lambda x: x[1], reverse=True)
        top_preds = predictions[:top_n]

        # 4) Map ISBN -> Title
        isbn_to_title = df_full.drop_duplicates('isbn').set_index('isbn')['book_title'].to_dict()
        results = [(isbn, score, isbn_to_title.get(isbn, "Unknown Title")) for isbn, score in top_preds]
        return results

    except ValueError:
        top_global = (
            df_full.groupby('isbn')['book_rating']
            .mean()
            .sort_values(ascending=False)
            .head(top_n)
            .index.tolist()
        )
        isbn_to_title = df_full.drop_duplicates('isbn').set_index('isbn')['book_title'].to_dict()
        return [(isbn, 0.0, isbn_to_title.get(isbn, "Unknown Title")) for isbn in top_global]

# Choose best model from ranking or RMSE
model_choice = svd_best  # or svdpp_best

user_example = df_filtered['user_id'].iloc[0]
top_n = 5
top_recs = recommend_books_for_user(model_choice, df_filtered, user_example, top_n=top_n)

print(f"Top {top_n} for user {user_example} with chosen model:")
for isbn, score, title in top_recs:
    print(f"  ISBN: {isbn}, Score={score:.2f}, Title={title}")

model_path = "../models/recommender_model.joblib"
os.makedirs("../models", exist_ok=True)
joblib.dump(model_choice, model_path)
print(f"Final model saved to {model_path}")

Top 5 for user 276747 with chosen model:
  ISBN: 0375502971, Score=0.00, Title=A Dog Year: Twelve Months, Four Dogs, and Me
  ISBN: 3257204981, Score=0.00, Title=Der Vater Eines Morders
  ISBN: 089471838X, Score=0.00, Title=Natural California: A Postcard Book
  ISBN: 0689851324, Score=0.00, Title=Homecoming
  ISBN: 0312253737, Score=0.00, Title=The Basic Eight
Final model saved to ../models/recommender_model.joblib


- Function **`recommend_books_for_user`** takes a trained model and a user ID, then returns top-N recommended books.  
- **2 scenarios in the function**: if the user is known, predict ratings for all items. If the user is unknown, fallback to globally popular books.  
- **Why exclude_rated?** Sometimes we want to skip items the user already rated (in production mode), so they only see new suggestions.  
- **In this example**, we pick `svd_best` as our final model and call `recommend_books_for_user` with `top_n=5`.  
- **From the result**, the model recommends five ISBNs with a score of 0.0, which suggests it had limited info on that user–item combination (or the model didn't predict a higher rating). In a real scenario, we might see non-zero scores if there's more overlap in the training data.  
- Model saved to `../models/recommender_model.joblib`