In [2]:
import sys
!{sys.executable} -m pip install requests



In [3]:
!mkdir -p /src
!mkdir -p /data/raw
!mkdir -p /data/processed
!mkdir -p /models/processed

In [4]:
"""
Data Preprocessing Pipeline
DSC256 Assignment 2
"""

import sys
sys.path.append('../src')

from data_loader import (
    download_amazon_reviews,
    load_reviews_from_parquet,
    preprocess_reviews,
    create_user_item_mappings,
    create_train_test_split,
    save_processed_data,
    get_dataset_statistics
)

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("Data prepocessing pipeline")
print("="*70)

Data prepocessing pipeline


In [18]:


import pandas as pd
import json
import os
from tqdm import tqdm


# Path to manually downloaded file
jsonl_file = '/data/raw/All_Beauty.jsonl'

if not os.path.exists(jsonl_file):
    print(f"\n File not found: {jsonl_file}")
    print("\nPlease download the file manually:")
    print("1. Go to: https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main/raw/review_categories")
    print("2. Click on 'All_Beauty.jsonl'")
    print("3. Download it")
    print(f"4. Save it as: {jsonl_file}")
    raise FileNotFoundError(f"Please download the file to {jsonl_file}")

print(f"\n Loading data from: {jsonl_file}")
print(f"File size: {os.path.getsize(jsonl_file) / 1024 / 1024:.2f} MB")

# Configuration
USE_SAMPLE = False
MAX_REVIEWS = 1000000

# Load JSONL file line by line
reviews = []

with open(jsonl_file, 'r', encoding='utf-8') as f:
    if USE_SAMPLE:
        print(f"  Sample mode: loading first {MAX_REVIEWS:,} reviews")
        pbar = tqdm(total=MAX_REVIEWS, desc="Loading reviews")

        for i, line in enumerate(f):
            if i >= MAX_REVIEWS:
                break
            try:
                review = json.loads(line)
                reviews.append(review)
                pbar.update(1)
            except json.JSONDecodeError as e:
                # print(e)
                continue

        pbar.close()
    else:
        print("Loading all reviews (this may take a while)...")
        for line in tqdm(f, desc="Loading reviews"):
            try:
                review = json.loads(line)
                reviews.append(review)
            except json.JSONDecodeError:
                continue

# Convert to DataFrame
df_raw = pd.DataFrame(reviews)

print(f"\n Loaded {len(df_raw):,} reviews")

# Save as standardized parquet file
raw_file = '../data/raw/All_Beauty_reviews.parquet'
df_raw.to_parquet(raw_file, index=False)

print(f" Saved to: {raw_file}")

print("\n" + "="*70)
print("DATA PREVIEW")
print("="*70)
print(df_raw.head())

print("\n" + "="*70)
print("COLUMN NAMES")
print("="*70)
print(df_raw.columns.tolist())

print("\n" + "="*70)
print("DATA TYPES")
print("="*70)
print(df_raw.dtypes)

print("\n" + "="*70)
print(" Data loaded successfully! Ready for preprocessing.")
print("="*70)


 Loading data from: /data/raw/All_Beauty.jsonl
File size: 257.00 MB
Loading all reviews (this may take a while)...


Loading reviews: 606279it [00:10, 58419.97it/s]



 Loaded 606,278 reviews
 Saved to: ../data/raw/All_Beauty_reviews.parquet

DATA PREVIEW
   rating                                      title  \
0     5.0  Such a lovely scent but not overpowering.   
1     4.0     Works great but smells a little weird.   
2     5.0                                       Yes!   
3     1.0                          Synthetic feeling   
4     5.0                                         A+   

                                                text images        asin  \
0  This spray is really nice. It smells really go...     []  B00YQ6X8EO   
1  This product does what I need it to do, I just...     []  B081TJ8YS3   
2                          Smells good, feels great!     []  B07PNNCSP9   
3                                     Felt synthetic     []  B09JS339BZ   
4                                            Love it     []  B08BZ63GMJ   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B00YQ6X8EO  AGKHLEW2SOWHNMFQIJGBECAF7INQ  15

In [19]:
df_raw = load_reviews_from_parquet(raw_file)

# Show first few rows
df_raw.head()

Loading reviews from: ../data/raw/All_Beauty_reviews.parquet
Loaded 606,278 reviews


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True


In [20]:
df_clean = preprocess_reviews(
    df_raw,
    min_user_reviews=2,
    min_item_reviews=2
)



Processing data

Original dataset: 606,278 reviews
After removing invalid ratings (0 or >5): 606,278 reviews (100.0%)

Applying k-core filtering (min_user=2, min_item=2)...
  Iteration 1: 100,029 reviews (removed 506,249)
  Iteration 2: 79,554 reviews (removed 20,475)
  Iteration 3: 70,064 reviews (removed 9,490)
  Iteration 4: 67,837 reviews (removed 2,227)
  Iteration 5: 66,461 reviews (removed 1,376)
  Iteration 6: 66,087 reviews (removed 374)
  Iteration 7: 65,844 reviews (removed 243)
  Iteration 8: 65,770 reviews (removed 74)
  Iteration 9: 65,726 reviews (removed 44)
  Iteration 10: 65,708 reviews (removed 18)
  Iteration 11: 65,696 reviews (removed 12)
  Iteration 12: 65,690 reviews (removed 6)
  Iteration 13: 65,686 reviews (removed 4)
  Iteration 14: 65,683 reviews (removed 3)
  Iteration 15: 65,682 reviews (removed 1)
  Iteration 16: 65,681 reviews (removed 1)
  Iteration 17: 65,680 reviews (removed 1)
  Iteration 18: 65,680 reviews (removed 0)

After k-core: 65,680 reviews

In [21]:

user_to_idx, item_to_idx = create_user_item_mappings(df_clean)



CREATING ID MAPPINGS

Created mappings:
  Users: 26,269
  Items: 13,738


In [22]:

train_df, test_df = create_train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42
)



CREATING TRAIN/TEST SPLIT

Split: 80% train / 20% test
  Train: 52,544 reviews
  Test:  13,136 reviews


In [23]:
# Save all processed data
save_processed_data(
    train_df,
    test_df,
    user_to_idx,
    item_to_idx,
    output_dir='/data/processed'
)



SAVING PROCESSED DATA

Saving to: /data/processed
Saved files:
  - train.csv & train.parquet
  - test.csv & test.parquet
  - user_to_idx.pkl (26,269 users)
  - item_to_idx.pkl (13,738 items)
  - metadata.pkl


In [24]:
# Generate comprehensive statistics
stats = get_dataset_statistics(train_df, test_df, user_to_idx, item_to_idx)

print("\n" + "="*70)
print("DATASET STATISTICS SUMMARY")
print("="*70)

print(f"\n Dataset Overview:")
print(f"  Total reviews: {stats['n_reviews_total']:,}")
print(f"  Train reviews: {stats['n_reviews_train']:,}")
print(f"  Test reviews:  {stats['n_reviews_test']:,}")
print(f"  Unique users:  {stats['n_users']:,}")
print(f"  Unique items:  {stats['n_items']:,}")

print(f"\n Sparsity:")
print(f"  Density:  {stats['density']*100:.6f}%")
print(f"  Sparsity: {stats['sparsity']*100:.4f}%")

print(f"\n Rating Statistics:")
print(f"  Mean: {stats['rating_mean']:.3f}")
print(f"  Std:  {stats['rating_std']:.3f}")
print(f"  Range: [{stats['rating_min']:.1f}, {stats['rating_max']:.1f}]")

print(f"\n User Activity:")
print(f"  Mean reviews per user:   {stats['reviews_per_user_mean']:.2f}")
print(f"  Median reviews per user: {stats['reviews_per_user_median']:.0f}")
print(f"  Max reviews by one user: {stats['reviews_per_user_max']:,}")

print(f"\n Item Popularity:")
print(f"  Mean reviews per item:   {stats['reviews_per_item_mean']:.2f}")
print(f"  Median reviews per item: {stats['reviews_per_item_median']:.0f}")
print(f"  Max reviews for one item: {stats['reviews_per_item_max']:,}")

print(f"\n Rating Distribution:")
for rating, count in sorted(stats['rating_distribution'].items()):
    pct = count / stats['n_reviews_total'] * 100
    print(f"  {rating} stars: {count:,} ({pct:.1f}%)")

print("\n" + "="*70)
print("✅ PREPROCESSING COMPLETE!")
print("="*70)
print("from src.data_loader import load_splits")
print("train, test, user_map, item_map = load_splits()")


DATASET STATISTICS SUMMARY

 Dataset Overview:
  Total reviews: 65,680
  Train reviews: 52,544
  Test reviews:  13,136
  Unique users:  26,269
  Unique items:  13,738

 Sparsity:
  Density:  0.018200%
  Sparsity: 99.9818%

 Rating Statistics:
  Mean: 4.142
  Std:  1.314
  Range: [1.0, 5.0]

 User Activity:
  Mean reviews per user:   2.50
  Median reviews per user: 2
  Max reviews by one user: 146

 Item Popularity:
  Mean reviews per item:   4.78
  Median reviews per item: 3
  Max reviews for one item: 199

 Rating Distribution:
  1.0 stars: 6,047 (9.2%)
  2.0 stars: 3,497 (5.3%)
  3.0 stars: 5,896 (9.0%)
  4.0 stars: 9,863 (15.0%)
  5.0 stars: 40,377 (61.5%)

✅ PREPROCESSING COMPLETE!
from src.data_loader import load_splits
train, test, user_map, item_map = load_splits()


In [25]:
# Verify files were created
import os

processed_dir = '../data/processed'

files_to_check = [
    'train.csv',
    'train.parquet',
    'test.csv',
    'test.parquet',
    'user_to_idx.pkl',
    'item_to_idx.pkl',
    'metadata.pkl'
]

all_good = True
for filename in files_to_check:
    filepath = os.path.join(processed_dir, filename)
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / 1024 / 1024
        print(f" {filename:20s} ({size_mb:6.2f} MB)")
    else:
        print(f" {filename:20s} MISSING!")
        all_good = False

if all_good:
    print("\n All files created successfully!")
    print("\n Files location: C:\\Users\\momo9\\Desktop\\DSC256R_Assignment2\\data\\processed")
else:
    print("\n Some files are missing!")

 train.csv            ( 23.41 MB)
 train.parquet        ( 11.76 MB)
 test.csv             (  5.78 MB)
 test.parquet         (  3.13 MB)
 user_to_idx.pkl      (  0.85 MB)
 item_to_idx.pkl      (  0.21 MB)
 metadata.pkl         (  0.00 MB)

 All files created successfully!

 Files location: C:\Users\momo9\Desktop\DSC256R_Assignment2\data\processed


In [26]:
# Check for 0 or missing ratings
print("Investigating rating values...")
print("="*70)

print("\nRating value counts (including 0):")
print(df_raw['rating'].value_counts(dropna=False).sort_index())

print("\nNumber of 0 ratings:", (df_raw['rating'] == 0).sum())
print("Number of NaN ratings:", df_raw['rating'].isna().sum())

print("\nSample of reviews with rating = 0:")
zero_ratings = df_raw[df_raw['rating'] == 0]
if len(zero_ratings) > 0:
    print(zero_ratings[['rating', 'user_id', 'parent_asin', 'text']].head())
else:
    print("No 0 ratings found")

print("\n" + "="*70)

Investigating rating values...

Rating value counts (including 0):
rating
1.0     83279
2.0     37382
3.0     49984
4.0     70115
5.0    365518
Name: count, dtype: int64

Number of 0 ratings: 0
Number of NaN ratings: 0

Sample of reviews with rating = 0:
No 0 ratings found



# part2

In [27]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import mean_squared_error

# load data
train = pd.read_csv('/data/processed/train.csv')
test = pd.read_csv('/data/processed/test.csv')

print('train size:', len(train))
print('test size:', len(test))

train size: 52544
test size: 13136


In [28]:
ratingsPerUser_bias = defaultdict(list)
ratingsPerItem_bias = defaultdict(list)

for _, row in train.iterrows():
    u = row['user_id']
    i = row['item_id']
    r = row['rating']
    ratingsPerUser_bias[u].append((i,r))
    ratingsPerItem_bias[i].append((u,r))

In [29]:
# Bias Model Training
alpha = train['rating'].mean()
betaU = defaultdict(float)
betaI = defaultdict(float)
lamb = 5
max_iter = 20

def update_alpha(train, betaU, betaI):
    numer = 0
    for idx, row in train.iterrows():
        u = row['user_id']
        i = row['item_id']
        r = row['rating']
        numer += r - (betaU.get(u,0) + betaI.get(i,0))
    return numer / len(train)

def update_betaU(ratingsPerUser, alpha, betaI, lamb):
    newBetaU = {}
    for u, items in ratingsPerUser.items():
        numer = 0
        for (i, r) in items:
            numer += r - (alpha + betaI.get(i,0))
        newBetaU[u] = numer / (lamb + len(items))
    return newBetaU

def update_betaI(ratingsPerItem, alpha, betaU, lamb):
    newBetaI = {}
    for i, users in ratingsPerItem.items():
        numer = 0
        for (u, r) in users:
            numer += r - (alpha + betaU.get(u,0))
        newBetaI[i] = numer / (lamb + len(users))
    return newBetaI

In [30]:
for t in range(max_iter):
    alpha = update_alpha(train, betaU, betaI)
    betaU = update_betaU(ratingsPerUser_bias, alpha, betaI, lamb)
    betaI = update_betaI(ratingsPerItem_bias, alpha, betaU, lamb)

    # compute train MSE
    mse = 0
    for _, row in train.iterrows():
        u = row['user_id']
        i = row['item_id']
        r = row['rating']
        pred = alpha + betaU.get(u,0) + betaI.get(i,0)
        pred = max(1, min(5, pred))
        mse += (r - pred)**2
    mse /= len(train)
    print(f"Iter {t+1}: alpha={alpha:.2f}, MSE={mse:.2f}")


Iter 1: alpha=4.14, MSE=0.88
Iter 2: alpha=4.12, MSE=0.90
Iter 3: alpha=4.11, MSE=0.90
Iter 4: alpha=4.10, MSE=0.90
Iter 5: alpha=4.10, MSE=0.90
Iter 6: alpha=4.10, MSE=0.90
Iter 7: alpha=4.09, MSE=0.90
Iter 8: alpha=4.09, MSE=0.90
Iter 9: alpha=4.09, MSE=0.90
Iter 10: alpha=4.09, MSE=0.90
Iter 11: alpha=4.09, MSE=0.90
Iter 12: alpha=4.09, MSE=0.90
Iter 13: alpha=4.09, MSE=0.90
Iter 14: alpha=4.09, MSE=0.90
Iter 15: alpha=4.09, MSE=0.90
Iter 16: alpha=4.09, MSE=0.90
Iter 17: alpha=4.09, MSE=0.90
Iter 18: alpha=4.09, MSE=0.90
Iter 19: alpha=4.09, MSE=0.90
Iter 20: alpha=4.09, MSE=0.90


In [31]:
test_preds = []

for _, row in test.iterrows():
    u = row['user_id']
    i = row['item_id']
    pred = alpha + betaU.get(u,0) + betaI.get(i,0)
    pred = max(1.0, min(5.0, pred))
    test_preds.append(pred)

mse_test = mean_squared_error(test['rating'], test_preds)
print("Bias model Test MSE:", mse_test)

Bias model Test MSE: 1.4696601040307908


In [37]:
# early version of model, will add image encoding vector later to try and further
# improve it.

In [32]:
train_df["user_idx"] = train["user_id"].map(user_to_idx).astype(int)
train_df["item_idx"] = train["item_id"].map(item_to_idx).astype(int)

test_df["user_idx"] = test["user_id"].map(user_to_idx).astype(int)
test_df["item_idx"] = test["item_id"].map(item_to_idx).astype(int)


In [35]:
train0 = list(zip(
    train_df["user_idx"].values,
    train_df["item_idx"].values,
    train_df["rating"].astype(float).values
))
test0 = list(zip(
    test_df["user_idx"].values,
    test_df["item_idx"].values,
    test_df["rating"].astype(float).values
))
k=20
lr=0.01
reg=0.05
img_dim=32
P = np.random.normal(0, 0.1, (len(user_to_idx), k))
Q = np.random.normal(0, 0.1, (len(item_to_idx), k))
betaU = np.zeros(len(user_to_idx))
betaI = np.zeros(len(item_to_idx))
alpha = train['rating'].mean()

for ep in range(10):
    np.random.shuffle(train0)
    for u, i, r in train0:
        pred = alpha + betaU[u] + betaI[i] + np.dot(P[u], Q[i])
        pred = np.clip(pred, 1, 5)
        err = r - pred

        betaU[u] += lr*(err - reg*betaU[u])
        betaI[i] += lr*(err - reg*betaI[i])
        P[u] += lr*(err*Q[i] - reg*P[u])
        Q[i] += lr*(err*P[u] - reg*Q[i])



wz = np.zeros(img_dim)
for ep in range(10):
    np.random.shuffle(train0)
    for u,i,r in train0:
        pred = alpha + betaU[u] + betaI[i] + np.dot(P[u],Q[i])
        err = r - pred

        betaU[u] += lr*(err - reg*betaU[u])
        betaI[i] += lr*(err - reg*betaI[i])
        P[u]   += lr*(err*Q[i] - reg*P[u])
        Q[i]   += lr*(err*P[u] - reg*Q[i])
        # wz stays zero

# Evaluate
preds = []
true = []
for u,i,r in test0:
    pred = alpha + betaU[u] + betaI[i] + np.dot(P[u],Q[i])
    preds.append(np.clip(pred,1,5))
    true.append(r)
v1mse=mean_squared_error(true, preds)
print("Test MSE:", v1mse)

Test MSE: 1.4364589030088477


In [36]:
print("Do we beat baseline?")
print("yes") if v1mse<mse_test else print("no")

Do we beat baseline?
yes
