In [1]:
import sys
# Install `requests` using the interpreter that backs this kernel
# (sys.executable), so the package lands in the environment the notebook
# actually runs in rather than whichever `pip` is first on PATH.
# NOTE(review): the version is unpinned and none of the cells shown here import
# `requests` directly — presumably it is needed by ../src/data_loader; confirm.
!{sys.executable} -m pip install requests



In [2]:
"""
Data Preprocessing Pipeline
DSC256 Assignment 2 
"""

import sys

# Make the project's helper modules importable. The path is relative to the
# notebook's working directory. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_loader import (
    download_amazon_reviews,
    load_reviews_from_parquet,
    preprocess_reviews,
    create_user_item_mappings,
    create_train_test_split,
    save_processed_data,
    get_dataset_statistics
)

import pandas as pd
import warnings
# Suppress warning output for the rest of the kernel session.
# NOTE(review): with no category/module argument this filter matches every
# warning, so deprecation or dtype warnings raised by later cells are hidden
# too — consider narrowing it to the specific warning that motivated it.
warnings.filterwarnings('ignore')

# Banner announcing the start of the pipeline run.
# (The title previously read "prepocessing".)
PIPELINE_TITLE = "Data preprocessing pipeline"

print("=" * 70)
print(PIPELINE_TITLE)
print("=" * 70)

Data prepocessing pipeline


In [3]:


import pandas as pd
import json
import os
from tqdm import tqdm


# Location of the raw review dump. It is not fetched automatically: the
# notebook stops with download instructions when the file is absent.
jsonl_file = '../data/raw/All_Beauty.jsonl'

file_is_present = os.path.exists(jsonl_file)
if file_is_present:
    print(f"\n Loading data from: {jsonl_file}")
    # Size is reported in MiB (bytes / 1024 / 1024).
    size_in_mb = os.path.getsize(jsonl_file) / 1024 / 1024
    print(f"File size: {size_in_mb:.2f} MB")
else:
    # Walk the user through the manual download, then abort the cell.
    download_steps = (
        f"\n File not found: {jsonl_file}",
        "\nPlease download the file manually:",
        "1. Go to: https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main/raw/review_categories",
        "2. Click on 'All_Beauty.jsonl'",
        "3. Download it",
        f"4. Save it as: {jsonl_file}",
    )
    for step in download_steps:
        print(step)
    raise FileNotFoundError(f"Please download the file to {jsonl_file}")

# Configuration
USE_SAMPLE = True     # True: stop after MAX_REVIEWS parsed reviews; False: read the whole file
MAX_REVIEWS = 100000  # sample size, only used when USE_SAMPLE is True

# Load the JSONL file line by line (one JSON object per line).
#
# Differences from the earlier version of this cell:
#   * the sample limit counts successfully parsed reviews, not lines read, so
#     malformed lines no longer shrink the sample or leave the bar incomplete;
#   * unparseable lines are still skipped, but they are counted and reported
#     instead of being dropped silently;
#   * the progress bar is a context manager, so it is closed even if reading
#     fails part-way through;
#   * sample and full mode share a single loop.
reviews = []
skipped_lines = 0
review_limit = MAX_REVIEWS if USE_SAMPLE else None  # None means "no limit"

with open(jsonl_file, 'r', encoding='utf-8') as f:
    if USE_SAMPLE:
        print(f"  Sample mode: loading first {MAX_REVIEWS:,} reviews")
    else:
        print("Loading all reviews (this may take a while)...")

    with tqdm(total=review_limit, desc="Loading reviews") as pbar:
        for line in f:
            if review_limit is not None and len(reviews) >= review_limit:
                break
            if not line.strip():
                # Blank lines carry no record and are not counted as malformed.
                continue
            try:
                reviews.append(json.loads(line))
            except json.JSONDecodeError:
                skipped_lines += 1
                continue
            pbar.update(1)

if skipped_lines:
    print(f"  Skipped {skipped_lines:,} malformed line(s)")

# Turn the list of parsed review records into a single DataFrame
df_raw = pd.DataFrame(reviews)
n_loaded = len(df_raw)
print(f"\n Loaded {n_loaded:,} reviews")

# Persist the raw frame as parquet so later cells can reload it from disk
raw_file = '../data/raw/All_Beauty_reviews.parquet'
df_raw.to_parquet(raw_file, index=False)
print(f" Saved to: {raw_file}")

section_rule = "=" * 70


def _print_section(title, content=None):
    """Print a ruled section heading, followed by `content` when one is given."""
    print("\n" + section_rule)
    print(title)
    print(section_rule)
    if content is not None:
        print(content)


# Quick look at what was loaded: sample rows, column names, dtypes
_print_section("DATA PREVIEW", df_raw.head())
_print_section("COLUMN NAMES", df_raw.columns.tolist())
_print_section("DATA TYPES", df_raw.dtypes)
_print_section(" Data loaded successfully! Ready for preprocessing.")


 Loading data from: ../data/raw/All_Beauty.jsonl
File size: 311.48 MB
  Sample mode: loading first 100,000 reviews


Loading reviews: 100%|███████| 100000/100000 [00:00<00:00, 125644.70it/s]



 Loaded 100,000 reviews
 Saved to: ../data/raw/All_Beauty_reviews.parquet

DATA PREVIEW
   rating                                      title  \
0     5.0  Such a lovely scent but not overpowering.   
1     4.0     Works great but smells a little weird.   
2     5.0                                       Yes!   
3     1.0                          Synthetic feeling   
4     5.0                                         A+   

                                                text images        asin  \
0  This spray is really nice. It smells really go...     []  B00YQ6X8EO   
1  This product does what I need it to do, I just...     []  B081TJ8YS3   
2                          Smells good, feels great!     []  B07PNNCSP9   
3                                     Felt synthetic     []  B09JS339BZ   
4                                            Love it     []  B08BZ63GMJ   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B00YQ6X8EO  AGKHLEW2SOWHNMFQIJGBECAF7INQ  15

In [4]:
# Reload the reviews from the parquet file written by the previous cell
# (raw_file). This rebinds df_raw: from here on it holds whatever
# load_reviews_from_parquet returns, not the frame built from the JSONL lines.
df_raw = load_reviews_from_parquet(raw_file)

# Show first few rows
df_raw.head()

Loading reviews from: ../data/raw/All_Beauty_reviews.parquet
Loaded 100,000 reviews


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True


In [5]:
# Clean the raw reviews. According to the log this call prints, it first drops
# invalid ratings and then applies k-core filtering iteratively until an
# iteration removes no further reviews.
# NOTE(review): the thresholds below are presumably the minimum number of
# reviews a user / an item must have to be kept — confirm in data_loader.
df_clean = preprocess_reviews(
    df_raw,
    min_user_reviews=2,  # logged as "min_user=2"
    min_item_reviews=2   # logged as "min_item=2"
)



Processing data

Original dataset: 100,000 reviews
After removing invalid ratings (0 or >5): 100,000 reviews (100.0%)

Applying k-core filtering (min_user=2, min_item=2)...
  Iteration 1: 21,515 reviews (removed 78,485)
  Iteration 2: 14,597 reviews (removed 6,918)
  Iteration 3: 12,105 reviews (removed 2,492)
  Iteration 4: 11,200 reviews (removed 905)
  Iteration 5: 10,724 reviews (removed 476)
  Iteration 6: 10,514 reviews (removed 210)
  Iteration 7: 10,390 reviews (removed 124)
  Iteration 8: 10,332 reviews (removed 58)
  Iteration 9: 10,292 reviews (removed 40)
  Iteration 10: 10,271 reviews (removed 21)
  Iteration 11: 10,259 reviews (removed 12)
  Iteration 12: 10,252 reviews (removed 7)
  Iteration 13: 10,248 reviews (removed 4)
  Iteration 14: 10,247 reviews (removed 1)
  Iteration 15: 10,246 reviews (removed 1)
  Iteration 16: 10,245 reviews (removed 1)
  Iteration 17: 10,245 reviews (removed 0)

After k-core: 10,245 reviews (10.2% retained)
Unique users: 3,543
Unique items

In [6]:

# Build the user and item lookup tables from the cleaned reviews.
# NOTE(review): presumably these map raw user_id / item id values to contiguous
# integer indices (the names and the *_to_idx.pkl outputs suggest so) — confirm
# in data_loader, including which item column (asin vs parent_asin) is used.
user_to_idx, item_to_idx = create_user_item_mappings(df_clean)



CREATING ID MAPPINGS

Created mappings:
  Users: 3,543
  Items: 2,998


In [7]:

# Split the cleaned reviews into train and test frames (logged as 80% / 20%).
# random_state is fixed so that the split is repeatable across runs.
# NOTE(review): whether the split is purely random or per-user/temporal is not
# visible from this cell — check data_loader before relying on every test
# user/item also appearing in train.
train_df, test_df = create_train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42
)



CREATING TRAIN/TEST SPLIT

Split: 80% train / 20% test
  Train: 8,196 reviews
  Test:  2,049 reviews


In [8]:
# Save all processed data
# Writes both splits and both ID mappings under output_dir. Per the log this
# call prints, that is train/test as CSV and parquet, the two *_to_idx pickles
# and a metadata.pkl.
save_processed_data(
    train_df,
    test_df,
    user_to_idx,
    item_to_idx,
    output_dir='../data/processed'
)



SAVING PROCESSED DATA

Saving to: ../data/processed
Saved files:
  - train.csv & train.parquet
  - test.csv & test.parquet
  - user_to_idx.pkl (3,543 users)
  - item_to_idx.pkl (2,998 items)
  - metadata.pkl


In [9]:
# Compute summary statistics for the split dataset and print them as a report
stats = get_dataset_statistics(train_df, test_df, user_to_idx, item_to_idx)

report_rule = "=" * 70

print("\n" + report_rule)
print("DATASET STATISTICS SUMMARY")
print(report_rule)

# Headline counts, all rendered with thousands separators
print("\n Dataset Overview:")
for label, key in (
    ("Total reviews:", 'n_reviews_total'),
    ("Train reviews:", 'n_reviews_train'),
    ("Test reviews: ", 'n_reviews_test'),
    ("Unique users: ", 'n_users'),
    ("Unique items: ", 'n_items'),
):
    print(f"  {label} {stats[key]:,}")

# Density / sparsity are stored as fractions and shown as percentages
print("\n Sparsity:")
density_pct = stats['density'] * 100
print(f"  Density:  {density_pct:.6f}%")
sparsity_pct = stats['sparsity'] * 100
print(f"  Sparsity: {sparsity_pct:.4f}%")

print("\n Rating Statistics:")
print("  Mean: {:.3f}".format(stats['rating_mean']))
print("  Std:  {:.3f}".format(stats['rating_std']))
print("  Range: [{:.1f}, {:.1f}]".format(stats['rating_min'], stats['rating_max']))

# User and item activity share the same mean / median / max layout
for heading, noun, max_label in (
    ("User Activity", "user", "Max reviews by one user:"),
    ("Item Popularity", "item", "Max reviews for one item:"),
):
    key_prefix = f"reviews_per_{noun}"
    print(f"\n {heading}:")
    print(f"  Mean reviews per {noun}:   {stats[key_prefix + '_mean']:.2f}")
    print(f"  Median reviews per {noun}: {stats[key_prefix + '_median']:.0f}")
    print(f"  {max_label} {stats[key_prefix + '_max']:,}")

# Share of each star rating relative to the total number of reviews
print("\n Rating Distribution:")
for stars, n_reviews in sorted(stats['rating_distribution'].items()):
    share = n_reviews / stats['n_reviews_total'] * 100
    print(f"  {stars} stars: {n_reviews:,} ({share:.1f}%)")

print("\n" + report_rule)
print("✅ PREPROCESSING COMPLETE!")
print(report_rule)
# Usage hint for downstream notebooks / scripts
for hint in (
    "from src.data_loader import load_splits",
    "train, test, user_map, item_map = load_splits()",
):
    print(hint)


DATASET STATISTICS SUMMARY

 Dataset Overview:
  Total reviews: 10,245
  Train reviews: 8,196
  Test reviews:  2,049
  Unique users:  3,543
  Unique items:  2,998

 Sparsity:
  Density:  0.096452%
  Sparsity: 99.9035%

 Rating Statistics:
  Mean: 4.210
  Std:  1.191
  Range: [1.0, 5.0]

 User Activity:
  Mean reviews per user:   2.89
  Median reviews per user: 2
  Max reviews by one user: 135

 Item Popularity:
  Mean reviews per item:   3.42
  Median reviews per item: 2
  Max reviews for one item: 45

 Rating Distribution:
  1.0 stars: 645 (6.3%)
  2.0 stars: 519 (5.1%)
  3.0 stars: 993 (9.7%)
  4.0 stars: 1,967 (19.2%)
  5.0 stars: 6,121 (59.7%)

✅ PREPROCESSING COMPLETE!
from src.data_loader import load_splits
train, test, user_map, item_map = load_splits()


In [10]:
# Verify files were created
import os

processed_dir = '../data/processed'

files_to_check = [
    'train.csv',
    'train.parquet',
    'test.csv',
    'test.parquet',
    'user_to_idx.pkl',
    'item_to_idx.pkl',
    'metadata.pkl'
]


def find_missing_files(directory, filenames):
    """Report each expected file and return the names that are absent.

    Prints one line per entry: the file size in MiB when the file exists
    under `directory`, or a MISSING marker when it does not.

    Args:
        directory: Folder the files are expected in.
        filenames: Iterable of file names to look for inside `directory`.

    Returns:
        List of the names from `filenames` that were not found, in input order.
    """
    missing = []
    for filename in filenames:
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            size_mb = os.path.getsize(filepath) / 1024 / 1024
            print(f" {filename:20s} ({size_mb:6.2f} MB)")
        else:
            print(f" {filename:20s} MISSING!")
            missing.append(filename)
    return missing


missing_files = find_missing_files(processed_dir, files_to_check)
all_good = not missing_files

if all_good:
    print("\n All files created successfully!")
    # Report the real location on whichever machine runs the notebook; this
    # used to print a hard-coded absolute path from the author's computer.
    print(f"\n Files location: {os.path.abspath(processed_dir)}")
else:
    print("\n Some files are missing!")

 train.csv            (  4.56 MB)
 train.parquet        (  2.34 MB)
 test.csv             (  1.07 MB)
 test.parquet         (  0.59 MB)
 user_to_idx.pkl      (  0.11 MB)
 item_to_idx.pkl      (  0.05 MB)
 metadata.pkl         (  0.00 MB)

 All files created successfully!

 Files location: C:\Users\momo9\Desktop\DSC256R_Assignment2\data\processed


In [11]:
# Sanity check: look for ratings that are 0 or missing in the raw reviews
print("Investigating rating values...")
print("=" * 70)

print("\nRating value counts (including 0):")
ratings = df_raw['rating']
print(ratings.value_counts(dropna=False).sort_index())

# Boolean mask of rows whose rating is exactly 0
zero_mask = ratings == 0
print("\nNumber of 0 ratings:", zero_mask.sum())
print("Number of NaN ratings:", ratings.isna().sum())

print("\nSample of reviews with rating = 0:")
zero_ratings = df_raw[zero_mask]
if len(zero_ratings) == 0:
    print("No 0 ratings found")
else:
    preview_columns = ['rating', 'user_id', 'parent_asin', 'text']
    print(zero_ratings[preview_columns].head())

print("\n" + "=" * 70)

Investigating rating values...

Rating value counts (including 0):
rating
1.0    11042
2.0     6135
3.0     9093
4.0    13156
5.0    60574
Name: count, dtype: int64

Number of 0 ratings: 0
Number of NaN ratings: 0

Sample of reviews with rating = 0:
No 0 ratings found

