# Amazon Reviews Sentiment Analysis
## BTT Cadence 2B

This code extracts actionable business insights from customer product reviews
Dataset: https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023
Aim: Identifiy the most liked/criticized product features for improvement prioritization based on market needs

## Installations & Imports

In [1]:
# Parsing and cleaning HTML/XML content
!pip install beautifulsoup4
# Fast text parser
!pip install lxml



In [225]:
print("Package versions:")
# Perform operating system tasks
import os

# Manually frees memory through deletion
import gc

# Provide interpreter information
import sys

# File operations
import shutil

# Regular expressions for pattern matching
import re

# Counting hashable objects
from collections import Counter, defaultdict

# Type hints
from typing import Union, List, Any, Set

# Numerical computing and data manipulation
import pandas as pd
import numpy as np

# Hugging face dataset loading and preprocessing
from datasets import Dataset, load_dataset
from huggingface_hub import hf_hub_download
import pyarrow.parquet as pq

# Transformer models and inference
import transformers

# Parse and remove HTML tags/artifacts from text
from bs4 import BeautifulSoup

# Generating train-test splits and accuracy scores
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Compute class weights for unbalanced datasets
from sklearn.utils.class_weight import compute_class_weight

# Wrapper for faster training
from unsloth import FastModel, is_bfloat16_supported

# Automatic model loader for classification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, AutoTokenizer, AutoModelForCausalLM

# For deep learning operations and GPU acceleration
import torch

# Progress bar to track batch processing
from tqdm import tqdm


print(f"Python: {sys.version}")
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")
print(f"transformers: {transformers.__version__}")

Package versions:
Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
pandas: 2.2.2
numpy: 2.0.2
transformers: 4.55.4


In [3]:
# Dataset configuration and constants
REPO_ID = "McAuley-Lab/Amazon-Reviews-2023"

# Path to parquet file in Google Drive
# NOTE: (EDIT TO YOUR LOCAL DRIVE CONTAINING FILE)
e_reviews_dataset = "/content/drive/MyDrive/Cadence2B/amazon_reviews_2023/raw_reviews_electronics.parquet"
balanced_e_sentiment_reviews = "/content/drive/MyDrive/Cadence2B/amazon_reviews_2023/balanced_reviews_with_sentiment_labels.parquet"
stratified_electronics_metadata = "/content/drive/MyDrive/Cadence2B/amazon_reviews_2023/stratified_electronics_metadata.parquet"

!ls /content/drive/MyDrive/Cadence2B/amazon_reviews_2023/

# seed for reproducible results
SEED = 42

pd.set_option('display.max_colwidth', 180)

balanced_reviews.parquet
balanced_reviews_with_sentiment_labels.parquet
filtered_reviews_final.parquet
stratified_electronics_metadata.parquet


In [4]:
# TEMPORARY (REMOVE)
balanced_e_reviews = "/content/drive/MyDrive/Cadence2B/amazon_reviews_2023/balanced_reviews.parquet"
filtered_e_reviews = "/content/drive/MyDrive/Cadence2B/amazon_reviews_2023/filtered_reviews_final.parquet"

## Data Loading

This code implements a data loading pipeline that loads the Amazon-Reviews-2023 dataset from Hugging Face's. It defines functions that downloads and retrieves user review files and item metadata files then converts them into pandas dataframes for analysis.

In [5]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [6]:
#Download and load any parquet files manually
def load_parquet_dataset(file):
  '''
  Load any parquet file dataset from Google Drive

  Returns:
    pandas.DataFrame: DataFrame containing product metadata or None if loading fails.
  '''

  print("Loading dataset...")

  try:
      # Load directly from Hugging face parquet files
      dataset = load_dataset(
          "parquet",
          data_files={"train": file}
      )

      # Convert to pandas DataFrame
      df = dataset["train"].to_pandas()
      return df

  except Exception as e:
      print(f"Direct parquet loading failed: {e}")
      return None

In [7]:
#Download and load parquet file from Hugging Face manually
def load_amazon_electronics_metadata():
  '''
  Load Amazon Beauty product metadata which contain product information independent
  of individual users from Hugging Face.

  Returns:
    pandas.DataFrame: DataFrame containing product metadata or None if loading fails.
  '''

  print("Loading Amazon Beauty product metadata...")

  try:
      # Load directly from Hugging face parquet files
      dataset = load_dataset(
          "parquet",
          data_files={
              "train": "hf://datasets/McAuley-Lab/Amazon-Reviews-2023/raw_meta_Electronics/*.parquet"
          }
      )

      # Convert to pandas DataFrame
      df = dataset["train"].to_pandas()
      return df

  except Exception as e:
      print(f"Direct parquet loading failed: {e}")
      return None

In [None]:
# Load stratified dataset of Electronics user reviews
# with balanced ratings and sentiment labels
balanced_e_sentiment_reviews_dataset = load_parquet_dataset(balanced_e_sentiment_reviews)
print(balanced_e_sentiment_reviews_dataset.shape)
balanced_e_sentiment_reviews_dataset['rating'].value_counts(normalize=True)

Loading dataset...
(1068800, 11)


Unnamed: 0_level_0,proportion
rating,Unnamed: 1_level_1
5.0,0.2
1.0,0.2
2.0,0.2
3.0,0.2
4.0,0.2


In [None]:
# Load stratified dataset of Electronics user reviews
# with balanced ratings and sentiment labels
stratified_electronics_metadata_dataset = load_parquet_dataset(stratified_electronics_metadata)
print(stratified_electronics_metadata_dataset.shape)

Loading dataset...
(15028, 16)


In [None]:
# Save Electronics metadata DataFrame
e_metadata_df = load_amazon_electronics_metadata()

Loading Amazon Beauty product metadata...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


raw_meta_Electronics/full-00000-of-00010(…):   0%|          | 0.00/220M [00:00<?, ?B/s]

raw_meta_Electronics/full-00001-of-00010(…):   0%|          | 0.00/214M [00:00<?, ?B/s]

raw_meta_Electronics/full-00002-of-00010(…):   0%|          | 0.00/210M [00:00<?, ?B/s]

raw_meta_Electronics/full-00003-of-00010(…):   0%|          | 0.00/207M [00:00<?, ?B/s]

raw_meta_Electronics/full-00004-of-00010(…):   0%|          | 0.00/202M [00:00<?, ?B/s]

raw_meta_Electronics/full-00005-of-00010(…):   0%|          | 0.00/193M [00:00<?, ?B/s]

raw_meta_Electronics/full-00006-of-00010(…):   0%|          | 0.00/192M [00:00<?, ?B/s]

raw_meta_Electronics/full-00007-of-00010(…):   0%|          | 0.00/183M [00:00<?, ?B/s]

raw_meta_Electronics/full-00008-of-00010(…):   0%|          | 0.00/170M [00:00<?, ?B/s]

raw_meta_Electronics/full-00009-of-00010(…):   0%|          | 0.00/164M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Inspect Metadata DataFrame

In [None]:
e_metadata_df.head(10)

NameError: name 'e_metadata_df' is not defined

In [8]:
# Loop over DataFrame in batches
def inspect_categories(df):
  '''
  Gives breakdown of category percentages within DataFrame

  Parameters:
    df: pandas.DataFrame

  Returns:
    None
  '''
  batch_size = 50_000

  category_counter = Counter()
  num_rows = len(df)

  # Iterate through each row in the batch
  for start in range(0, num_rows, batch_size):
      end = min(start + batch_size, num_rows)
      batch = df.iloc[start:end]

      # Update counter with main_category
      category_counter.update(batch['main_category'])

      print(f"Processed rows {start} to {end}")

  # View distribution
  total = sum(category_counter.values())

  print("\nCategory Distrubution (percent):")
  for category, count in category_counter.most_common():
    print(f"{category}: {count} rows, {count/total:.2%}")

In [9]:
def validate_data_quality(df: pd.DataFrame, dataset_name: str) -> None:
    '''
    Validate data quality and print summary statistics.
    '''
    print(f"\n{dataset_name}Data Quality Report")
    print(f"Shape: {df.shape}")
    print(f"Missing values: {df.isnull().sum().sum()}")
    print(f"Duplicate rows: {df.duplicated().sum()}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
inspect_categories(e_metadata_df)

Processed rows 0 to 50000
Processed rows 50000 to 100000
Processed rows 100000 to 150000
Processed rows 150000 to 200000
Processed rows 200000 to 250000
Processed rows 250000 to 300000
Processed rows 300000 to 350000
Processed rows 350000 to 400000
Processed rows 400000 to 450000
Processed rows 450000 to 500000
Processed rows 500000 to 550000
Processed rows 550000 to 600000
Processed rows 600000 to 650000
Processed rows 650000 to 700000
Processed rows 700000 to 750000
Processed rows 750000 to 800000
Processed rows 800000 to 850000
Processed rows 850000 to 900000
Processed rows 900000 to 950000
Processed rows 950000 to 1000000
Processed rows 1000000 to 1050000
Processed rows 1050000 to 1100000
Processed rows 1100000 to 1150000
Processed rows 1150000 to 1200000
Processed rows 1200000 to 1250000
Processed rows 1250000 to 1300000
Processed rows 1300000 to 1350000
Processed rows 1350000 to 1400000
Processed rows 1400000 to 1450000
Processed rows 1450000 to 1500000
Processed rows 1500000 to 

## Stratified Sampling
Randomly sampling rows per category within the Amazon Electronics Metadata dataset, keeping the proportions of each category. Rare categories are not excluded through the min_per_category parameter.

In [None]:
def stratified_sample(df, category_sel='main_category', sample_frac=0.01, min_per_category=1, random_state=SEED, output_file=None):
  '''
  Create stratified sample maintaining category proportions.

  Parameters:
    df: Inputer metadata DataFrame
    category_sel: Column to sample from
    sample_frac: Fraction of data to sample
    min_per_category: Minimum # of records per sample category
    output_file: (optional) Path to save final stratified sample as a parquet file

  Returns:
    pandas.DataFrame: Stratified sample of the input DataFrame
  '''
  # Identify approximate sample size per category
  category_counts = df[category_sel].value_counts()
  total_rows = len(df)
  sample_counts = (category_counts * sample_frac).astype(int)

  # Ensure every category has min_per_category rows
  sample_counts = sample_counts.apply(lambda x: max(x, min_per_category))

  # Accumalate sampled rows for each category
  sampled_dfs = []

  for category, sample in sample_counts.items():
    category_df = df[df[category_sel] == category]

    # Take all rows if category is less than or equal target sample size
    if len(category_df) <= sample:
      sampled_dfs.append(category_df)
    # Else, randomly sample target # of rows
    else:
      sampled_dfs.append(category_df.sample(n=sample, random_state=random_state))

  # Concatenate all sampled category DataFrames into one
  stratified_sample = pd.concat(sampled_dfs, ignore_index=True)
  # Randomize the final sample
  stratified_sample = stratified_sample.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

  if output_file:
    stratified_sample.to_parquet(output_file, index=False)
    print(f"Stratified sample saved to {output_file}")

  return stratified_sample



In [None]:
# Stratified sample
sampled_reviews = stratified_sample(e_metadata_df, sample_frac=0.01, output_file="stratified_electronics_metadata.parquet")

Stratified sample saved to stratified_electronics_metadata.parquet


In [None]:
sampled_reviews.head(10)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Home Audio & Theater,A Pair Soft Earpads Replacement Ear Pads Cushion for Sony MDR-XB950BT/B Extra Bass Bluetooth Wireless Headphones,3.8,3,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/images/I/515bA--eq9L._AC_SL1024_.jpg', 'https://m.media-amazon.com/images/I/61WC7CxcJgL._AC_SL1024_.jpg', 'https://m.media-amazon....","{'title': [], 'url': [], 'user_id': []}",GZF,[],"{""Package Dimensions"": ""5 x 1.5 x 0.4 inches"", ""Item Weight"": ""1.15 pounds"", ""Manufacturer"": ""GZF"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""November 1...",B01LS1RRYQ,,,
1,All Electronics,"Replacement Remote Control for SANYO DP42848, DP37649, DP46848, DP26648, DP19648, GXBJ",5.0,2,"[Redi-Remotes cannot be programmed to control any auxillary devices. They will do every function for the specific unit they are designed to control., Remote measures 7 1/2"" x 2...",[This is a custom built replacement remote made by Redi Remote for the SANYO remote control number GXBJ. *This is NOT an original remote control. It is a custom replacement rem...,45.99,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/41KTiyljVGL._AC_.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/41KTiyljVGL._AC_US40_.jpg'], 'variant':...","{'title': [], 'url': [], 'user_id': []}",REDI REMOTE,"[Electronics, Television & Video, Accessories, Remote Controls]","{""Product Dimensions"": ""7 x 2.5 x 1 inches"", ""Item Weight"": ""4.5 ounces"", ""Item model number"": ""RTGXBJ"", ""Batteries"": ""4 AAA batteries required. (included)"", ""Date First Availa...",B009D5M8FI,,,
2,Computers,"TRIPLTEK Tablet 7"" PRO 8GB RAM High Brightness 1200 nits, 4G LTE Unlocked, 8 Core Processor 128GB, Android 9, Long Battery Life 10000mah, Rugged Military Construction, Brightes...",4.3,148,"[8GB RAM 128GB ROM, 1200 Nits display, 4G LTE Unlocked, Android 9, Rugged Tablet great for drones]",[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71hnnOHAITL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/71QKmBBXNOL._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': [], 'url': [], 'user_id': []}",TRIPLTEK,"[Electronics, Computers & Accessories, Computers & Tablets, Tablets]","{""Standing screen display size"": ""7 Inches"", ""Screen Resolution"": ""1280 x 720 pixels"", ""Processor"": ""2 GHz cortex"", ""RAM"": ""8 GB"", ""Hard Drive"": ""128 GB"", ""Wireless Type"": ""5.8...",B08R871DBM,,,
3,All Electronics,3 DISC SET LASER DISC Deluxe CAV Letter-Box Edition- Twenty-Fifth Anniversary- 2001: A Space Odyssey,3.0,1,"[COLOR 3 DISC BOX SET- CHAPTER SEARCH, STEREO, CONTAINS ORIGINAL THEATRICAL TRAILER & 1 2001 FEATURETTE, RUNNING TIME 2 HRS 19 MINS., STANDARD PLAY]",[DELUXE CAV LETTER-BOX EDITION. TWENTY-FIFTH ANNIVERSARY M-G-M/UA HOME VIDEO CONTAINS ORIGINAL THEATRICAL TRAILER & 1 2001 FEATURETTE DIGITALLY MASTERED DIRECTLY FROM MGM ARCHI...,,"{'hi_res': ['https://m.media-amazon.com/images/I/91JVgCJakLL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/91OVwvU-UFL._AC_SL1500_.jpg'], 'large': ['https://m.media-am...","{'title': [], 'url': [], 'user_id': []}",3 DISC SET LASER DISC,[],"{""Product Dimensions"": ""0.01 x 0.01 x 0.01 inches"", ""Item Weight"": ""0.16 ounces"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""November 29, 2007"", ""Manufac...",B00108E1GU,,,
4,Camera & Photo,Canon C250 AV Cable (RCA to Miniplug),3.0,9,[Connect mono audio and video to TV or VCR],"[From the Manufacturer, Connect mono audio and video to TV or VCR (with this mini plug to RCA cable).]",,"{'hi_res': [None, None, None], 'large': ['https://m.media-amazon.com/images/I/51DIRBl7CkL._AC_.jpg', 'https://m.media-amazon.com/images/I/51zD41lLJcL._AC_.jpg', 'https://m.medi...","{'title': [], 'url': [], 'user_id': []}",Canon,"[Electronics, Television & Video, Accessories, Cables, RCA]","{""Brand"": ""Canon"", ""Connector Type"": ""RCA"", ""Cable Type"": ""Composite"", ""Compatible Devices"": ""Television"", ""Color"": ""Black"", ""Connector Gender"": ""Male-to-Male"", ""Shape"": ""Round...",B00009V4H4,,,
5,Computers,AXXESS AX-NISUSB-2 - USB-Auxilliary Retention Harness - NISSAN 4 PIN USB ADAPTOR,1.0,2,[],[],,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/31Dst9kHrBL._AC_.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/31Dst9kHrBL._AC_US40_.jpg'], 'variant':...","{'title': [], 'url': [], 'user_id': []}",AXXESS,"[Electronics, Computers & Accessories, Computer Accessories & Peripherals, Cables & Accessories, Cables & Interconnects, USB Cables]","{""Item Weight"": ""2 pounds"", ""Manufacturer"": ""AXXESS"", ""Date First Available"": ""October 4, 2013""}",B00FMSBURM,,,
6,All Electronics,Comet Original CHA-250B HF/50 MHz (3.5~57) Broadband Ground-Plane Vertical Base Antenna,4.2,61,"[Multi Band HF Vertical Antenna For 6 Meter Through 80 Meters HF., Requires No Radials, No Tuning or Adjustments!., SWR of 1.61 or Lower on All Bands!, Fits Optional Mast Pipes...","[Multi Band HF Vertical Amateur Ham Radio Antenna For 6 Meter Through 80 Meters HF,Requires No Radials, No Tuning or Adjustments!. SWR of 1.61 or Lower on All Bands!Fits Option...",489.95,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/21T6CGIfWCL._AC_.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/21T6CGIfWCL._AC_US40_.jpg'], 'variant':...","{'title': ['Collapsible and Adjustable HF Antenna: GRA-7350TC'], 'url': ['https://www.amazon.com/vdp/0b34a39f116d4d2283bd014a40826446?ref=dp_vse_rvc_0'], 'user_id': ['']}",Recomfit,"[Electronics, Headphones, Earbuds & Accessories, Adapters]","{""Product Dimensions"": ""23.5 x 2 x 2 inches"", ""Item Weight"": ""7 pounds"", ""Item model number"": ""CHA-250B"", ""Best Sellers Rank"": {""Electronics"": 357346, ""Headphone Adapters"": 326...",B00193FHH8,,,
7,Automotive,Beck/Arnley 203-0291 Cooling Fan Control Module,2.3,4,"[Beck/Arnley parts meet foreign nameplate OE specifications for form, fit and function. Our product specialists work with a network of global sourcing partners so you can insta...","[The Beck/Arnley cooling fan control module matches OE form, fit and function. It is constructed for durability and made with high quality materials and componentry to withstan...",143.99,"{'hi_res': ['https://m.media-amazon.com/images/I/71ndBAVVFWL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/718YQdCqznL._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': [], 'url': [], 'user_id': []}",Beck/Arnley,"[Electronics, Computers & Accessories, Computer Components, Internal Components, Fans & Cooling, Case Fans]","{""Manufacturer"": ""Beck/Arnley"", ""Brand"": ""Beck/Arnley"", ""Item Weight"": ""14.4 ounces"", ""Package Dimensions"": ""7.05 x 4.37 x 2.01 inches"", ""Country of Origin"": ""Taiwan"", ""Item mo...",B07T5FWT9R,,,
8,Camera & Photo,Polarizit B76 Dashboard Camera Glare Flash Reflection Remover for BlackVue CPL Polarized Lens Filter for Models DR750S DR650S DR650GW 1CH 2CH 16GB 32GB 64GB Dashcam Dash Camera...,3.8,16,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71NjlDb7OWL._AC_SL1500_.jpg', None, None, None, None, None, 'https://m.media-amazon.com/images/I/61tRL9kjy8L._AC_SL1200_.jpg'],...","{'title': ['Polarizit CPL Lens Filter for BlackVue DR900S DR750S DR650S DR550'], 'url': ['https://www.amazon.com/vdp/0455a1c547b644bd87fd0541c4e9a4c8?ref=dp_vse_rvc_0'], 'user_...",7DAYCAM,"[Electronics, Camera & Photo, Accessories, Filters & Accessories]","{""Product Dimensions"": ""1 x 1 x 1 inches"", ""Item Weight"": ""2.39 ounces"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""August 20, 2019"", ""Manufacturer"": ""7D...",B07GPLH28W,,,
9,Camera & Photo,HQRP Battery for Welch Allyn 72200 3.6V Ni-Cad fits 71000 71010 71015 71020 71022 71051 71054 71055 71670 71050 Handle 78904586 + HQRP Coaster,3.4,10,[],[#A2_25DEC12_BUMJ+PAO Compatible with: Welch Allyn 72200 3.5V Ni-Cad fits 71000 71010 71015 71020 71022 71051 71054 71055 71670 71050 Handle 78904586 Compatible with: Welch All...,,"{'hi_res': [None, None, None], 'large': ['https://m.media-amazon.com/images/I/516eq1R6ntL._AC_.jpg', 'https://m.media-amazon.com/images/I/41JtchEf2EL._AC_.jpg', 'https://m.medi...","{'title': [], 'url': [], 'user_id': []}",HQRP,"[Electronics, Camera & Photo, Accessories, Batteries & Chargers, Batteries, Camera Batteries]","{""Item model number"": ""88777412251202"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""August 12, 2022"", ""Manufacturer"": ""HQRP"", ""Brand"": ""HQRP"", ""Battery Ce...",B00B5R80DG,,,


## Sample Category Distribution

In [None]:
print(sampled_reviews.shape)
inspect_categories(sampled_reviews)

(15028, 16)
Processed rows 0 to 15028

Category Distrubution (percent):
Computers: 4188 rows, 27.87%
All Electronics: 3764 rows, 25.05%
Camera & Photo: 2236 rows, 14.88%
Cell Phones & Accessories: 1382 rows, 9.20%
Home Audio & Theater: 1065 rows, 7.09%
Industrial & Scientific: 505 rows, 3.36%
Car Electronics: 273 rows, 1.82%
Tools & Home Improvement: 240 rows, 1.60%
Office Products: 212 rows, 1.41%
Amazon Home: 212 rows, 1.41%
AMAZON FASHION: 183 rows, 1.22%
Sports & Outdoors: 145 rows, 0.96%
Automotive: 139 rows, 0.92%
GPS & Navigation: 93 rows, 0.62%
Amazon Devices: 84 rows, 0.56%
Portable Audio & Accessories: 80 rows, 0.53%
Musical Instruments: 66 rows, 0.44%
Toys & Games: 39 rows, 0.26%
Health & Personal Care: 30 rows, 0.20%
All Beauty: 18 rows, 0.12%
Arts, Crafts & Sewing: 14 rows, 0.09%
Apple Products: 10 rows, 0.07%
Video Games: 10 rows, 0.07%
Books: 9 rows, 0.06%
Baby: 7 rows, 0.05%
Pet Supplies: 4 rows, 0.03%
Software: 3 rows, 0.02%
Amazon Fire TV: 2 rows, 0.01%
Appliances: 2 

## Stratified Sample - Electronic User Reviews

In [None]:
def ultra_memory_safe_filter(parquet_path: str, target_asins: Set[str],
                           output_path: str = "filtered_reviews.parquet",
                           batch_size: int = 50000):
    '''
    Reads large Parquet files in batches, filters reviews for target ASINs
    and writes to disk immediatley, never accumulating in RAM.

    Parameters:
        parquet_path: Path to the large parquet file
        target_asins: Set of ASINs to match
        output_path: Where to save filtered results
        batch_size: Rows per batch
    '''

    print(f"Memory-safe filtering for {len(target_asins):,} target ASINs")
    print(f"Batch size: {batch_size:,} rows")
    print(f"Results will be written to: {output_path}")

    # Delete output file if it exists
    if os.path.exists(output_path):
        os.remove(output_path)

    total_matches = 0
    first_write = True

    # Load parquet file without loading entire dataset into memory
    try:
        parquet_file = pq.ParquetFile(parquet_path)
        total_rows = parquet_file.metadata.num_rows
        num_batches = (total_rows + batch_size - 1) // batch_size

        print(f"Total rows: {total_rows:,} in {num_batches} batches\n")

        for batch_num, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size)):
            print(f"Batch {batch_num + 1}/{num_batches}...", end=" ")

            # Convert to pandas
            batch_df = batch.to_pandas()

            # Filter matches
            matches = batch_df[batch_df['parent_asin'].isin(target_asins)]

            if len(matches) > 0:
                # Write directly to disk, create file in first batch
                if first_write:
                    matches.to_parquet(output_path, index=False)
                    first_write = False
                else:
                    # Append to existing file
                    existing = pd.read_parquet(output_path)
                    combined = pd.concat([existing, matches], ignore_index=True)
                    combined.to_parquet(output_path, index=False)
                    del existing, combined

                total_matches += len(matches)
                print(f"Found {len(matches):,} (Total: {total_matches:,})")
            else:
                print("No matches")

            # Delete everything IMMEDIATLEY to free memory
            del batch_df, matches, batch
            gc.collect()

            # Run memory consumption check every 5 batches
            if (batch_num + 1) % 5 == 0:
                memory_mb = psutil.Process().memory_info().rss / 1024 / 1024
                print(f"  RAM: {memory_mb:.1f} MB")

    except Exception as e:
        print(f"Error: {e}")
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    if total_matches > 0:
        print(f"\nSuccess! {total_matches:,} matches saved to {output_path}")
        return output_path
    else:
        print("No matches found")
        return None

In [None]:
def e_load_stratified_reviews(parquet_path: str, target_asins: Set[str], batch_size: int = 25000):
    '''
    Load only reviews for specific ASINs to avoid memory issues

    Parameters:
      parquet_path: File path to the Parquet reviews
      target_asins: Set of ASINs to filter for
      batch_size: Number of rows to read per batch to save memory

    '''
    print(f"Memory conserving loading...")
    print(f"Batch size: {batch_size:,} (very small to avoid RAM crashes)")

    # Save matches to multiple small files, then combine later
    temp_dir = "temp_matches"
    os.makedirs(temp_dir, exist_ok=True)

    total_matches = 0
    file_counter = 0

    # Loop over Parquet file in batches
    try:
        parquet_file = pq.ParquetFile(parquet_path)

        for batch_num, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size)):
            print(f"Batch {batch_num + 1}...", end=" ")

            # Filter for matched parent ASINs
            batch_df = batch.to_pandas()
            matches = batch_df[batch_df['parent_asin'].isin(target_asins)]

            if len(matches) > 0:
                # Save each batch of matches to separate temporary file
                temp_file = f"{temp_dir}/matches_{file_counter:04d}.parquet"
                matches.to_parquet(temp_file, index=False)
                file_counter += 1
                total_matches += len(matches)
                print(f"Saved {len(matches):,} to {temp_file}")
            else:
                print("No matches")

            # Delete batch from memory IMMEDIATELY
            del batch_df, matches, batch
            gc.collect()

    except Exception as e:
        print(f"Error: {e}")

        # Cleanup temp files
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        raise

    # Combine temp files into final result
    if file_counter > 0:
        print(f"\nCombining {file_counter} temp files...")
        temp_files = [f"{temp_dir}/matches_{i:04d}.parquet" for i in range(file_counter)]

        # Read and combine in small chunks
        all_dfs = []
        for temp_file in temp_files:
            df = pd.read_parquet(temp_file)
            all_dfs.append(df)

        # Concatenate smaller DataFrames into one final DataFrame
        final_result = pd.concat(all_dfs, ignore_index=True)
        # Save to disk
        final_result.to_parquet("filtered_reviews_final.parquet", index=False)

        # Cleanup
        shutil.rmtree(temp_dir)

        print(f"Final result: {len(final_result):,} reviews in filtered_reviews_final.parquet")
        return "filtered_reviews_final.parquet"

    return None

In [None]:
def run_safest_filter():
    '''
    Run most memory efficient stratification algorithm automatically

    '''

    # Get target ASINs from sample_reviews
    target_asins = set(sampled_reviews['parent_asin'].unique())
    parquet_path = e_reviews_dataset

    print(f"Target ASINs: {len(target_asins):,}")

    try:
        # Run batch loading and processing function with small batche size
        result_path = e_load_stratified_reviews(
            parquet_path=parquet_path,
            target_asins=target_asins,
            batch_size=25000   # Small batch size
        )

        if result_path:
            # Load the final result
            filtered_reviews = pd.read_parquet(result_path)
            print(f"Successfully filtered {len(filtered_reviews):,} reviews!")
            return filtered_reviews

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

In [None]:
filtered_reviews = run_safest_filter()

Target ASINs: 15,028
Memory conserving loading...
Batch size: 25,000 (very small to avoid RAM crashes)
Batch 1... Saved 240 to temp_matches/matches_0000.parquet
Batch 2... Saved 220 to temp_matches/matches_0001.parquet
Batch 3... Saved 247 to temp_matches/matches_0002.parquet
Batch 4... Saved 221 to temp_matches/matches_0003.parquet
Batch 5... Saved 240 to temp_matches/matches_0004.parquet
Batch 6... Saved 238 to temp_matches/matches_0005.parquet
Batch 7... Saved 201 to temp_matches/matches_0006.parquet
Batch 8... Saved 188 to temp_matches/matches_0007.parquet
Batch 9... Saved 238 to temp_matches/matches_0008.parquet
Batch 10... Saved 209 to temp_matches/matches_0009.parquet
Batch 11... Saved 252 to temp_matches/matches_0010.parquet
Batch 12... Saved 225 to temp_matches/matches_0011.parquet
Batch 13... Saved 213 to temp_matches/matches_0012.parquet
Batch 14... Saved 213 to temp_matches/matches_0013.parquet
Batch 15... Saved 206 to temp_matches/matches_0014.parquet
Batch 16... Saved 232

In [None]:
len(filtered_reviews)

385604

In [None]:
!ls -lh temp_matches/

ls: cannot access 'temp_matches/': No such file or directory


# Data Cleaning & Preprocessing

In [None]:
# Clean HTML tags while protecting product descriptions
def clean_html_artifacts(data: Any) -> str:
    """
    Clean HTML from containers and distinguishing from product descriptions.

    Parameters:
      data: Input data of any type (string, list, dict, etc.)

    Returns:
      str: Cleaned text with only HTML tags removed and surrounding text preserved
    """

    # Return emoty string if data is none, not a list/dict
    if data is None:
        return ""
    if isinstance(data, float) and pd.isna(data):
        return ""

    # If its a string clean it using the helper function
    if isinstance(data, str):
        return _smart_clean_text(data)

    # If its a list process each element and join
    if isinstance(data, list):
        cleaned_items = []
        # Iterate through each list item
        for item in data:

          # Skip object if its NaN or None
            if pd.isna(item) or item is None:
                continue
            cleaned_item = _smart_clean_text(str(item))

            # Append everything into a space separated string
            if cleaned_item.strip():
                cleaned_items.append(cleaned_item)

        return " ".join(cleaned_items)

    # For any other data type convert to string and clean
    return _smart_clean_text(str(data))


In [None]:
def _smart_clean_text(text: str) -> str:
    """
    Clean HTML tags while keeping product descriptions.

    Parameters:
      text: Input text to clean

    Returns:
      text: Text with only HTML tags removed, product descriptions preserved
    """
    # If input isn't string or is empty string, return
    if not isinstance(text, str) or not text.strip():
        return ""

    # Remove HTML comments using regez
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)

    # Find real HTML tags that should be removed
    # Created regex patterns that match HTML tags
    HTML_REMOVE_TAGS = [
        # Standard HTML tags (opening and closing)
        # Bold
        r'<\s*/?\s*b\s*/?>\s*',
        # Italic
        r'<\s*/?\s*i\s*/?>\s*',
        # Underline
        r'<\s*/?\s*u\s*/?>\s*',
        # Strong
        r'<\s*/?\s*strong\s*/?>\s*',
        # Emphasis
        r'<\s*/?\s*em\s*/?>\s*',
        # Paragraph
        r'<\s*/?\s*p\s*/?>\s*',
        # Div
        r'<\s*/?\s*div[^>]*>\s*',
        # Span
        r'<\s*/?\s*span[^>]*>\s*',
        # Line break
        r'<\s*/?\s*br\s*/?>\s*',
        # Horizontal rule
        r'<\s*/?\s*hr\s*/?>\s*',
        # Unordered list
        r'<\s*/?\s*ul\s*/?>\s*',
        # Ordered list
        r'<\s*/?\s*ol\s*/?>\s*',
        # List item
        r'<\s*/?\s*li\s*/?>\s*',
        # Headers
        r'<\s*/?\s*h[1-6]\s*/?>\s*',
        # Links
        r'<\s*/?\s*a[^>]*>\s*',
        # Images
        r'<\s*/?\s*img[^>]*>\s*',
        # Tables
        r'<\s*/?\s*table[^>]*>\s*',
        # Table rows
        r'<\s*/?\s*tr[^>]*>\s*',
        # Table cells
        r'<\s*/?\s*td[^>]*>\s*',
        # Table headers
        r'<\s*/?\s*th[^>]*>\s*',
        # Font tags
        r'<\s*font[^>]*>\s*',
        # Closing font tags
        r'<\s*/?\s*font\s*>\s*',
        # Superscript
        r'<\s*/?\s*sup\s*/?>\s*',
        # Subscript
        r'<\s*/?\s*sub\s*/?>\s*',
    ]

    # Loop through regex patterns and apply HTML tag removal
    for pattern in HTML_REMOVE_TAGS:
        text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)

    # Remove meaningless HTML-like tags
    noise_patterns = [
        # Arrows
        r'<-+>',
        # Equals
        r'<=+>',
        # Empty angle brackets
        r'<\s*>\s*',
    ]

    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)

    # Remove HTML entities
    html_entities = {
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&apos;': "'",
    }

    for entity, replacement in html_entities.items():
        text = text.replace(entity, replacement)

    # Remove whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

In [None]:
def _is_html_tag(text: str) -> bool:
    """
    Identify whether a string in angle brackets is an actual HTML tag or necessary product information.

    Parameters:
      text: Text to analyze

    Returns:
      bool: True if it's an HTML tag, False if it's product description
    """
    # Ignore non-HTML tag-like strings
    if not text.startswith('<') or not text.endswith('>'):
        return False

    # Remove the angle brackets
    inner_content = text[1:-1].strip()

    # Check for closing tags
    if inner_content.startswith('/'):
        inner_content = inner_content[1:].strip()

    # List of all known HTML tag names
    html_tag_names = {
        'a', 'b', 'br', 'div', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'li', 'ol', 'p', 'span', 'strong', 'sub', 'sup',
        'table', 'td', 'th', 'tr', 'u', 'ul'
    }

    # Extract the tag name
    tag_name = inner_content.split()[0].lower() if inner_content else ""

    # Remove any attributes or special characters
    tag_name = re.sub(r'[^a-zA-Z]', '', tag_name)

    return tag_name in html_tag_names



## Clean Stratified User Reviews Sample

In [None]:
text_columns_u_r = filtered_reviews.select_dtypes(include='object').columns.tolist()

In [None]:
for col in text_columns_u_r:
  filtered_reviews[f"{col}_clean"] = filtered_reviews[col].apply(clean_html_artifacts)

In [None]:
filtered_reviews.head(5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,title_clean,text_clean,images_clean,asin_clean,parent_asin_clean,user_id_clean
0,4.0,assemble as instructed,this was a gift / my friend says he liked it / it was what he thought it was / he likes the idea of open computer . . .,[],B07547WSQQ,B07547WSQQ,AEFKF6R2GUSK2AWPSWRR4ZO36JVQ,1571208521372,0,True,assemble as instructed,this was a gift / my friend says he liked it / it was what he thought it was / he likes the idea of open computer . . .,[],B07547WSQQ,B07547WSQQ,AEFKF6R2GUSK2AWPSWRR4ZO36JVQ
1,3.0,"Not what I was expecting - Ok, but overpriced","First the disclaimer that I got this in the Vine program. That being said, I always try to be fair, honest and objective in my reviews. That being said, I am a little disappoin...",[],B004TPJKEY,B004TPJKEY,AFSKPY37N3C43SOI5IEXEK5JSIYA,1329832562000,0,False,"Not what I was expecting - Ok, but overpriced","First the disclaimer that I got this in the Vine program. That being said, I always try to be fair, honest and objective in my reviews. That being said, I am a little disappoin...",[],B004TPJKEY,B004TPJKEY,AFSKPY37N3C43SOI5IEXEK5JSIYA
2,5.0,Maple leaf attache,This is a beautiful item. I get lots of compliments. It's functional and everything I wanted.,[],B06XD94B2F,B06XD94B2F,AHPUT3ITXCHQJO7OMF74LEMYHIVA,1540745403758,0,True,Maple leaf attache,This is a beautiful item. I get lots of compliments. It's functional and everything I wanted.,[],B06XD94B2F,B06XD94B2F,AHPUT3ITXCHQJO7OMF74LEMYHIVA
3,5.0,What a beautiful TV. The price was fair,"What a beautiful TV. The price was fair. I original ordered a smaller model, but since it was backordered, the seller offered me this model at the same price as the less expe...",[],B0036WT3ZM,B0036WT3ZM,AHZ6XMOLEWA67S3TX7IWEXXGWSOA,1306350894000,0,True,What a beautiful TV. The price was fair,"What a beautiful TV. The price was fair. I original ordered a smaller model, but since it was backordered, the seller offered me this model at the same price as the less expens...",[],B0036WT3ZM,B0036WT3ZM,AHZ6XMOLEWA67S3TX7IWEXXGWSOA
4,5.0,travel cable organizer,"Husband travels 300 days a year. If you travel at all, you can relate to the shear volume of cables you have to carry with you. He LOVES this organizer because everything is ...",[],B01NCVI872,B09CD66HJM,AFZUK3MTBIBEDQOPAK3OATUOUKLA,1494374095000,20,True,travel cable organizer,"Husband travels 300 days a year. If you travel at all, you can relate to the shear volume of cables you have to carry with you. He LOVES this organizer because everything is co...",[],B01NCVI872,B09CD66HJM,AFZUK3MTBIBEDQOPAK3OATUOUKLA


In [None]:
text_columns_metadata = sampled_reviews.select_dtypes(include='object').columns.tolist()

In [None]:
for col in text_columns_metadata:
  sampled_reviews[f"{col}_clean"] = sampled_reviews[col].apply(clean_html_artifacts)

In [None]:
sampled_reviews.query('subtitle_clean.notna()')

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,...,price_clean,images_clean,videos_clean,store_clean,categories_clean,details_clean,parent_asin_clean,bought_together_clean,subtitle_clean,author_clean
0,Home Audio & Theater,A Pair Soft Earpads Replacement Ear Pads Cushion for Sony MDR-XB950BT/B Extra Bass Bluetooth Wireless Headphones,3.8,3,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/images/I/515bA--eq9L._AC_SL1024_.jpg', 'https://m.media-amazon.com/images/I/61WC7CxcJgL._AC_SL1024_.jpg', 'https://m.media-amazon....","{'title': [], 'url': [], 'user_id': []}",GZF,...,,"{'hi_res': array([None, 'https://m.media-amazon.com/images/I/515bA--eq9L._AC_SL1024_.jpg', 'https://m.media-amazon.com/images/I/61WC7CxcJgL._AC_SL1024_.jpg', 'https://m.media-a...","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",GZF,[],"{""Package Dimensions"": ""5 x 1.5 x 0.4 inches"", ""Item Weight"": ""1.15 pounds"", ""Manufacturer"": ""GZF"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""November 1...",B01LS1RRYQ,,,
1,All Electronics,"Replacement Remote Control for SANYO DP42848, DP37649, DP46848, DP26648, DP19648, GXBJ",5.0,2,"[Redi-Remotes cannot be programmed to control any auxillary devices. They will do every function for the specific unit they are designed to control., Remote measures 7 1/2"" x 2...",[This is a custom built replacement remote made by Redi Remote for the SANYO remote control number GXBJ. *This is NOT an original remote control. It is a custom replacement rem...,45.99,"{'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/41KTiyljVGL._AC_.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/41KTiyljVGL._AC_US40_.jpg'], 'variant':...","{'title': [], 'url': [], 'user_id': []}",REDI REMOTE,...,45.99,"{'hi_res': array([None], dtype=object), 'large': array(['https://m.media-amazon.com/images/I/41KTiyljVGL._AC_.jpg'], dtype=object), 'thumb': array(['https://m.media-amazon.com/...","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",REDI REMOTE,['Electronics' 'Television & Video' 'Accessories' 'Remote Controls'],"{""Product Dimensions"": ""7 x 2.5 x 1 inches"", ""Item Weight"": ""4.5 ounces"", ""Item model number"": ""RTGXBJ"", ""Batteries"": ""4 AAA batteries required. (included)"", ""Date First Availa...",B009D5M8FI,,,
2,Computers,"TRIPLTEK Tablet 7"" PRO 8GB RAM High Brightness 1200 nits, 4G LTE Unlocked, 8 Core Processor 128GB, Android 9, Long Battery Life 10000mah, Rugged Military Construction, Brightes...",4.3,148,"[8GB RAM 128GB ROM, 1200 Nits display, 4G LTE Unlocked, Android 9, Rugged Tablet great for drones]",[],,"{'hi_res': ['https://m.media-amazon.com/images/I/71hnnOHAITL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/71QKmBBXNOL._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': [], 'url': [], 'user_id': []}",TRIPLTEK,...,,"{'hi_res': array(['https://m.media-amazon.com/images/I/71hnnOHAITL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/71QKmBBXNOL._AC_SL1500_.jpg', 'https://m.media-amazon....","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",TRIPLTEK,['Electronics' 'Computers & Accessories' 'Computers & Tablets' 'Tablets'],"{""Standing screen display size"": ""7 Inches"", ""Screen Resolution"": ""1280 x 720 pixels"", ""Processor"": ""2 GHz cortex"", ""RAM"": ""8 GB"", ""Hard Drive"": ""128 GB"", ""Wireless Type"": ""5.8...",B08R871DBM,,,
3,All Electronics,3 DISC SET LASER DISC Deluxe CAV Letter-Box Edition- Twenty-Fifth Anniversary- 2001: A Space Odyssey,3.0,1,"[COLOR 3 DISC BOX SET- CHAPTER SEARCH, STEREO, CONTAINS ORIGINAL THEATRICAL TRAILER & 1 2001 FEATURETTE, RUNNING TIME 2 HRS 19 MINS., STANDARD PLAY]",[DELUXE CAV LETTER-BOX EDITION. TWENTY-FIFTH ANNIVERSARY M-G-M/UA HOME VIDEO CONTAINS ORIGINAL THEATRICAL TRAILER & 1 2001 FEATURETTE DIGITALLY MASTERED DIRECTLY FROM MGM ARCHI...,,"{'hi_res': ['https://m.media-amazon.com/images/I/91JVgCJakLL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/91OVwvU-UFL._AC_SL1500_.jpg'], 'large': ['https://m.media-am...","{'title': [], 'url': [], 'user_id': []}",3 DISC SET LASER DISC,...,,"{'hi_res': array(['https://m.media-amazon.com/images/I/91JVgCJakLL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/91OVwvU-UFL._AC_SL1500_.jpg'], dtype=object), 'large':...","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",3 DISC SET LASER DISC,[],"{""Product Dimensions"": ""0.01 x 0.01 x 0.01 inches"", ""Item Weight"": ""0.16 ounces"", ""Is Discontinued By Manufacturer"": ""No"", ""Date First Available"": ""November 29, 2007"", ""Manufac...",B00108E1GU,,,
4,Camera & Photo,Canon C250 AV Cable (RCA to Miniplug),3.0,9,[Connect mono audio and video to TV or VCR],"[From the Manufacturer, Connect mono audio and video to TV or VCR (with this mini plug to RCA cable).]",,"{'hi_res': [None, None, None], 'large': ['https://m.media-amazon.com/images/I/51DIRBl7CkL._AC_.jpg', 'https://m.media-amazon.com/images/I/51zD41lLJcL._AC_.jpg', 'https://m.medi...","{'title': [], 'url': [], 'user_id': []}",Canon,...,,"{'hi_res': array([None, None, None], dtype=object), 'large': array(['https://m.media-amazon.com/images/I/51DIRBl7CkL._AC_.jpg', 'https://m.media-amazon.com/images/I/51zD41lLJcL...","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",Canon,['Electronics' 'Television & Video' 'Accessories' 'Cables' 'RCA'],"{""Brand"": ""Canon"", ""Connector Type"": ""RCA"", ""Cable Type"": ""Composite"", ""Compatible Devices"": ""Television"", ""Color"": ""Black"", ""Connector Gender"": ""Male-to-Male"", ""Shape"": ""Round...",B00009V4H4,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15023,All Electronics,"[Apple MFi Certified] Lightning to HDMI Adapter,1080P Lightning to Digital AV Audio Adapter with Charging Port for iPhone,iPad,iPod, 4K HDMI Sync Screen Converter for HDTV/Moni...",3.7,83,[],[],,"{'hi_res': ['https://m.media-amazon.com/images/I/61kqIjRhizL._AC_SL1500_.jpg', None, None, 'https://m.media-amazon.com/images/I/71+lDpmLEqL._AC_SL1500_.jpg', 'https://m.media-a...","{'title': [], 'url': [], 'user_id': []}",esbeecables,...,,"{'hi_res': array(['https://m.media-amazon.com/images/I/61kqIjRhizL._AC_SL1500_.jpg', None, None, 'https://m.media-amazon.com/images/I/71+lDpmLEqL._AC_SL1500_.jpg', 'https://m.m...","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",esbeecables,['Electronics' 'Computers & Accessories' 'Computer Accessories & Peripherals' 'Computer Cable Adapters' 'DVI-HDMI Adapters'],"{""Package Dimensions"": ""6.42 x 4.25 x 0.35 inches"", ""Item Weight"": ""0.01 Kilograms"", ""Date First Available"": ""October 8, 2020"", ""Manufacturer"": ""esbeecables"", ""Compatible Devic...",B08KWBZ4YM,,,
15024,Tools & Home Improvement,"Foldable LED Desk Lamp, Dimmable Office Lamp with USB Charging Port, Portable Eye-caring Table lamp, 3 Brightness Levels, 360° Flexible Light Hangers, 5V Reading Lamps, Battery...",4.5,318,"[【Ultra Portable design】:ANTIEE use Ultra thin aluminum alloy material with the reinforced shaft, not easily deformed, four-section folding design,360° rotatable LED panel and ...",[],,"{'hi_res': ['https://m.media-amazon.com/images/I/51yb5gB+QYL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/61CZQPH7l5L._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': [], 'url': [], 'user_id': []}",ANTIEE,...,,"{'hi_res': array(['https://m.media-amazon.com/images/I/51yb5gB+QYL._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/61CZQPH7l5L._AC_SL1500_.jpg', 'https://m.media-amazon....","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",ANTIEE,['Electronics' 'Computers & Accessories' 'Computer Accessories & Peripherals' 'USB Gadgets' 'USB Lamps'],"{""Style"": ""Modern"", ""Brand"": ""ANTIEE"", ""Color"": ""Multicolor"", ""Product Dimensions"": ""8\""D x 3\""W x 1.6\""H"", ""Special Feature"": ""Adjustable"", ""Light Source Type"": ""LED"", ""Finish...",B07519W6CN,,,
15025,All Electronics,Fully Replacement X240 Laptop Battery Compatible with Lenovo ThinkPad X240 X250 X260 T440 T450 T450S K2450 T460P 45N1126 45N1127 45N1125 45N1128 45N1129 45N1737 45N1133-11.4V 24WH,4.2,14,"[Battery type: Li-ion ; Voltage: 11.4V ; Capacity: 24WH, Compatible with：Lenovo L450 L460 L470 T440s T440 T450 T450s T460 T460P T550, T560, P50S , W550s X240, X250, X260, X270,...",[Battery type: Li-ion Capacity : 24wh Voltage : 11.4V Compatible with ：L450 L460 L470 P50S W550S T440S T440 T450 T450s T460 T460P T550 T560 X240 X250 X260 X270 Compatible Part...,35.98,"{'hi_res': ['https://m.media-amazon.com/images/I/71MOSe5unhS._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/81qq1Cg27GS._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': ['GHU High Quality Battery that you can trust', '58WH X240 Battery for Lenovo T440 T450 T460 T550 X250 X260', 'T470 T480 T570 T580 A475 Lenovo external battery Replac...",Fully,...,35.98,"{'hi_res': array(['https://m.media-amazon.com/images/I/71MOSe5unhS._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/81qq1Cg27GS._AC_SL1500_.jpg', 'https://m.media-amazon....","{'title': array(['GHU High Quality Battery that you can trust', '58WH X240 Battery for Lenovo T440 T450 T460 T550 X250 X260', 'T470 T480 T570 T580 A475 Lenovo external battery ...",Fully,['Electronics' 'Computers & Accessories' 'Laptop Accessories' 'Batteries'],"{""Package Dimensions"": ""9.69 x 3.11 x 1.34 inches"", ""Item Weight"": ""5.9 ounces"", ""Item model number"": ""C4K9V"", ""Batteries"": ""3 Nonstandard Battery batteries required. (included...",B07QPMXW7D,,,
15026,Computers,DURAGADGET Soft Pink Neoprene Protective Case w/Dual Zips - Compatible with Sony Vaio VPCEH2P0E | C Series 15.5-inch & E Series Laptops,3.9,4,[],"[Introducing, DURAGADGET’s, new protective neoprene pouch for your expensive gadget, finished in fashionable pink. This stylish water resistant case is durable but remains incr...",,"{'hi_res': ['https://m.media-amazon.com/images/I/91GN-Guzb6L._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/81WSNAdO4OL._AC_SL1500_.jpg', 'https://m.media-amazon.com/im...","{'title': [], 'url': [], 'user_id': []}",DURAGADGET,...,,"{'hi_res': array(['https://m.media-amazon.com/images/I/91GN-Guzb6L._AC_SL1500_.jpg', 'https://m.media-amazon.com/images/I/81WSNAdO4OL._AC_SL1500_.jpg', 'https://m.media-amazon....","{'title': array([], dtype=object), 'url': array([], dtype=object), 'user_id': array([], dtype=object)}",DURAGADGET,"['Electronics' 'Computers & Accessories' 'Laptop Accessories' 'Bags, Cases & Sleeves' 'Sleeves']","{""Standing screen display size"": ""15.5 Inches"", ""Brand"": ""DURAGADGET"", ""Item model number"": ""291"", ""Item Weight"": ""7 ounces"", ""Package Dimensions"": ""16.1 x 12 x 0.7 inches"", ""C...",B005VNJPXE,,,


In [None]:
# #Download and load metadata parquet files manually
# def load_meta_data(sample_size=None):
#   '''
#     Load Amazon Electronics product metadata which contain product information independent
#     of individual users from Hugging Face.

#     Parameters:
#       sample_size:= Number of reviews to sample for development

#     Returns:
#       pandas.DataFrame: DataFrame containing product metadata or None if loading fails.
#   '''
#   print("Loading Amazon Beauty product metadata...")
#   try:
#       # load directly
#       dataset = load_dataset(
#           "parquet",
#           data_files={
#               "train": "hf://datasets/McAuley-Lab/Amazon-Reviews-2023/raw_meta_Electronics/*.parquet"
#           }
#       )

#       # Convert dataset to DataFrame for easier manipulation
#       df = dataset["train"].to_pandas()

#       # Sample for dev if there is a specified sample size
#       if sample_size and sample_size < len(df):
#           df = df.sample(n = sample_size, random_state = SEED)
#           print(f"Sampled {len(df)} reviews")

#       return df

#   except Exception as e:
#       print(f"Direct parquet loading failed: {e}")
#       return None


In [None]:
# #Download and load user review parquet files manually
# def load_reviews(sample_size=None):
#   '''
#     Load Amazon Electronic customer generated product feedback from Hugging Face.

#     Parameters:
#       sample_size:= Number of reviews to sample for development

#     Returns:
#       pandas.DataFrame: DataFrame containing user reviews or None if loading fails.
#   '''
#   print("Loading Amazon Electronic product reviews...")
#   try:
#       # load directly
#       dataset = load_dataset(
#             "McAuley-Lab/Amazon-Reviews-2023",
#             "raw_review_Electronics",
#             split="train",
#             trust_remote_code=True
#         )

#       # Convert dataset to DataFrame for easier manipulation
#       df = dataset["train"].to_pandas()

#       # Sample for dev if there is a specified sample size
#       if sample_size and sample_size < len(df):
#           df = df.sample(n = sample_size, random_state = SEED)
#           print(f"Sampled {len(df)} reviews")

#       return df

#   except Exception as e:
#       print(f"Direct parquet loading failed: {e}")
#       return None


In [None]:
# def _is_html_tag(text: str) -> bool:
#     """
#     Identify whether a string in angle brackets is an actual HTML tag or necessary product information.

#     Parameters:
#       text: Text to analyze

#     Returns:
#       bool: True if it's an HTML tag, False if it's product description
#     """
#     # Ignore non-HTML tag-like strings
#     if not text.startswith('<') or not text.endswith('>'):
#         return False

#     # Remove the angle brackets
#     inner_content = text[1:-1].strip()

#     # Check for closing tags
#     if inner_content.startswith('/'):
#         inner_content = inner_content[1:].strip()

#     # List of all known HTML tag names
#     html_tag_names = {
#         'a', 'b', 'br', 'div', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
#         'hr', 'i', 'img', 'li', 'ol', 'p', 'span', 'strong', 'sub', 'sup',
#         'table', 'td', 'th', 'tr', 'u', 'ul'
#     }

#     # Extract the tag name
#     tag_name = inner_content.split()[0].lower() if inner_content else ""

#     # Remove any attributes or special characters
#     tag_name = re.sub(r'[^a-zA-Z]', '', tag_name)

#     return tag_name in html_tag_names



### Bert Classification Implementation

### Installations

In [10]:
%%capture
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

### Enviornment Configurations

In [12]:
%env UNSLOTH_DISABLE_FAST_GENERATION = 1

max_seq_length = 2048
dtype = None
load_in_4bit = False

env: UNSLOTH_DISABLE_FAST_GENERATION=1


In [13]:
# Label configuration
id2label = {0: "very negative", 1: "negative", 2: "neutral", 3: "positive", 4: "very positive"}
label2id = {"very negative": 0, "negative": 1, "neutral": 2, "positive": 3, "very positive": 4}

In [14]:
# Label mapping for data preprocessing
label_mapping = {
    'very negative': 0,
    'negative': 1,
    'neutral': 2,
    'positive': 3,
    'very positive': 4
}

### Model Loading

In [15]:
# Download ModernBERT-large and add classification head with 5 output neurons

model, tokenizer = FastModel.from_pretrained(
    model_name = 'answerdotai/ModernBERT-large',    # Loading pretrained model
    auto_model = AutoModelForSequenceClassification, # Load as sequence classification model
    max_seq_length = max_seq_length,
    dtype = dtype,
    num_labels = 5,   # Number of rating classes
    full_finetuning = True,   # Update all model parameters
    id2label = id2label,    # Defined label mappings
    label2id = label2id,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.9.7: Fast Modernbert patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using bfloat16 full finetuning which cuts memory usage by 50%.


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train & Test Split of Datasets

In [17]:
balanced_e_sentiment_reviews_dataset = pd.read_parquet(balanced_e_sentiment_reviews)
print(f'Total Rows: {len(balanced_e_sentiment_reviews_dataset)}')

Total Rows: 1068800


In [18]:
balanced_e_sentiment_reviews_dataset['labels'] = balanced_e_sentiment_reviews_dataset['sentiment_label'].map(label_mapping)

In [19]:
print("Label distribution:")
print(balanced_e_sentiment_reviews_dataset['labels'].value_counts().sort_index())
print("\nSentiment label distribution:")
print(balanced_e_sentiment_reviews_dataset['sentiment_label'].value_counts())

Label distribution:
labels
0    213760
1    213760
2    213760
3    213760
4    213760
Name: count, dtype: int64

Sentiment label distribution:
sentiment_label
very positive    213760
very negative    213760
negative         213760
neutral          213760
positive         213760
Name: count, dtype: int64


In [24]:
balanced_e_sentiment_reviews_dataset[:5]

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment_label,labels
0,5.0,Maple leaf attache,This is a beautiful item. I get lots of compliments. It's functional and everything I wanted.,[],B06XD94B2F,B06XD94B2F,AHPUT3ITXCHQJO7OMF74LEMYHIVA,1540745403758,0,True,very positive,4
1,5.0,What a beautiful TV. The price was fair,"What a beautiful TV. The price was fair. I original ordered a smaller model, but since it was backordered, the seller offered me this model at the same price as the less expe...",[],B0036WT3ZM,B0036WT3ZM,AHZ6XMOLEWA67S3TX7IWEXXGWSOA,1306350894000,0,True,very positive,4
2,5.0,travel cable organizer,"Husband travels 300 days a year. If you travel at all, you can relate to the shear volume of cables you have to carry with you. He LOVES this organizer because everything is ...",[],B01NCVI872,B09CD66HJM,AFZUK3MTBIBEDQOPAK3OATUOUKLA,1494374095000,20,True,very positive,4
3,5.0,Great case,My daughter loves her case with the pen holder. It fits her tablet perfectly and she likes the fact that she can store her pen where she can find it,[],B07CXQD16L,B07CHC71M2,AGF42GID7QWDCNFTJRCTMKAITJJA,1578756182340,0,True,very positive,4
4,5.0,Camera's,I really like them. Glad i purchased them.,[],B07X27VK3D,B0BWD4WGJB,AFSGQCEINAGMWU3MAZZKWU2UGGKQ,1667066337092,0,True,very positive,4


In [82]:
sample_size = 60000
sampled_df = balanced_e_sentiment_reviews_dataset.groupby('sentiment_label').apply(
    lambda x: x.sample(n=min(len(x), sample_size // 5), random_state=SEED)
).reset_index(drop=True)

  sampled_df = balanced_e_sentiment_reviews_dataset.groupby('sentiment_label').apply(


In [28]:
print("Sampled data distribution:")
print(sampled_df['sentiment_label'].value_counts(normalize=True))

Sampled data distribution:
sentiment_label
negative         0.2
neutral          0.2
positive         0.2
very negative    0.2
very positive    0.2
Name: proportion, dtype: float64


In [29]:
# Create train & test splits
balanced_e_sentiment_reviews_train_df, balanced_e_sentiment_reviews_test_df = train_test_split(
    sampled_df,
    test_size=0.2,
    stratify=sampled_df['sentiment_label'],
    random_state=SEED
)

In [30]:
print(f"Train Rows: {len(balanced_e_sentiment_reviews_train_df)}")
print(f"Test Rows: {len(balanced_e_sentiment_reviews_test_df)}")

Train Rows: 48000
Test Rows: 12000


In [31]:
# Convert to HuggingFace datasets
balanced_e_sentiment_reviews_train_dataset = Dataset.from_pandas(balanced_e_sentiment_reviews_train_df[['text', 'labels']])
balanced_e_sentiment_reviews_test_dataset = Dataset.from_pandas(balanced_e_sentiment_reviews_test_df[['text', 'labels']])


In [32]:
def tokenize_function(examples):
  '''
  Convert text into numerical tokens that the modek can process
  '''
  return tokenizer(examples['text'])

In [33]:
# Tokenize train & test splits
balanced_e_sentiment_reviews_train_dataset = balanced_e_sentiment_reviews_train_dataset.map(tokenize_function, batched=True)
balanced_e_sentiment_reviews_test_dataset = balanced_e_sentiment_reviews_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [34]:
# Analyzes the distribution of labels in training data
labels = balanced_e_sentiment_reviews_train_dataset['labels']
# Get array of weights per class
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
print(f"Class weights: {class_weights}")

Class weights: [1. 1. 1. 1. 1.]


# Evaluation

In [35]:
def compute_metrics(eval_pred):
  '''
  Calculate accuracy of predictions

  Parameters:
    eval_pred: Tuple containing predictions and the true labels

  Returns:
    Dictionary containing accuracy
  '''
  logits, labels = eval_pred
  predictions = logits.argmax(axis=-1)
  return {"accuracy": accuracy_score(labels, predictions)}


# Model Training

In [47]:
trainer = Trainer(
    model=model,
    processing_class = tokenizer,
    eval_dataset = balanced_e_sentiment_reviews_test_dataset,
    train_dataset = balanced_e_sentiment_reviews_train_dataset,
    compute_metrics = compute_metrics,
    args = TrainingArguments(
        per_device_train_batch_size = 64,
        gradient_accumulation_steps = 1,
        warmup_steps = 750,
        num_train_epochs = 3,
        learning_rate = 3e-5,   # For optimal BERT training
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        eval_strategy="steps",
        eval_steps=0.05,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    )
)

In [48]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 48,000 | Num Epochs = 3 | Total steps = 2,250
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 1 x 1) = 64
 "-____-"     Trainable parameters = 395,836,421 of 395,836,421 (100.00% trained)


Step,Training Loss,Validation Loss,Accuracy
113,0.5954,0.882558,0.62
226,0.7765,0.897073,0.621167
339,0.7304,0.931275,0.618917
452,0.8702,0.891202,0.621667
565,0.9116,0.883451,0.616417
678,0.945,0.874983,0.62325
791,0.6976,0.891126,0.61925
904,0.5188,0.891884,0.621333
1017,0.7403,0.89293,0.627917
1130,0.8004,0.8841,0.62825


# Testing

In [49]:
test0 = "I wish I bought this phone sooner, it's so easy to use!"
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier(test0))

# Test sentiment and rating disagreements
test1 = "I hate the way the flattening iron smells but my hair genuinley looks perfect"
print(classifier(test1))

Device set to use cuda:0


[{'label': 'very positive', 'score': 0.9370497465133667}]
[{'label': 'positive', 'score': 0.44836586713790894}]


In [54]:
# Test clear examples of each class
extended_examples = [
    # Very Positive
    "I love this! Exceeded all my expectations!",
    "Fantastic quality and performance, highly recommend!",
    "Best thing I've bought this year, completely satisfied!",
    "Absolutely perfect, can't imagine life without it!",
    "This product blew me away, incredible experience!",
    "Top-notch quality, I’m extremely happy!",
    "Exceeded my expectations, amazing value for money!",
    "Absolutely delighted with this purchase!",
    "Perfect in every way, highly satisfied!",
    "Outstanding product, I would buy again!",
    "So happy with this, worth every penny!",
    "This is amazing, I can’t stop using it!",
    "Five stars all the way, absolutely love it!",
    "Exceptional product, highly recommend to everyone!",
    "I am thrilled with this, totally satisfied!",
    "Incredible experience, couldn’t be happier!",
    "This is exactly what I needed, fantastic!",
    "Highly impressed, would definitely recommend!",
    "Amazing quality and performance!",
    "This is a masterpiece, very happy with it!",

    # Positive
    "Pretty good, does the job well",
    "Satisfied with my purchase, works as advertised",
    "Good product overall, happy with it",
    "Nice quality, meets my expectations",
    "Works well, glad I bought it",
    "Happy with the product, it’s decent",
    "Good value for money, pleased with it",
    "Does what it says, pretty satisfied",
    "Enjoying this product, meets my needs",
    "Product works fine, nothing wrong with it",
    "Nice and reliable, happy with the purchase",
    "I like it, works as I expected",
    "Product is good quality, meets my expectations",
    "Works well, satisfied overall",
    "Pretty happy with it, would recommend",
    "Good experience, product does its job",
    "Works as intended, satisfied with it",
    "Product quality is decent, happy with purchase",
    "Solid performance, glad I bought it",
    "Pleasantly surprised, works well",

    # Neutral
    "It’s okay, nothing special",
    "Average product, meets basic needs",
    "Neither good nor bad, just fine",
    "It does the job, but nothing more",
    "Neutral experience, expected better",
    "Fine for casual use, not amazing",
    "It’s acceptable, nothing impressive",
    "Mediocre quality, but usable",
    "Does the job, no complaints, no praise",
    "Just an average product, nothing to write home about",
    "Meets minimum expectations, not outstanding",
    "It’s fine, gets the work done",
    "Product is adequate, nothing special",
    "Neutral feelings, okay performance",
    "Neither impressed nor disappointed",
    "It works, but nothing remarkable",
    "Average experience, neither happy nor unhappy",
    "Acceptable for the price, nothing extraordinary",
    "Works as expected, just ordinary",
    "Nothing notable, meets basic expectations",

    # Negative
    "Poor quality, wouldn’t recommend",
    "Disappointed, didn’t meet expectations",
    "Not worth the price, poor performance",
    "Subpar product, quite dissatisfied",
    "Doesn’t work as advertised, unhappy",
    "Quality is lacking, not happy with it",
    "Product underwhelming, would not buy again",
    "I expected better, disappointing",
    "Unsatisfactory experience, low quality",
    "Not impressed, poor performance",
    "Regret buying this, quality is bad",
    "Product failed to meet basic standards",
    "Below average, dissatisfied with it",
    "Not happy, doesn’t perform well",
    "Bad quality, not worth it",
    "Disappointing, expected more",
    "Mediocre at best, would not recommend",
    "Poor construction, low durability",
    "Unhappy with this purchase, quality lacking",
    "Substandard product, avoid if possible",

    # Very Negative
    "Terrible waste of money, complete garbage!",
    "Absolutely awful, do not buy!",
    "Horrible experience, extremely dissatisfied",
    "Worst purchase ever, totally regret it",
    "This product is a nightmare, complete junk",
    "Utterly useless, I want a refund",
    "Completely disappointed, worst quality",
    "Disaster of a product, do not recommend",
    "Absolute trash, very unhappy",
    "Do not buy this, completely broken",
    "Terrible quality, barely works at all",
    "Worst product I have ever purchased",
    "Avoid at all costs, extremely poor",
    "Extremely dissatisfied, total waste",
    "Horrendous, not functional at all",
    "Complete garbage, do not waste money",
    "Pathetic product, very frustrated",
    "Absolutely unacceptable, extremely poor",
    "This is the worst, do not buy it",
    "Completely terrible, total disappointment"
]


In [57]:
i = 1
for example in extended_examples:
    print(f'{i}. {classifier(example)}')
    i += 1

1. [{'label': 'very positive', 'score': 0.9328212738037109}]
2. [{'label': 'very positive', 'score': 0.9776745438575745}]
3. [{'label': 'very positive', 'score': 0.9879835844039917}]
4. [{'label': 'very positive', 'score': 0.9426705837249756}]
5. [{'label': 'very positive', 'score': 0.9746134281158447}]
6. [{'label': 'very positive', 'score': 0.9777745008468628}]
7. [{'label': 'very positive', 'score': 0.9451207518577576}]
8. [{'label': 'very positive', 'score': 0.9466958045959473}]
9. [{'label': 'very positive', 'score': 0.9613246321678162}]
10. [{'label': 'very positive', 'score': 0.9779735207557678}]
11. [{'label': 'very positive', 'score': 0.9475758075714111}]
12. [{'label': 'very positive', 'score': 0.9223365187644958}]
13. [{'label': 'very positive', 'score': 0.9953573346138}]
14. [{'label': 'very positive', 'score': 0.964774489402771}]
15. [{'label': 'very positive', 'score': 0.8917246460914612}]
16. [{'label': 'very positive', 'score': 0.9444452524185181}]
17. [{'label': 'very 

# Save Model

In [53]:
# Save necessary items to reload and use model
model.save_pretrained("model")
tokenizer.save_pretrained("model")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/tokenizer.json')

# Find Text-Sentiment-Rating Contradictions

# Model Preparation & Device Configuration

In [95]:
# Switch neural network to evaluation mode to prevent random neuron deactivation during inference
model.eval()

# Detect if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move trained modl weights to selected device
model.to(device)
print(f"Using device: {device}")

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 1024, padding_idx=50283)
      (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=1024, out_features=3072, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=1024, out_features=1024, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=1024, out_features=5248, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=2624, out_features=1024, bias=False)
        

# Batch Processing Configuration

In [119]:
texts = sampled_df['text'].tolist()
print(f"Processing {len(texts)} reviews in batches of {batch_size}")

Processing 60000 reviews


In [97]:
# Simultaneous batch processing of 64 reviews
batch_size = 64
# Store sentiment predictions
all_preds = []
# Store confidence levels for each prediction
all_confidence_scores = []

# Sentiment Analysis Processing

In [99]:
# Disable gradient calc for faster inference
with torch.no_grad():
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[i:i+batch_size]

        # Tokenize batch
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=2048,  # Match models max_seq_length
            return_tensors="pt"
        ).to(device)

        # Execute sentiment analysis on current batch
        outputs = model(**inputs)
        logits = outputs.logits

        # Get the predicted sentiment category
        batch_preds = torch.argmax(logits, dim=1)

        # Get confidence scores
        probabilities = torch.softmax(logits, dim=1)    # Convert raw score to probabilities
        confidence_scores = torch.max(probabilities, dim=1)[0]

        # Store results as numpy array
        all_pred.extend(batch_preds.cpu().numpy())
        all_confidence_scores.extend(confidence_scores.cpu().numpy())

Processing batches: 100%|██████████| 938/938 [04:24<00:00,  3.55it/s]


# Results

In [121]:
sampled_df['predicted_sentiment'] = [id2label[pred] for pred in all_preds]
sampled_df['prediction_confidence'] = all_confidence_scores

print(f"Sentiment distribution in customer reviews:")
print(sampled_df['predicted_sentiment'].value_counts())
print(f"\nAverage prediction confidence: {sampled_df['prediction_confidence'].mean():.3f}")

Sentiment distribution in customer reviews:
predicted_sentiment
very positive    12914
negative         12257
very negative    12107
neutral          11784
positive         10938
Name: count, dtype: int64

Average prediction confidence: 0.714


# Analyze Text-Sentiment-Rating Contradictions

In [122]:
def identify_disagreements(df):
  '''
  Identify cases where the customer ratings don't match the predicted sentiment.

  Parameters:
    df: Dataframe analyzed

  Returns:
    Dataframe containing cases where the customer ratings don't match the predicted sentiment.
  '''

  high_rating_negative_sentiment = df[
      (df['rating'] >= 4) & (df['predicted_sentiment'].isin(['negative', 'very negative']))
  ]

  low_rating_positive_sentiment = df[
      (df['rating'] <= 2) & (df['predicted_sentiment'].isin(['positive', 'very positive']))
  ]

  extreme_rating_neutral_sentiment = df[
      ((df['rating'] == 5) | (df['rating'] == 1)) & (df['predicted_sentiment'] == 'neutral')
  ]

  # Combine contradictory cases
  disagreements = pd.concat([
      high_rating_negative_sentiment,
      low_rating_positive_sentiment,
      extreme_rating_neutral_sentiment
  ])

  return disagreements, high_rating_negative_sentiment, low_rating_positive_sentiment, extreme_rating_neutral_sentiment

In [124]:
# Execute disagreement analysis
disagreements, high_rated_negative, low_rated_positive, extreme_rated_neutral = identify_disagreements(sampled_df)

### AI Generated Organized Business Summary

In [126]:
# Business Intelligence Report: Customer Sentiment Insights
print("\n" + "="*65)
print("CUSTOMER SENTIMENT ANALYSIS - BUSINESS INTELLIGENCE SUMMARY")
print("="*65)

# Dataset overview
print("\nDATASET OVERVIEW")
print(f"- Total customer reviews analyzed: {len(sampled_df):,}")
print(f"- Average prediction confidence: {sampled_df['prediction_confidence'].mean():.1%}")

# Sentiment breakdown
print("\nSENTIMENT DISTRIBUTION")
sentiment_counts = sampled_df['predicted_sentiment'].value_counts()
for sentiment, count in sentiment_counts.items():
    percentage = (count / len(sampled_df)) * 100
    print(f"- {sentiment.title():<12}: {count:,} reviews ({percentage:.1f}%)")

# Disagreement analysis
print("\nDISAGREEMENT ANALYSIS")
print(f"- Total disagreement cases found: {len(disagreements):,}")
print(f"- Disagreement rate: {(len(disagreements)/len(sampled_df)*100):.1f}%")

print("\nKEY BREAKDOWN OF DISAGREEMENTS")
print(f"- High-rated but negative sentiment: {len(high_rated_negative):,} cases")
print("  → May indicate hidden product or service issues")
print(f"- Low-rated but positive sentiment: {len(low_rated_positive):,} cases")
print("  → Suggests redeeming product qualities despite poor overall experience")
print(f"- Extreme-rated (1 or 5) but neutral sentiment: {len(extreme_rated_neutral):,} cases")
print("  → Points to nuanced customer experiences behind extreme ratings")

# Sample disagreement cases
if len(disagreements) > 0:
    print("\nSAMPLE DISAGREEMENT CASES")
    for idx, (i, row) in enumerate(disagreements.head(3).iterrows()):
        print(f"\n{idx+1}. Rating: {row['rating']} stars | Predicted: {row['predicted_sentiment']}")
        print(f"   Review excerpt: {row['text'][:100]}...")



CUSTOMER SENTIMENT ANALYSIS - BUSINESS INTELLIGENCE SUMMARY

DATASET OVERVIEW
- Total customer reviews analyzed: 60,000
- Average prediction confidence: 71.4%

SENTIMENT DISTRIBUTION
- Very Positive: 12,914 reviews (21.5%)
- Negative    : 12,257 reviews (20.4%)
- Very Negative: 12,107 reviews (20.2%)
- Neutral     : 11,784 reviews (19.6%)
- Positive    : 10,938 reviews (18.2%)

DISAGREEMENT ANALYSIS
- Total disagreement cases found: 1,215
- Disagreement rate: 2.0%

KEY BREAKDOWN OF DISAGREEMENTS
- High-rated but negative sentiment: 323 cases
  → May indicate hidden product or service issues
- Low-rated but positive sentiment: 259 cases
  → Suggests redeeming product qualities despite poor overall experience
- Extreme-rated (1 or 5) but neutral sentiment: 633 cases
  → Points to nuanced customer experiences behind extreme ratings

SAMPLE DISAGREEMENT CASES

1. Rating: 4.0 stars | Predicted: negative
   Review excerpt: They sound great but aren't particularly comfortable....

2. Rating:

# LLM Disagreement Accuracy Analysis

## Model Setup

In [131]:
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [226]:
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY_HERE"

In [176]:
def test_llm_sentiment(review_text, rating):
    """
    Use an OpenAI LLM to classify sentiment.
    Optionally provide the actual rating for context.
    """
    system_prompt = (
    f"""Analyze this Amazon review for sentiment-rating disagreement:

        Review: "{review_text}"
        Customer Rating: {rating}/5 stars

        Question: Does the sentiment in the review match the star rating?
        Answer with: FOUND_MATCH or FOUND_MISMATCH, then briefly explain why."""
    )

    user_prompt = f"Review: {review_text}"
    user_prompt += f"\nActual Rating: {rating}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )

    return response.choices[0].message.content.strip()

In [184]:
matched_sentiments = sampled_df.query('sentiment_label == predicted_sentiment')
matched_sentiments = matched_sentiments.sample(n=20, random_state=SEED)

In [186]:
disagreements_sample = disagreements.sample(n=20, random_state=SEED)

In [187]:
disagree_matches_mix_sentiments = pd.concat([disagreements_sample, matched_sentiments], ignore_index=True)

In [188]:
disagree_matches_mix_sentiments.shape

(40, 14)

In [189]:
def compare_bert_vs_llm(disagreement_sample, n_cases=20):
    """
    Compare BERT and LLM approaches on the same disagreement cases

    Parameters:
      disagreement_sample: Dataframe containing disagreement cases
      n_cases: Number of cases to compare

    Returns:
      Dataframe containing comparison results
    """
    comparison_results = []

    # Test on certain # of disagreement samples
    sample_cases = disagreement_sample.head(n_cases)

    for idx, row in sample_cases.iterrows():
        review_text = row['text']
        actual_rating = row['rating']
        bert_prediction = row['predicted_sentiment']
        bert_confidence = row['prediction_confidence']

        # Get LLM analysis of rating-text sentiment discrepancies
        try:
            llm_result = test_llm_sentiment(review_text, actual_rating)
            llm_status = "Success"
        except Exception as e:
            llm_result = f"Error: {str(e)}"
            llm_status = "Failed"

        # Store comparison
        comparison_results.append({
            'review_text': review_text[:100] + "...",
            'actual_rating': actual_rating,
            'bert_sentiment': bert_prediction,
            'bert_confidence': round(bert_confidence, 3),
            'bert_found_disagreement': True,    # All are disagreement cases
            'llm_analysis': llm_result,
            'llm_status': llm_status
        })

        # Print results
        print(f"\n--- Case {len(comparison_results)} ---")
        print(f"Review: {review_text[:100]}...")
        print(f"Actual Rating: {actual_rating}/5 stars")
        print(f"BERT: {bert_prediction} (confidence: {bert_confidence:.3f})")
        print(f"LLM: {llm_result}")
        print(f"Status: {llm_status}")

    return pd.DataFrame(comparison_results)

# Run the comparison
comparison_df = compare_bert_vs_llm(disagree_matches_mix_sentiments, n_cases=40)


--- Case 1 ---
Review: Update -- This hard drive recently crashed on me and I have no way  of recovering any of my files......
Actual Rating: 1.0/5 stars
BERT: neutral (confidence: 0.489)
LLM: FOUND_MISMATCH. The review contains both positive and negative sentiments. While the reviewer expresses satisfaction with the hard drive's performance and transfer speeds, they also highlight significant issues, particularly the crash and the ineffective auto back-up feature. The overall negative experience, especially the data loss, aligns with the low star rating of 1.0. However, the positive comments create a contradiction with the harsh rating, indicating a mismatch in sentiment and rating.
Status: Success

--- Case 2 ---
Review: Like other aftermarket cords, two of the five in the pack don’t work!...
Actual Rating: 1.0/5 stars
BERT: neutral (confidence: 0.598)
LLM: FOUND_MATCH: The sentiment in the review matches the star rating. The reviewer expresses dissatisfaction by stating that two ou

In [190]:
def analyze_comparison_results(comparison_df):
    """
    Analyze how BERT and LLM agree/disagree on cases

    Parameters:
      comparison_df: Dataframe containing Bert vs. LLM comparison results

    Returns:
      Relationship between BERT and LLM models sentiment analysis
    """

    print("="*60)
    print("BERT vs LLM COMPARISON ANALYSIS")
    print("="*60)

    # Count successful LLM responses
    successful_llm = comparison_df[comparison_df['llm_status'] == 'Success']
    print(f"\nSuccessful LLM responses: {len(successful_llm)}/{len(comparison_df)}")

    # Analyze LLM responses
    llm_matches = successful_llm[successful_llm['llm_analysis'].str.contains('FOUND_MATCH', case=False, na=False)]
    llm_mismatches = successful_llm[successful_llm['llm_analysis'].str.contains('FOUND_MISMATCH', case=False, na=False)]

    print(f"LLM found MATCH: {len(llm_matches)} cases")
    print(f"LLM found MISMATCH: {len(llm_mismatches)} cases")

    # Since these are all BERT-identified disagreement cases:
    print(f"\nBERT identified ALL {len(comparison_df)} cases as disagreements")
    print(f"LLM agreed on {len(llm_mismatches)} disagreements")
    print(f"LLM disagreed on {len(llm_matches)} cases (found them as matches)")

    if len(successful_llm) > 0:
        agreement_rate = len(llm_mismatches) / len(successful_llm) * 100
        print(f"\nBERT-LLM Agreement Rate: {agreement_rate:.1f}%")

    return {
        'total_cases': len(comparison_df),
        'successful_llm': len(successful_llm),
        'llm_matches': len(llm_matches),
        'llm_mismatches': len(llm_mismatches),
        'agreement_rate': agreement_rate if len(successful_llm) > 0 else 0
    }

# Run analysis
analysis_results = analyze_comparison_results(comparison_df)

BERT vs LLM COMPARISON ANALYSIS

Successful LLM responses: 40/40
LLM found MATCH: 16 cases
LLM found MISMATCH: 24 cases

BERT identified ALL 40 cases as disagreements
LLM agreed on 24 disagreements
LLM disagreed on 16 cases (found them as matches)

BERT-LLM Agreement Rate: 60.0%


In [201]:
negative_reviews = sampled_df[
    ((sampled_df['predicted_sentiment'] == 'negative') & (sampled_df['sentiment_label'] == 'negative')) |
    ((sampled_df['predicted_sentiment'] == 'very negative') & (sampled_df['sentiment_label'] == 'very negative'))
]

In [206]:
disagreements_negative_reviews = pd.concat([disagreements, negative_reviews], ignore_index=True)
disagreements_negative_reviews = disagreements_negative_reviews.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [210]:
# Simple manuel insight extraction
disagreement_keywords = {
    'shipping': ['shipping', 'delivery', 'arrived', 'package', 'box'],
    'quality': ['quality', 'build', 'cheap', 'flimsy', 'durable'],
    'battery': ['battery', 'charge', 'power', 'drain'],
    'packaging': ['packaging', 'damaged', 'broken in box']
}

# Count mentions in disagreement and negative review cases
for category, keywords in disagreement_keywords.items():
    count = sum(1 for text in disagreements_negative_reviews['text']
                if any(word in text.lower() for word in keywords))
    print(f"{category}: {count} mentions in disagreements")

shipping: 1338 mentions in disagreements
quality: 2808 mentions in disagreements
battery: 2794 mentions in disagreements
packaging: 249 mentions in disagreements


In [211]:
# Simple manuel insight extraction
disagreement_keywords = {
    'shipping': ['shipping', 'delivery', 'arrived', 'package', 'box'],
    'quality': ['quality', 'build', 'cheap', 'flimsy', 'durable'],
    'battery': ['battery', 'charge', 'power', 'drain'],
    'packaging': ['packaging', 'damaged', 'broken in box']
}

# Count mentions in negative review cases
for category, keywords in disagreement_keywords.items():
    count = sum(1 for text in negative_reviews['text']
                if any(word in text.lower() for word in keywords))
    print(f"{category}: {count} mentions in disagreements")

shipping: 1251 mentions in disagreements
quality: 2682 mentions in disagreements
battery: 2662 mentions in disagreements
packaging: 241 mentions in disagreements


# Export Datasets for Feature Extraction

In [212]:
# Prepare datasets for feature extraction work
analysis_datasets = {
    'disagreements': disagreements,
    'very_positive': sampled_df[sampled_df['predicted_sentiment'] == 'very positive'],
    'very_negative': sampled_df[sampled_df['predicted_sentiment'] == 'very negative'],
    'neutral': sampled_df[sampled_df['predicted_sentiment'] == 'neutral'],
    'positive': sampled_df[sampled_df['predicted_sentiment'] == 'positive'],
    'negative': sampled_df[sampled_df['predicted_sentiment'] == 'negative']
}

# Save datasets for feature extraction team
print("Exporting datasets for feature extraction analysis...")
for dataset_name, dataset in analysis_datasets.items():
    filename = f"{dataset_name}_reviews.parquet"
    dataset.to_parquet(filename, index=False)
    print(f"- Exported {len(dataset):,} {dataset_name} reviews to {filename}")

Exporting datasets for feature extraction analysis...
- Exported 1,215 disagreements reviews to disagreements_reviews.parquet
- Exported 12,914 very_positive reviews to very_positive_reviews.parquet
- Exported 12,107 very_negative reviews to very_negative_reviews.parquet
- Exported 11,784 neutral reviews to neutral_reviews.parquet
- Exported 10,938 positive reviews to positive_reviews.parquet
- Exported 12,257 negative reviews to negative_reviews.parquet

Handoff complete! Feature extraction team can now analyze:
- Which product features are mentioned in disagreement cases
- What drives very positive customer experiences
- What causes very negative customer experiences
- Neutral review patterns for improvement opportunities
