# Exploratory Data Analysis (EDA) for the Amazon Customer Reviews Dataset

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from datasets import load_dataset, get_dataset_config_names

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed unchanged to every generator (default 42).
    """
    # Same four seeding calls, in the same order, expressed as a loop.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Fix all random number generators before anything stochastic runs.
set_seed(42)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


## Data Loading (Amazon Reviews Dataset 2023)
The dataset (`McAuley-Lab/Amazon-Reviews-2023`) is loaded from the Hugging Face Hub using the `datasets` library.

In [55]:
# Select 3 categories to highlight cross-domain transfer
# Each name is interpolated into the dataset config name "raw_review_<domain>".
TARGET_DOMAINS = ["Books", "Movies_and_TV", "Video_Games"]
# Minimum number of interactions a user / an item must have to survive
# filter_interactions.
MIN_USER_INTERACTIONS = 5
MIN_ITEM_INTERACTIONS = 5
POSITIVE_THRESHOLD = 4.0  # Ratings >= 4.0 are considered positive

# Load the dataset
def load_amazon_reviews(domain: str, max_per_domain: "int | None" = 100000,
                        streaming: bool = False) -> pd.DataFrame:
    """Load the first reviews of one Amazon Reviews 2023 category into a DataFrame.

    Args:
        domain: Category name; interpolated into the config name
            ``raw_review_<domain>``.
        max_per_domain: Maximum number of reviews to read from the start of
            the ``full`` split. ``None`` reads the whole split; zero or a
            negative value yields an empty frame.
        streaming: Forwarded to ``load_dataset``. When ``True`` the split is
            iterated lazily instead of being fully prepared before the first
            row is read, which helps when only a small prefix is needed.
            Defaults to ``False`` (the previous behaviour).

    Returns:
        A DataFrame with the columns ``user``, ``item``, ``rating``, ``title``,
        ``text``, ``domain`` and ``timestamp`` (always present, even when no
        rows were read).
    """
    # Fixed schema so an empty result still exposes every expected column and
    # can be concatenated / indexed like a populated one.
    columns = ["user", "item", "rating", "title", "text", "domain", "timestamp"]

    # NOTE(security): trust_remote_code=True allows the dataset repository's
    # loading script to execute locally; only use it with a trusted repository.
    dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                           f"raw_review_{domain}",
                           trust_remote_code=True,
                           streaming=streaming)
    rows = []
    for i, r in enumerate(dataset["full"]):
        # None means "no cap"; otherwise stop once the cap is reached.
        if max_per_domain is not None and i >= max_per_domain:
            break
        rows.append({
            "user": r["user_id"],
            "item": r["parent_asin"],
            "rating": float(r["rating"]),
            "title": r.get("title", ""),
            "text": r.get("text", ""),
            "domain": domain,
            "timestamp": int(r["timestamp"])
        })
    return pd.DataFrame(rows, columns=columns)

# One frame per target domain, each capped at 100000 reviews.
dfs = [
    load_amazon_reviews(domain_name, max_per_domain=100000)
    for domain_name in TARGET_DOMAINS
]
# Stack the per-domain frames into one table with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)



KeyboardInterrupt: 

## Filtering dataset
* Label ratings: ratings >= 4.0 are labeled positive (`label = 1`) and lower ratings negative (`label = 0`); no rows are removed at this step.
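* Remove exact duplicates: Drop rows that are identical in every column. Note that a user who reviewed the same item more than once with different text or timestamps still contributes several rows.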
* Reduce item sparsity: Ensure that every remaining item has at least a certain number of interactions.
* Reduce user sparsity: Ensure that every user has at least a certain number of interactions.

In [51]:
# Label ratings as positive / negative implicit feedback. Despite the function's
# name, no rows are dropped here: low ratings are kept with label 0.
def remove_low_ratings(dataframe, min_rating=4.0, keep_rating=True):
    """Return a copy of ``dataframe`` with a binary ``label`` column added.

    The input frame is left untouched, so re-running the cell or reusing the
    original frame elsewhere does not see a surprise ``label`` column.

    Args:
        dataframe: Frame containing a numeric ``rating`` column.
        min_rating: Ratings greater than or equal to this value get label 1,
            all others label 0.
        keep_rating: When ``False`` the ``rating`` column is dropped from the
            returned frame.

    Returns:
        A new DataFrame with the ``label`` column (and without ``rating`` when
        ``keep_rating`` is ``False``).
    """
    # .assign builds a new frame instead of writing into the caller's object.
    labeled = dataframe.assign(label=(dataframe["rating"] >= min_rating).astype(int))
    if not keep_rating:
        labeled = labeled.drop(columns=["rating"])
    return labeled

# Add the binary `label` column; the original `rating` column is kept alongside it.
df = remove_low_ratings(df, POSITIVE_THRESHOLD, keep_rating=True)
# Show five random rows as a quick sanity check of the new column.
df.sample(5)

Unnamed: 0,user,item,rating,title,text,domain,timestamp,label
139464,AH4BUMIEYHXETPZQADPZTHRZUQYQ,B001TM5B16,3.0,Fun to see again,This film has held up through the years. Fun ...,Movies_and_TV,1470787719000,0
294453,AEYSXJ5XMB2M3FONLYZUEKHVGNIA,B0055Q175G,5.0,great game,Good game,Video_Games,1483750969000,1
290648,AE4NXURNTQQY3CJWRTJQMBJR7HLA,B07ZPF625Q,5.0,Dynamic storyline.,"Bought as a gift.<br />I own a copy, and was i...",Video_Games,1644095084942,1
190005,AG7REHZEXAXTTYKS2WHSDQC2YSDQ,B001A4VH2U,4.0,House,Love the series now I can see them all in orde...,Movies_and_TV,1366576399000,1
252001,AGXCUHMHZTQ62JBWQFHCF4JBV7JQ,B000N5Z2L4,5.0,The best price for Xbox Live Gold you'll find,"A must have for any gamer. For some reason, th...",Video_Games,1385043952000,1


In [52]:
# Reduce sparsity: repeatedly keep only the items that have at least a minimum number
# of interactions and the users that have at least a minimum number of interactions
def filter_interactions(dataframe, min_user_interactions, min_item_interactions,
                        dedup_subset=None):
    """Drop duplicate rows, then repeatedly prune sparse users and items.

    After de-duplication, rows are removed in rounds: each round recounts the
    rows per user and per item and keeps only rows whose user AND item both
    meet their threshold. Rounds repeat until a round removes nothing, because
    dropping an item can push a user below the threshold and vice versa.

    Args:
        dataframe: Frame with at least ``user`` and ``item`` columns.
        min_user_interactions: Minimum number of rows a user must have.
        min_item_interactions: Minimum number of rows an item must have.
        dedup_subset: Columns that define a duplicate, passed to
            ``DataFrame.drop_duplicates(subset=...)``. The default ``None``
            only drops rows identical in every column (the previous
            behaviour). Pass ``["user", "item"]`` to keep a single row per
            user-item pair so repeated reviews of one item are not counted as
            separate interactions.

    Returns:
        The filtered DataFrame; the input frame is not modified.
    """
    # Removing duplicates first (exact duplicates unless dedup_subset is given)
    print(f"Dataframe before removing exact duplicates: {len(dataframe)}")
    dataframe = dataframe.drop_duplicates(subset=dedup_subset)
    print(f"Dataframe before interactions filtering: {len(dataframe)}")

    while True:
        before = len(dataframe)
        # Count interactions per user and item on the current, already-pruned frame
        user_counts = dataframe.groupby("user")["item"].count()
        item_counts = dataframe.groupby("item")["user"].count()
        # Users and items that still meet their thresholds
        users_to_keep = user_counts.index[user_counts >= min_user_interactions]
        items_to_keep = item_counts.index[item_counts >= min_item_interactions]
        # A row survives only if both its user and its item qualify
        keep_mask = dataframe["user"].isin(users_to_keep) & dataframe["item"].isin(items_to_keep)
        dataframe = dataframe[keep_mask]
        after = len(dataframe)
        # Fixed point reached: this round removed nothing
        if after == before:
            break
    print(f"Dataframe after interactions filtering: {after}")
    return dataframe

# Apply the user / item interaction thresholds defined alongside TARGET_DOMAINS.
df = filter_interactions(df, MIN_USER_INTERACTIONS, MIN_ITEM_INTERACTIONS)

Dataframe before removing exact duplicates: 300000
Dataframe before interactions filtering: 298841
Dataframe after interactions filtering: 16514


## Exploratory Data Analysis (EDA)
* Basic statistics
* Rating distribution
* User and item statistics
* Cross-domain user overlap
* Sparsity analysis

### Basic statistics

In [53]:
# Overall size of the filtered interaction table.
print("Total rows:", len(df))
# Row count per domain.
# NOTE(review): the recorded output below lists only Video_Games and Movies_and_TV,
# i.e. no Books rows survived filtering in that run — confirm this is expected
# before drawing cross-domain conclusions.
print("Domains:", df["domain"].value_counts().to_dict())

Total rows: 16514
Domains: {'Video_Games': 12291, 'Movies_and_TV': 4223}
