# Exploratory Data Analysis (EDA) for the Amazon Customer Reviews Dataset

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from datasets import load_dataset, get_dataset_config_names

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed, in order, to Python's ``random`` module,
            NumPy's global RNG, PyTorch's default generator and all CUDA
            device generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Fix all RNG seeds before any stochastic step runs.
set_seed(42)

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


## Data Loading (Amazon Reviews Dataset 2023)
The dataset (`McAuley-Lab/Amazon-Reviews-2023`) is loaded from the Hugging Face Hub using the `datasets` library.

In [43]:
# Three product categories, chosen to highlight cross-domain transfer
TARGET_DOMAINS = [
    "Digital_Music",
    "Movies_and_TV",
    "Video_Games",
]

# Minimum number of interactions a user / an item needs to survive filtering
MIN_USER_INTERACTIONS = 5
MIN_ITEM_INTERACTIONS = 5

# Ratings at or above this value (>= 4.0) are considered positive
POSITIVE_THRESHOLD = 4.0

# Load one review category from the Hugging Face hub into a DataFrame
def load_amazon_reviews(domains, max_per_domain=100000):
    """Read the first reviews of one Amazon category into a DataFrame.

    Args:
        domains: A single category name (despite the plural parameter name);
            it is interpolated into the dataset config ``raw_review_<domains>``.
        max_per_domain: Upper bound on the number of records taken from the
            start of the ``"full"`` split.

    Returns:
        pandas.DataFrame with the columns ``user``, ``item``, ``rating``,
        ``title``, ``text`` and ``timestamp``, one row per review read.
    """
    config_name = f"raw_review_{domains}"
    reviews = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        config_name,
        trust_remote_code=True,
    )["full"]

    records = []
    for position, review in enumerate(reviews):
        # Stop as soon as the per-domain cap is reached.
        if position >= max_per_domain:
            break
        record = {
            "user": review["user_id"],
            "item": review["parent_asin"],
            "rating": float(review["rating"]),
            # title/text fall back to an empty string when the key is absent
            "title": review.get("title", ""),
            "text": review.get("text", ""),
            "timestamp": int(review["timestamp"]),
        }
        records.append(record)

    # Build the frame once from the collected records.
    return pd.DataFrame(records)

# Load each target domain separately, then stack the frames under a fresh 0..n-1 index.
dfs = [
    load_amazon_reviews(domain_name, max_per_domain=100000)
    for domain_name in TARGET_DOMAINS
]
df = pd.concat(dfs, axis=0, ignore_index=True)

## Filtering dataset
* Label ratings: ratings >= 4.0 are marked as positive (`label = 1`) and lower ratings as negative (`label = 0`). No rows are dropped at this step.
* Remove exact duplicates: drop rows that are identical in every column, so a repeated review is counted only once.
* Reduce item sparsity: ensure that every remaining item has at least a minimum number of interactions.
* Reduce user sparsity: ensure that every remaining user has at least a minimum number of interactions.

In [47]:
# Turn explicit ratings into an implicit-feedback label for positive recommendations.
# Despite the name, no rows are removed: every review is kept and tagged with a 0/1 label.
def remove_low_ratings(dataframe, min_rating=4.0, keep_rating=True):
    """Return a new DataFrame with a binary ``label`` column derived from ``rating``.

    Args:
        dataframe: DataFrame containing a numeric ``rating`` column. It is
            not modified.
        min_rating: Threshold; rows with ``rating >= min_rating`` get
            ``label = 1``, all other rows get ``label = 0``.
        keep_rating: When False, the ``rating`` column is dropped from the
            returned frame.

    Returns:
        A new DataFrame with the same rows as the input plus the ``label``
        column (and without ``rating`` if ``keep_rating`` is False).
    """
    # .assign builds a new frame, so the caller's DataFrame is never mutated
    # (the previous in-place column write also altered the input object).
    labelled = dataframe.assign(
        label=(dataframe["rating"] >= min_rating).astype(int)
    )
    if not keep_rating:
        labelled = labelled.drop(columns=["rating"])
    return labelled

# Attach the binary label while keeping the original rating column, then show five random rows.
df = remove_low_ratings(df, min_rating=POSITIVE_THRESHOLD, keep_rating=True)
df.sample(n=5)

Unnamed: 0,user,item,rating,title,text,timestamp,label
266815,AH4Z3HHFZJVIZPR23TP4A3MUYZNQ,B0006B0O9U,5.0,Been looking for this forever,Been looking for this game for awhile and it's...,1412785356000,1
245611,AG6URUZFUDVFKLM7MXR3GE7HUEOA,B0030MEITE,4.0,Four Stars,not bad,1407196253000,1
217326,AFFQ4CDRBM54WVYD2XLV5YVT2MGA,B007X5103Q,5.0,Excellent!,Great game if you like the walking dead series...,1366164990000,1
233799,AFHANVG64R3V6BNS33WJYEBJYXQA,B001ELJE5G,5.0,Hours of Fun,"I watch my son play this game. If he could, I...",1262191867000,1
225458,AGD2KE77JSUWQKD5CGYVGCQYJPHQ,B00BGA9WK2,5.0,Love the PS4,I've wanted a PS4 since they came out. I was ...,1421001873000,1


In [48]:
# Reduce sparsity: keep only users and items that each have at least a minimum
# number of interactions, repeating the filter until the frame stops shrinking
def filter_interactions(dataframe, min_user_interactions, min_item_interactions):
    """Drop duplicate rows, then iteratively prune sparse users and items.

    Args:
        dataframe: DataFrame with at least ``user`` and ``item`` columns.
        min_user_interactions: Minimum number of rows a user must have to be kept.
        min_item_interactions: Minimum number of rows an item must have to be kept.

    Returns:
        The filtered DataFrame. Row counts before and after each stage are
        printed along the way.
    """
    # Rows identical in every column are collapsed to one before counting
    print(f"Dataframe before removing exact duplicates: {len(dataframe)}")
    dataframe = dataframe.drop_duplicates()
    print(f"Dataframe before interactions filtering: {len(dataframe)}")

    # Removing users can push items below their threshold (and vice versa),
    # so keep filtering until a pass leaves the row count unchanged.
    previous_size = None
    while previous_size != len(dataframe):
        previous_size = len(dataframe)

        # Interactions per user and per item in the current frame
        per_user = dataframe.groupby("user")["item"].count()
        per_item = dataframe.groupby("item")["user"].count()

        # Users and items that meet their minimum counts
        active_users = set(per_user.loc[per_user >= min_user_interactions].index)
        popular_items = set(per_item.loc[per_item >= min_item_interactions].index)

        # Keep only rows whose user AND item both qualify
        user_ok = dataframe["user"].isin(active_users)
        item_ok = dataframe["item"].isin(popular_items)
        dataframe = dataframe[user_ok & item_ok]

    print(f"Dataframe after interactions filtering: {len(dataframe)}")
    return dataframe

# Apply the minimum-interaction thresholds from the configuration cell.
df = filter_interactions(
    df,
    min_user_interactions=MIN_USER_INTERACTIONS,
    min_item_interactions=MIN_ITEM_INTERACTIONS,
)

Dataframe before removing exact duplicates: 16514
Dataframe before interactions filtering: 16514
Dataframe after interactions filtering: 16514


Unnamed: 0,user,item,rating,title,text,timestamp
55560,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
55561,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
55562,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
55563,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
55564,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
55565,AEJZE4VZLLYKJQD2EYTUHN76ITZQ,B000MNP2XU,4.0,I have previous cd and like his music so I am ...,I have previous cd and like his music so I am ...,1408310605000
82158,AHPHH5CS7IJBCH2XI7FZGWHJDCIQ,B07ZVNNZPT,5.0,Late XMas Gift To Myself,Arrived on time. Love it!,1581912780074
82159,AHPHH5CS7IJBCH2XI7FZGWHJDCIQ,B015G0BLBA,3.0,I love the cover but it's not the Japanese ver...,I love the cover but it's not the Japanese ver...,1503969794679
82160,AHPHH5CS7IJBCH2XI7FZGWHJDCIQ,B07ZVNNZPT,5.0,Late XMas Gift To Myself,Arrived on time. Love it!,1581912780074
82161,AHPHH5CS7IJBCH2XI7FZGWHJDCIQ,B015G0BLBA,3.0,I love the cover but it's not the Japanese ver...,I love the cover but it's not the Japanese ver...,1503969794679
