In [1]:
from datasets import load_dataset, Dataset
from pathlib import Path
import polars as pl
from bigdata_a3_utils import VALID_CATEGORIES

In [None]:
# Rebinds the VALID_CATEGORIES name imported from bigdata_a3_utils above;
# the download and cleaning loops below iterate over exactly these entries.
VALID_CATEGORIES: list[str] = [
    "Gift_Cards",
    "Subscription_Boxes",
    "Magazine_Subscriptions",
]

In [3]:
# Directories created by create_dir(): raw review downloads, raw metadata
# downloads, and the cleaned intermediate output.
PATHS: list[str] = [
    "data/raw/review",
    "data/raw/meta",
    "data/intermediate",
]

In [4]:
def create_dir() -> None:
    """Create every directory listed in ``PATHS``.

    Missing parent directories are created too, and no error is raised for
    directories that already exist, so the call can be repeated safely.
    """
    for directory in map(Path, PATHS):
        directory.mkdir(parents=True, exist_ok=True)

In [5]:
def load_metadata(category: str) -> Dataset:
    """Load the ``full`` split of the ``raw_meta_<category>`` configuration.

    Args:
        category: Category name appended to ``raw_meta_`` to form the
            dataset configuration name.

    Returns:
        The object returned by ``load_dataset`` for the requested split of
        the ``McAuley-Lab/Amazon-Reviews-2023`` repository.
    """
    config_name = f"raw_meta_{category}"
    # NOTE(review): trust_remote_code=True allows the repository's own
    # loading script to run locally -- confirm this is acceptable.
    metadata = load_dataset(
        path="McAuley-Lab/Amazon-Reviews-2023",
        name=config_name,
        split="full",
        trust_remote_code=True,
    )
    return metadata

In [6]:
def load_reviews(category: str) -> Dataset:
    """Load the ``full`` split of the ``raw_review_<category>`` configuration.

    Args:
        category: Category name appended to ``raw_review_`` to form the
            dataset configuration name.

    Returns:
        The object returned by ``load_dataset`` for the requested split of
        the ``McAuley-Lab/Amazon-Reviews-2023`` repository.
    """
    config_name = f"raw_review_{category}"
    # NOTE(review): trust_remote_code=True allows the repository's own
    # loading script to run locally -- confirm this is acceptable.
    reviews = load_dataset(
        path="McAuley-Lab/Amazon-Reviews-2023",
        name=config_name,
        split="full",
        trust_remote_code=True,
    )
    return reviews

In [7]:
def download_datasets() -> None:
    """Fetch metadata and reviews for each category and save them as Parquet.

    For every entry in ``VALID_CATEGORIES`` the metadata is written to
    ``data/raw/meta/<category>.parquet`` and the reviews to
    ``data/raw/review/<category>.parquet``. Run ``create_dir()`` first so
    the target directories exist.
    """
    for category in VALID_CATEGORIES:
        print(f"Loading {category}...\n")

        meta_target = f"data/raw/meta/{category}.parquet"
        review_target = f"data/raw/review/{category}.parquet"

        # Metadata is loaded and written before the reviews are fetched.
        metadata = load_metadata(category)
        metadata.to_parquet(meta_target)

        reviews = load_reviews(category)
        reviews.to_parquet(review_target)

In [8]:
def extract_brand() -> pl.Expr:
    """Build the expression that derives the ``brand`` column.

    Candidates are tried in order and the first non-null one wins:

    1. ``store``, when it is not blank after stripping (the value itself is
       kept unstripped).
    2. The first capture group of the ``Brand`` pattern applied to
       ``details`` -- a single token containing no whitespace, comma or
       semicolon.
    3. The literal ``"Unknown"``.

    Returns:
        A polars expression aliased to ``brand``.
    """
    store = pl.col("store")
    store_is_filled = store.str.strip_chars() != ""
    # No ``otherwise`` branch: rows failing the condition yield null and
    # fall through to the next candidate in the coalesce.
    from_store: pl.Expr = pl.when(store_is_filled).then(store)

    from_details: pl.Expr = pl.col("details").str.extract(
        r"Brand[:\s]*([^\s,;]+)", 1
    )
    fallback: pl.Expr = pl.lit("Unknown")

    candidates = [from_store, from_details, fallback]
    return pl.coalesce(candidates).alias("brand")

In [9]:
def clean_reviews(df: pl.DataFrame) -> pl.DataFrame:
    """Filter, enrich and de-duplicate a joined review/metadata frame.

    Steps, in order:

    1. Keep rows whose ``rating`` is one of 1-5 and whose ``text`` is
       non-empty after stripping.
    2. Add the ``brand`` column built by ``extract_brand()``.
    3. Drop duplicate ``(user_id, text, asin)`` rows, keeping the first
       occurrence and preserving the input row order.
    4. Add ``review_length`` (number of ``\\b\\w+\\b`` matches in ``text``)
       and ``year`` (year of ``timestamp`` after casting to
       ``Datetime("ms")``).

    Args:
        df: Frame containing at least ``rating``, ``text``, ``user_id``,
            ``asin`` and ``timestamp``, plus the ``store`` and ``details``
            columns read by ``extract_brand()``.

    Returns:
        A new frame with the filtered, de-duplicated rows and the added
        ``brand``, ``review_length`` and ``year`` columns.
    """
    has_valid_rating = pl.col("rating").is_in([1, 2, 3, 4, 5])
    has_text = pl.col("text").str.strip_chars().str.len_chars() > 0

    return (
        df.filter(has_valid_rating & has_text)
        .with_columns([extract_brand()])
        # maintain_order=True keeps the surviving rows in input order;
        # without it the output order is unspecified, so the Parquet file
        # written downstream could differ between otherwise identical runs.
        .unique(
            subset=["user_id", "text", "asin"],
            keep="first",
            maintain_order=True,
        )
        .with_columns(
            pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
            # NOTE(review): assumes ``timestamp`` holds epoch milliseconds -- confirm.
            pl.col("timestamp").cast(pl.Datetime("ms")).dt.year().alias("year"),
        )
    )

In [10]:
def clean_datasets() -> None:
    """Join, clean and persist the review data for every category.

    For each entry in ``VALID_CATEGORIES`` this reads the raw review and
    metadata Parquet files, left-joins the metadata onto the reviews by
    ``parent_asin``, passes the result through ``clean_reviews`` and writes
    it to ``data/intermediate/<category>.parquet``.
    """
    for category in VALID_CATEGORIES:
        review_source = f"data/raw/review/{category}.parquet"
        meta_source = f"data/raw/meta/{category}.parquet"
        destination = f"data/intermediate/{category}.parquet"

        print(f"\nLoading dataset for {category}...")
        reviews_raw: pl.DataFrame = pl.read_parquet(review_source)
        meta_raw: pl.DataFrame = pl.read_parquet(meta_source)

        print("Merging on review and metadata...")
        # Left join: every review row is kept even without matching metadata.
        reviews_joined: pl.DataFrame = reviews_raw.join(
            meta_raw, on="parent_asin", how="left"
        )

        print("Cleaning data...")
        reviews_clean: pl.DataFrame = clean_reviews(reviews_joined)

        print(f"Writing to {destination}...")
        reviews_clean.write_parquet(destination)

In [11]:
# Create the directories listed in PATHS before any step writes into them.
create_dir()

In [12]:
# Fetch each category and write the raw Parquet files under data/raw/;
# run create_dir() first so the target directories exist.
download_datasets()

Loading Gift_Cards...



meta_Gift_Cards.jsonl:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating full split:   0%|          | 0/1137 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Gift_Cards.jsonl:   0%|          | 0.00/50.2M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/153 [00:00<?, ?ba/s]

Loading Subscription_Boxes...



meta_Subscription_Boxes.jsonl:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Subscription_Boxes.jsonl:   0%|          | 0.00/8.95M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Loading Magazine_Subscriptions...



meta_Magazine_Subscriptions.jsonl:   0%|          | 0.00/4.10M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Magazine_Subscriptions.jsonl:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

In [13]:
# Read the raw Parquet files written by download_datasets(), then join,
# clean and write each category to data/intermediate/.
clean_datasets()


Loading dataset for Gift_Cards...
Merging on review and metadata...
Cleaning data...
Writing to data/intermediate/Gift_Cards.parquet...

Loading dataset for Subscription_Boxes...
Merging on review and metadata...
Cleaning data...
Writing to data/intermediate/Subscription_Boxes.parquet...

Loading dataset for Magazine_Subscriptions...
Merging on review and metadata...
Cleaning data...
Writing to data/intermediate/Magazine_Subscriptions.parquet...
