In [None]:
from pathlib import Path

import polars as pl
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk

from bigdata_a3_utils import download_all_amazon_reviews, load_compressed_dataset

In [None]:
# Both tables are loaded from the same Hugging Face repository with the same
# options; only the configuration name differs between the two calls.
AMAZON_REVIEWS_REPO = "McAuley-Lab/Amazon-Reviews-2023"
# NOTE(review): trust_remote_code=True presumably allows the repository's own
# loading script to run locally — confirm the source is trusted / pin a revision.
_SHARED_LOAD_KWARGS = {"split": "full", "trust_remote_code": True}

# Customer reviews for the "All_Beauty" category.
beauty_review: Dataset = load_dataset(
    AMAZON_REVIEWS_REPO, "raw_review_All_Beauty", **_SHARED_LOAD_KWARGS
)

# Product metadata for the same category.
beauty_meta: Dataset = load_dataset(
    AMAZON_REVIEWS_REPO, "raw_meta_All_Beauty", **_SHARED_LOAD_KWARGS
)

In [None]:
# Persist both raw datasets as Parquet so later cells can work from local files
# instead of re-downloading.
review_raw_path = "data/raw/review/amazon_review_beauty.parquet"
meta_raw_path = "data/raw/meta/amazon_meta_beauty.parquet"

# Create the target folders up front so the cell also runs on a fresh checkout
# where data/raw/... does not exist yet.
for raw_path in (review_raw_path, meta_raw_path):
    Path(raw_path).parent.mkdir(parents=True, exist_ok=True)

beauty_review.to_parquet(review_raw_path)
beauty_meta.to_parquet(meta_raw_path)

In [None]:
# Read the raw Parquet snapshots back in as polars DataFrames.
review_parquet = "data/raw/review/amazon_review_beauty.parquet"
meta_parquet = "data/raw/meta/amazon_meta_beauty.parquet"

df_review: pl.DataFrame = pl.read_parquet(review_parquet)
df_meta: pl.DataFrame = pl.read_parquet(meta_parquet)

In [20]:
# Attach product metadata to each review. A left join keeps every review row,
# including those whose parent_asin has no matching metadata row.
df: pl.DataFrame = df_review.join(
    other=df_meta,
    on="parent_asin",
    how="left",
)

In [21]:
def extract_brand() -> pl.Expr:
    """Build the expression that derives the ``brand`` column.

    Candidates are tried in priority order and the first non-null one wins:

    1. ``store``, with surrounding whitespace removed, when it is not blank;
    2. the value following a ``Brand`` key inside ``details``;
    3. the literal ``"Unknown"``.

    Returns:
        A polars expression aliased to ``"brand"``.
    """
    store_stripped: pl.Expr = pl.col("store").str.strip_chars()
    # `when` without `otherwise` yields null for blank stores, so coalesce
    # falls through to the next candidate. The stripped value is returned so
    # " Acme" and "Acme" do not become two different brands.
    store_clean: pl.Expr = pl.when(store_stripped != "").then(store_stripped)

    # NOTE(review): `details` is assumed to be a string that is either plain
    # text ("Brand: Acme Co, ...") or JSON-encoded ({"Brand": "Acme Co", ...})
    # — confirm against the metadata schema.
    # The pattern tolerates the quotes around a JSON key/value (the previous
    # pattern captured `":` on JSON input), requires a ':' or '=' separator so
    # keys such as "Brand Name" are not mistaken for "Brand", and keeps
    # multi-word brand names instead of cutting at the first space.
    brand_raw: pl.Expr = (
        pl.col("details")
        .str.extract(r'Brand"?\s*[:=]\s*"?([^",;}]+)', 1)
        .str.strip_chars()
    )
    # Treat a whitespace-only capture as missing so "Unknown" is used instead.
    details_brand: pl.Expr = pl.when(brand_raw != "").then(brand_raw)

    return pl.coalesce([
        store_clean,
        details_brand,
        pl.lit("Unknown"),
    ]).alias("brand")

In [22]:
# Clean the joined frame and derive the analysis columns in one pass.
df = (
    df
    # Drop invalid ratings (missing or not in [1,2,3,4,5]).
    .filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
    # Drop review texts that are empty or whitespace-only (null texts go too,
    # because the comparison evaluates to null).
    .filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
    # Derive the brand column.
    .with_columns([extract_brand()])
    .with_columns([
        # Number of word tokens in the review text.
        pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
        # NOTE(review): assumes `timestamp` is an integer count of milliseconds
        # since the Unix epoch — confirm against the review schema.
        (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year"),
    ])
    # Remove duplicate reviews. maintain_order=True keeps the surviving rows in
    # their original order, so the saved output is reproducible across runs.
    .unique(subset=["user_id", "text", "asin"], keep="first", maintain_order=True)
)

In [24]:
# Save the cleaned, feature-enriched frame for downstream notebooks.
intermediate_path = "data/intermediate/amazon_beauty.parquet"
# Create the output folder first so the write does not fail when
# data/intermediate does not exist yet.
Path(intermediate_path).parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(intermediate_path)