In [None]:
from pathlib import Path

import polars as pl
from amazon.constants import *
# Row cap passed to `head()` when `process_category` loads raw parquet data.
N_SAMPLE = 100_000

In [None]:
def extract_brand() -> pl.Expr:
    """Build the expression that derives the ``brand`` column.

    The first non-null candidate wins, in this order:

    1. ``store`` with surrounding whitespace removed, when it is not blank;
    2. the first token that follows ``Brand`` in ``details``;
    3. the literal ``"Unknown"``.

    Returns:
        A polars expression aliased to ``brand``.
    """
    # Strip once and reuse the stripped value, so that padded store names such
    # as " Nike " and "Nike" collapse to the same brand instead of staying
    # distinct. A null or blank store yields null here (no `otherwise`), which
    # lets `coalesce` fall through to the next candidate.
    store_stripped: pl.Expr = pl.col("store").str.strip_chars()
    store_clean: pl.Expr = pl.when(store_stripped != "").then(store_stripped)

    # NOTE(review): the capture group stops at the first whitespace, comma or
    # semicolon, so multi-word brands are cut to their first word. This also
    # presumes `details` is plain "Brand: X"-style text — confirm against the
    # actual column format.
    details_brand: pl.Expr = pl.col("details").str.extract(r"Brand[:\s]*([^\s,;]+)", 1)

    return pl.coalesce([store_clean, details_brand, pl.lit("Unknown")]).alias("brand")


def process_category(category: str) -> None:
    """Clean one category's reviews, attach product metadata, and write the result.

    Steps, all expressed lazily:

    1. Scan ``data/raw/{REVIEW}/{category}.parquet`` and keep the first
       ``N_SAMPLE`` review rows.
    2. Drop reviews whose ``rating`` is not one of 1-5 or whose ``text`` is
       blank after stripping.
    3. Add ``review_length`` (number of ``\\b\\w+\\b`` matches in ``text``) and
       ``year`` (taken from ``timestamp``).
    4. Left-join ``data/raw/{META}/{category}.parquet`` on ``parent_asin``.
    5. Add ``brand`` via ``extract_brand()``.
    6. Keep the first row per ``(user_id, text, asin)``.
    7. Sink to ``data/intermediate/{category}.parquet``.

    Args:
        category: File stem shared by the category's review and metadata
            parquet files; also used for the output file name.
    """
    reviews: pl.LazyFrame = pl.scan_parquet(f"data/raw/{REVIEW}/{category}.parquet")

    # The metadata is the lookup side of the join and must not be truncated:
    # capping it with head() independently of the reviews would leave every
    # sampled review whose product sits past the cap with null metadata (and
    # therefore brand "Unknown").
    meta: pl.LazyFrame = pl.scan_parquet(f"data/raw/{META}/{category}.parquet")

    reviews = (
        reviews.head(N_SAMPLE)
        .filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
        .filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
        .with_columns([
            pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
            # NOTE(review): the cast presumes `timestamp` holds epoch
            # milliseconds (or is already a datetime) — confirm.
            (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year"),
        ])
    )

    cleaned: pl.LazyFrame = (
        reviews.join(meta, on="parent_asin", how="left")
        .with_columns([extract_brand()])
        .unique(subset=["user_id", "text", "asin"], keep="first")
    )

    # Make sure the target directory exists so a fresh checkout can run the
    # notebook top to bottom.
    Path("data/intermediate").mkdir(parents=True, exist_ok=True)
    cleaned.sink_parquet(f"data/intermediate/{category}.parquet")

In [None]:
# Run the cleaning pipeline once per category; each call writes
# data/intermediate/{category}.parquet.
# NOTE(review): TEST_CATEGORIES presumably comes from the `amazon.constants`
# star import — confirm.
for category in TEST_CATEGORIES:
    print(f"Cleaning dataset for {category}...")
    process_category(category)

In [None]:
# Stack the per-category intermediate files into one lazy frame.
merged: pl.LazyFrame = pl.concat([
    pl.scan_parquet(f"data/intermediate/{category}.parquet")
    for category in TEST_CATEGORIES
])

# Each intermediate file is de-duplicated on its own; this second pass removes
# rows that repeat across categories.
merged = merged.unique(subset=["user_id", "text", "asin"], keep="first")

# Create the output directory first so the sink does not fail on a fresh
# checkout where data/processed/ does not exist yet.
Path("data/processed").mkdir(parents=True, exist_ok=True)
merged.sink_parquet("data/processed/amazon-2023.parquet")