In [None]:
import polars as pl
from amazon.constants import *
import gc

# Row cap used with `.head()` when sampling each category's input in
# `process_category`; raise it for a fuller run, lower it for quick iteration.
N_SAMPLE = 500_000

In [None]:
def extract_brand() -> pl.Expr:
    """Build the expression that derives the ``brand`` column.

    Candidates are tried in order and the first non-null one wins:

    1. the ``store`` column;
    2. the text that follows a ``Brand`` label in the ``details`` column
       (``details`` is read through the ``.str`` namespace, so it must be a
       string column);
    3. the literal ``"Unknown"``.

    Both candidates are whitespace-trimmed, and a candidate that is null or
    blank after trimming counts as missing. Without the trim, ``" Nike "`` and
    ``"Nike"`` would be different brands, and a whitespace-only regex capture
    would be kept instead of falling through to ``"Unknown"``.

    Returns:
        A polars expression aliased to ``brand``.
    """

    def _non_blank(candidate: pl.Expr) -> pl.Expr:
        # Trimmed value, or null when nothing is left. A null input gives a
        # null predicate, which `when` treats as not matched -> null as well.
        trimmed = candidate.str.strip_chars()
        return pl.when(trimmed != "").then(trimmed)

    store_brand: pl.Expr = _non_blank(pl.col("store"))
    # The capture class contains `\s`, so the raw match may end in whitespace
    # or consist only of whitespace; `_non_blank` normalises both cases.
    details_brand: pl.Expr = _non_blank(
        pl.col("details").str.extract(r"Brand[:\s\-]*([A-Za-z0-9&\s]+)", 1)
    )
    return pl.coalesce([store_brand, details_brand, pl.lit("Unknown")]).alias("brand")


def process_category(category: str) -> None:
    """Clean one category's reviews and write them to the intermediate folder.

    The whole pipeline is built lazily and only executed by the final
    streaming sink:

    a) take the first ``N_SAMPLE`` reviews and inner-join them with the
       category's product metadata on ``parent_asin``;
    b) keep rows whose ``rating`` is one of 1-5 and whose ``text`` is not
       blank, then add the ``brand`` column from ``extract_brand``;
    c) drop duplicate ``(user_id, text, asin)`` rows;
    d) add ``review_length`` (word count of ``text``) and ``year`` (taken from
       ``timestamp``).

    Args:
        category: File stem shared by the review and metadata parquet files
            under ``RAW``.

    Returns:
        None. The result is written to ``{INTERMEDIATE}/{category}.parquet``.
    """
    reviews: pl.LazyFrame = pl.scan_parquet(f"{RAW}/{REVIEW}/{category}.parquet").head(N_SAMPLE)
    # The metadata is the lookup side of the join and is NOT truncated.
    # Capping it at its first N_SAMPLE rows would make the inner join silently
    # drop every sampled review whose product sits further down the file.
    products: pl.LazyFrame = pl.scan_parquet(f"{RAW}/{META}/{category}.parquet")

    cleaned: pl.LazyFrame = (
        # a) Merge on parent asin
        reviews.join(products, on="parent_asin", how="inner")
        # b) Handle invalid / missing values. Rows with a null `rating` or
        #    `text` are dropped too, because a null predicate does not pass
        #    `filter`.
        .filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
        .filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
        .with_columns([extract_brand()])
        # c) Remove duplicates
        .unique(subset=["user_id", "text", "asin"], keep="first")
        # d) Derived columns
        .with_columns([
            pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
            # assumes `timestamp` holds epoch milliseconds — TODO confirm
            (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year"),
        ])
    )
    cleaned.sink_parquet(f"{INTERMEDIATE}/{category}.parquet", engine="streaming")

In [None]:
# Clean the first 15 entries of ALL_CATEGORIES one at a time; each
# `process_category` call returns nothing to collect here.
for category in ALL_CATEGORIES[:15]:
    print(f"Cleaning dataset for {category}...")
    process_category(category)

# Runs once, after the loop has finished (not once per category).
gc.collect()

In [None]:
# Earlier variant, kept for reference: concatenate one scan per category
# explicitly instead of globbing the intermediate folder.
# merged: pl.LazyFrame = pl.concat([
#     pl.scan_parquet(f"{INTERMEDIATE}/{category}.parquet")
#     for category in ALL_CATEGORIES[:15]
# ])

# Lazily scan every parquet file currently present in the intermediate folder.
# NOTE(review): the glob also matches files left over from earlier runs or from
# categories outside ALL_CATEGORIES[:15] — confirm that is intended.
merged: pl.LazyFrame = pl.scan_parquet(f"{INTERMEDIATE}/*.parquet")
# Disabled: de-duplication across categories on (user_id, text, asin).
# merged = merged.unique(subset=["user_id", "text", "asin"], keep="first")
# Write the combined dataset to one parquet file using the streaming engine.
merged.sink_parquet("data/processed/amazon-2023.parquet", engine="streaming")