In [None]:
import polars as pl
from amazon.constants import *
import gc
import os

# N_SAMPLE = 500_000

In [None]:
# Column names removed via `lf.drop(storage)` in the commented-out earlier
# draft of `process_category` further down in this notebook.
storage: list[str] = [
    "title_right", "images_right", "description", "features", "videos",
    "subtitle", "bought_together", "author", "images", "title",
    "average_rating", "rating_number", "categories", "store",
]

# Columns selected from each category's review parquet.
review_cols: list[str] = [
    "rating", "text", "asin", "parent_asin",
    "user_id", "helpful_vote", "verified_purchase", "timestamp",
]

# Columns selected from each category's metadata parquet.
meta_cols: list[str] = ["main_category", "price", "store", "details", "parent_asin"]

In [None]:
def extract_brand() -> pl.Expr:
    """Build the expression that derives the ``brand`` column.

    Candidates are tried in order and the first non-null one wins:

    1. ``store`` with surrounding whitespace removed, when that is non-empty.
    2. The text following ``Brand`` in ``details`` (letters, digits, ``&`` and
       whitespace), with surrounding whitespace removed, when that is non-empty.
    3. The literal ``"Unknown"``.

    Both candidates are stripped so that padded values such as ``" Acme "`` and
    ``"Acme"`` do not end up as different brands, and so that a whitespace-only
    regex capture (the capture class includes ``\\s``) falls through to
    ``"Unknown"`` instead of becoming a blank brand.

    Returns:
        A polars expression aliased to ``"brand"``. It reads the ``store`` and
        ``details`` columns, both accessed through the ``.str`` namespace.
    """
    store_stripped: pl.Expr = pl.col("store").str.strip_chars()
    # A null or blank store makes the condition null/false, so the branch yields null.
    store_clean: pl.Expr = pl.when(store_stripped != "").then(store_stripped)

    details_stripped: pl.Expr = (
        pl.col("details")
        .str.extract(r"Brand[:\s\-]*([A-Za-z0-9&\s]+)", 1)
        .str.strip_chars()
    )
    # Same null-on-blank treatment for the regex fallback.
    details_brand: pl.Expr = pl.when(details_stripped != "").then(details_stripped)

    return pl.coalesce([store_clean, details_brand, pl.lit("Unknown")]).alias("brand")


def process_category(category: str) -> None:
    """Clean one category's reviews, attach product metadata, and write parquet.

    Pipeline:

    1. Skip entirely if ``{INTERMEDIATE}/{category}.parquet`` already exists.
    2. Reviews: keep ``review_cols``, keep ratings in 1..5, drop rows whose
       ``text`` is empty after stripping, add ``review_length`` (regex word
       count) and ``year`` (from ``timestamp`` cast to ``Datetime("ms")``),
       then drop ``timestamp``.
    3. Metadata: keep ``meta_cols``, add ``brand`` via ``extract_brand()``,
       then drop ``details`` and ``store``.
    4. Inner-join reviews to metadata on ``parent_asin``.
    5. De-duplicate on ``(user_id, text, asin)``, keeping the first row of
       each group in input order.
    6. Stream the result to a temporary file and atomically rename it to the
       final path.

    The temporary-file step matters because of the skip in step 1: writing
    directly to the final path would let an interrupted run leave a truncated
    file that every later run treats as finished.

    Args:
        category: Category name; used as the parquet file stem for the raw
            review file, the raw metadata file, and the output file.

    Returns:
        None. The result is written to disk.
    """
    path: str = f"{INTERMEDIATE}/{category}.parquet"
    if os.path.exists(path):
        print(f"File already exists: {path}. Skipping.")
        return

    lf_review: pl.LazyFrame = (
        pl.scan_parquet(f"{RAW}/{REVIEW}/{category}.parquet")
        .select(review_cols)
        .filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
        .filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
        .with_columns([
            pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
            # NOTE(review): presumably `timestamp` holds epoch milliseconds — confirm.
            (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year"),
        ])
        .drop("timestamp")
    )

    lf_meta: pl.LazyFrame = (
        pl.scan_parquet(f"{RAW}/{META}/{category}.parquet")
        .select(meta_cols)
        .with_columns([extract_brand()])
        # `brand` has been derived, so its two source columns are dropped here.
        .drop(["details", "store"])
    )

    lf: pl.LazyFrame = (
        lf_review.join(lf_meta, on="parent_asin", how="inner")
        # group_by + first() keeps one row per key; the key columns come first
        # in the output, followed by the remaining columns.
        .group_by(["user_id", "text", "asin"], maintain_order=True)
        .agg([pl.all().first()])
    )

    # Make sure the output directory exists before sinking.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    # The ".tmp" suffix keeps the partial file out of any "*.parquet" glob.
    tmp_path: str = f"{path}.tmp"
    try:
        lf.sink_parquet(tmp_path, engine="streaming")
    except BaseException:
        # Remove the partial output (also on KeyboardInterrupt), then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, path)

In [None]:
# def extract_brand() -> pl.Expr:
#     store_clean: pl.Then = pl.when(pl.col("store").str.strip_chars() != "").then(pl.col("store"))
#     details_brand: pl.Expr = pl.col("details").str.extract(r"Brand[:\s\-]*([A-Za-z0-9&\s]+)", 1)
#     return pl.coalesce([store_clean, details_brand, pl.lit("Unknown")]).alias("brand")


# def process_category(category: str) -> None:
#     path: str = f"{INTERMEDIATE}/{category}.parquet"
#     if os.path.exists(path):
#         print(f"File already exists: {path}. Skipping.")
#         return
#     lf_review: pl.LazyFrame = pl.scan_parquet(f"{RAW}/{REVIEW}/{category}.parquet")
#     lf_meta: pl.LazyFrame = pl.scan_parquet(f"{RAW}/{META}/{category}.parquet")
#     # lf_review = lf_review.head(N_SAMPLE)
#     # lf_meta = lf_meta.head(N_SAMPLE)

#     # a) Merge on parent asin
#     lf: pl.LazyFrame = lf_review.join(lf_meta, on="parent_asin", how="inner")
#     # lf = lf.drop(["title_right", "images_right"])
#     lf = lf.drop(storage)

#     # b) Handle Invalid / Missing Values
#     lf: pl.LazyFrame = lf.filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
#     lf = lf.filter(pl.col("text").str.strip_chars().str.len_chars() > 0)
#     lf = lf.with_columns([extract_brand()])
#     lf = lf.drop("details") # delete ltr
    
#     # c) Remove Duplicates
#     lf = lf.group_by(
#         ["user_id", "text", "asin"],
#         maintain_order=True
#     ).agg([
#         pl.all().first()
#     ])
    
   
#     # d) Derived Columns:
#     lf = lf.with_columns([
#         pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
#         # (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year")
#     ])

#     lf.sink_parquet(path, engine="streaming", row_group_size=100)

In [None]:
# Clean every category in turn. `process_category` writes one parquet file per
# category and returns early for categories whose output file already exists.
for category in ALL_CATEGORIES:
    print(f"Cleaning dataset for {category}...")
    process_category(category)
    # Run a garbage-collection pass between categories.
    gc.collect()

In [None]:
# (23_632_810, 1)
# (23_632_810, 1)
# (693_547, 1)
# pl.scan_parquet("data/intermediate/All_Beauty.parquet").select("rating").collect().shape

In [None]:
# 502_984_947  (presumably the merged row count from an earlier run)
# Lazily scan every per-category file in INTERMEDIATE as one frame.
merged: pl.LazyFrame = pl.scan_parquet(f"{INTERMEDIATE}/*.parquet")
# merged = merged.unique(subset=["user_id", "text", "asin"], keep="first")
# Create the output directory first so the sink does not fail on a fresh
# checkout where it does not exist yet.
os.makedirs("data/processed", exist_ok=True)
merged.sink_parquet("data/processed/amazon-2023.parquet", engine="streaming")

In [None]:
# Row count of the merged frame, displayed as (rows, 1) like the previous
# `.shape` output. Counting with `pl.len()` avoids collecting a whole column
# into memory just to read its length, and avoids resolving the LazyFrame
# schema through `merged.columns`.
(merged.select(pl.len()).collect(engine="streaming").item(), 1)