In [None]:
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk
from bigdata_a3_utils import download_all_amazon_reviews, load_compressed_dataset
import polars as pl

In [None]:
beauty_review: Dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    split="full",
    trust_remote_code=True,
)

beauty_meta: Dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)

In [None]:
beauty_review.to_parquet("amazon_review_beauty.parquet")
beauty_meta.to_parquet("amazon_meta_beauty.parquet")

In [None]:
df_review: pl.DataFrame = pl.read_parquet("amazon_review_beauty.parquet")
df_meta: pl.DataFrame = pl.read_parquet("amazon_meta_beauty.parquet")

In [20]:
df: pl.DataFrame = df_review.join(df_meta, on="parent_asin", how="left")

In [21]:
def extract_brand() -> pl.Expr:
    store_clean: pl.Then = pl.when(pl.col("store").str.strip_chars() != "").then(pl.col("store"))
    details_brand: pl.Expr = pl.col("details").str.extract(r"Brand[:\s]*([^\s,;]+)", 1)
    
    return pl.coalesce([
        store_clean,
        details_brand,
        pl.lit("Unknown")
    ]).alias("brand")

In [22]:
# Drop invalid ratings (missing or not in [1,2,3,4,5])
df: pl.DataFrame = df.filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))
# Drop empty review texts
df = df.filter(pl.col("text").str.strip_chars().str.len_chars() > 0)

df = df.with_columns([extract_brand()])

df = df.with_columns([
    pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
    (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year")
])


df = df.unique(subset=["user_id", "text", "asin"], keep="first")

In [23]:
df

rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title_right,average_rating,rating_number,features,description,price,images_right,videos,store,categories,details,bought_together,subtitle,author,brand,review_length,year
f64,str,str,list[struct[4]],str,str,str,i64,i64,bool,str,str,f64,i64,list[str],list[str],str,struct[4],struct[3],str,list[str],str,str,str,str,str,u32,i32
5.0,"""Buy them""","""The are the best hair ties. Ne…",[],"""B07V5CW5CG""","""B07V5CW5CG""","""AGC73F2KHTUOZWUDOZYGTA5G6Z2Q""",1580159609883,0,true,"""All Beauty""","""WillingTee 30pcs Metal Color S…",4.2,34,[],[],"""None""","{[""https://m.media-amazon.com/images/I/71tV2goA1QL._SL1200_.jpg"", ""https://m.media-amazon.com/images/I/71oK2Y386iL._SL1200_.jpg""],[""https://m.media-amazon.com/images/I/51NFugypGcL.jpg"", ""https://m.media-amazon.com/images/I/51+nJXXbQ4L.jpg""],[""https://m.media-amazon.com/images/I/51NFugypGcL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/51+nJXXbQ4L._SS40_.jpg""],[""MAIN"", ""PT01""]}","{[],[],[]}","""WillingTee""",[],"""{""Material"": ""Plastic"", ""Item …",,,,"""WillingTee""",17,2020
1.0,"""It Rusted some Implements""","""I decide as I wanted to try a …",[],"""B06XGWJJFW""","""B06XGWJJFW""","""AFMR6XY4DH2BONH375ZTFRKK3YVQ""",1600970983764,2,true,"""All Beauty""","""MOD Clean""",4.4,193,[],[],"""17.13""","{[null],[""https://m.media-amazon.com/images/I/31dH31wdMjL.jpg""],[""https://m.media-amazon.com/images/I/31dH31wdMjL._SS40_.jpg""],[""MAIN""]}","{[],[],[]}","""Artist Unknown""",[],"""{""Brand"": ""Artist Unknown"", ""I…",,,,"""Artist Unknown""",45,2020
5.0,"""OK2BUY""","""Beats Walmart price, great pro…",[],"""B00JISAUR8""","""B00JISAUR8""","""AGZTW2YLNOPFKCT4HPCYNLC74JVA""",1495754871000,0,true,"""All Beauty""","""Garnier Fructis Sleek & Shine …",4.1,64,"[""Leave-in conditioning cream for up to 3 day sleek"", ""Intensely smooth"", ""Proven strength system""]","[""Garnier Fructis Sleek & Shine Leave-In Conditioning Cream, 10.2 oz, 2 pk""]","""8.98""","{[""https://m.media-amazon.com/images/I/71sxKqdPecL._SL1500_.jpg"", ""https://m.media-amazon.com/images/I/71WpuymxM8L._SL1500_.jpg"", … ""https://m.media-amazon.com/images/I/8133DyePyaL._SL1500_.jpg""],[""https://m.media-amazon.com/images/I/41gUHu-MCzL.jpg"", ""https://m.media-amazon.com/images/I/41NlRHsEsPL.jpg"", … ""https://m.media-amazon.com/images/I/51y971sKo+L.jpg""],[""https://m.media-amazon.com/images/I/41gUHu-MCzL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/41NlRHsEsPL._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/51y971sKo+L._SS40_.jpg""],[""MAIN"", ""PT01"", … ""DTLS""]}","{[""Here's The Secret To Having Beautiful Hair! MUST WATCH!"", ""Style Code Live: Redbook Garnier Integration | Pro Tips for Perfect Hair | Look 2""],[""https://www.amazon.com/vdp/0024af0ae1234a3fac9ffd7460ead76b?ref=dp_vse_rvc_0"", ""https://www.amazon.com/vdp/afdc73b298dc4ca0acb1796a6f0bd11d?ref=dp_vse_rvc_1""],[""/shop/influencer-98d451b5"", """"]}","""Garnier""",[],"""{""Brand"": ""Garnier"", ""Item For…",,,,"""Garnier""",5,2017
5.0,"""I’m in love & Thank You""","""WOW! I NEVER write reviews, bu…","[{""IMAGE"",""https://images-na.ssl-images-amazon.com/images/I/71UDVzQRTXL.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71UDVzQRTXL._SL800_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71UDVzQRTXL._SL256_.jpg""}, {""IMAGE"",""https://images-na.ssl-images-amazon.com/images/I/71hVCiPD0FL.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71hVCiPD0FL._SL800_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71hVCiPD0FL._SL256_.jpg""}, {""IMAGE"",""https://images-na.ssl-images-amazon.com/images/I/71unZzeJlGL.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71unZzeJlGL._SL800_.jpg"",""https://images-na.ssl-images-amazon.com/images/I/71unZzeJlGL._SL256_.jpg""}]","""B083NVC5RB""","""B083NVC5RB""","""AHREBJPMMF3WUWOUL6OEBCAPTXYA""",1587313701297,10,true,"""All Beauty""","""22 inch 7 packs Passion Twist …",4.3,71,[],[],"""None""","{[""https://m.media-amazon.com/images/I/71lLNP5ZIoL._SL1001_.jpg"", ""https://m.media-amazon.com/images/I/71TBJvLEcTL._SL1001_.jpg"", … ""https://m.media-amazon.com/images/I/71VekwopC4L._SL1001_.jpg""],[""https://m.media-amazon.com/images/I/51V1HApt8wL.jpg"", ""https://m.media-amazon.com/images/I/51v4fn4--CL.jpg"", … ""https://m.media-amazon.com/images/I/51Y3r+rz3PL.jpg""],[""https://m.media-amazon.com/images/I/51V1HApt8wL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/51v4fn4--CL._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/51Y3r+rz3PL._SS40_.jpg""],[""MAIN"", ""PT03"", … ""PT06""]}","{[],[],[]}","""ronsaen""",[],"""{""Brand"": ""ronsaen"", ""Extensio…",,,,"""ronsaen""",103,2020
5.0,"""Arrived on time = Happy Grandp…","""Christmas gift that was exactl…",[],"""B00ZJLQBEY""","""B00ZJLQBEY""","""AEWFSXI52KH2ICSIWQSM3ENL7XOA""",1454113917000,0,true,"""All Beauty""","""Maniology Super Cute Turquoise…",4.2,253,[],[],"""None""","{[""https://m.media-amazon.com/images/I/61nal243LWL._SL1500_.jpg"", ""https://m.media-amazon.com/images/I/61BGSm1pVAL._SL1500_.jpg"", … null],[""https://m.media-amazon.com/images/I/41x1RHWi7kL.jpg"", ""https://m.media-amazon.com/images/I/41gN5IEO5+L.jpg"", … ""https://m.media-amazon.com/images/I/41T8RJbRUAL.jpg""],[""https://m.media-amazon.com/images/I/41x1RHWi7kL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/41gN5IEO5+L._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/41T8RJbRUAL._SS40_.jpg""],[""MAIN"", ""PT01"", … ""PT05""]}","{[""Unboxing and demo of brush and dotter set""],[""https://www.amazon.com/vdp/17bc776b7ba84e23a35575bd53630ee1?ref=dp_vse_rvc_0""],[""AFC5J2XCVGZH735ARYY6UH2RX3TA""]}","""b.m.c""",[],"""{""Is Discontinued By Manufactu…",,,,"""b.m.c""",14,2016
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3.0,"""Ok""","""A little painful if you wear t…",[],"""B00KV32GNA""","""B00KV32GNA""","""AGPO67SPBXUAEXNVFXMH5HWB5V4A""",1446741727000,1,true,"""All Beauty""","""SoooKu 10 Pcs Glow Ball Tongue…",4.2,153,"[""Suitable for tongue or nipple"", ""Glow in the dark"", … ""Colors: random colors""]",[],"""9.74""","{[""https://m.media-amazon.com/images/I/61Nd9OI8EHL._SL1001_.jpg"", ""https://m.media-amazon.com/images/I/6164CJL3zJL._SL1001_.jpg"", … ""https://m.media-amazon.com/images/I/61DzC-toYQL._SL1001_.jpg""],[""https://m.media-amazon.com/images/I/41kkf99tzwL.jpg"", ""https://m.media-amazon.com/images/I/41cYRcq-BkL.jpg"", … ""https://m.media-amazon.com/images/I/41VDZ4rOySL.jpg""],[""https://m.media-amazon.com/images/I/41kkf99tzwL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/41cYRcq-BkL._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/41VDZ4rOySL._SS40_.jpg""],[""MAIN"", ""PT01"", … ""PT06""]}","{[],[],[]}","""SoooKu""",[],"""{""UPC"": ""769251649481"", ""Manuf…",,,,"""SoooKu""",13,2015
3.0,"""It was very nice""","""Ended up to be to long for me""",[],"""B08B4P69LK""","""B08B4P69LK""","""AHBYPIA5NNNZ6JM22DUSS33T4Z5A""",1613249401446,0,true,"""All Beauty""","""NAYOO Long Drawstring Ponytail…",3.8,36,[],[],"""None""","{[""https://m.media-amazon.com/images/I/71pblajOmlL._SL1000_.jpg"", ""https://m.media-amazon.com/images/I/713McGN6fML._SL1000_.jpg"", … ""https://m.media-amazon.com/images/I/61H52J3oFgL._SL1000_.jpg""],[""https://m.media-amazon.com/images/I/51d3ddVghWL.jpg"", ""https://m.media-amazon.com/images/I/51Mg6Jl48uL.jpg"", … ""https://m.media-amazon.com/images/I/51JiumIKClL.jpg""],[""https://m.media-amazon.com/images/I/51d3ddVghWL._SS40_.jpg"", ""https://m.media-amazon.com/images/I/51Mg6Jl48uL._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/51JiumIKClL._SS40_.jpg""],[""MAIN"", ""PT01"", … ""PT06""]}","{[],[],[]}","""NAYOO""",[],"""{""Material"": ""Synthetic"", ""Hai…",,,,"""NAYOO""",8,2021
5.0,"""Just as beautiful as I expecte…","""Beautiful! It was a great an a…",[],"""B01I0SAM88""","""B01I0SAM88""","""AFRVXPXVV6LLQVTJQW6NQ6GJKTXA""",1483815834000,0,false,"""All Beauty""","""Xuanli Wood Combs Carving rose…",4.7,29,[],[],"""None""","{[""https://m.media-amazon.com/images/I/71RXJquNB5L._SL1000_.jpg"", ""https://m.media-amazon.com/images/I/71-76DOx12L._SL1000_.jpg"", … ""https://m.media-amazon.com/images/I/71YsmryxnCL._SL1000_.jpg""],[""https://m.media-amazon.com/images/I/51agVHvaE3L.jpg"", ""https://m.media-amazon.com/images/I/61Rdmowj+mL.jpg"", … ""https://m.media-amazon.com/images/I/61EYepCbFML.jpg""],[""https://m.media-amazon.com/images/I/51agVHvaE3L._SS40_.jpg"", ""https://m.media-amazon.com/images/I/61Rdmowj+mL._SS40_.jpg"", … ""https://m.media-amazon.com/images/I/61EYepCbFML._SS40_.jpg""],[""MAIN"", ""PT01"", … ""PT06""]}","{[],[],[]}","""Xuanli""",[],"""{""Material"": ""Wood"", ""Brand"": …",,,,"""Xuanli""",8,2017
4.0,"""They do the job, I am not sure…","""They do the job, I am not sure…",[],"""B000X20Y4C""","""B000X20Y4C""","""AGRPEEVB6E3NGSF6BZIJFWQPZIDQ""",1492111888000,0,true,"""All Beauty""","""Soft 'N Style Butterfly Clamps…",4.5,2031,"[""Perfect for sectioning long or short hair"", ""Package contains assorted colors"", … ""Size: 3"" wide clamps""]","[""Perfect for sectioning long or short hair. 12 clips per package. 3"" wide. Package contains assorted colors.""]","""3.8""","{[""https://m.media-amazon.com/images/I/611tApzJ+-L._SL1000_.jpg""],[""https://m.media-amazon.com/images/I/41kVcWDNxZL.jpg""],[""https://m.media-amazon.com/images/I/41kVcWDNxZL._SS40_.jpg""],[""MAIN""]}","{[],[],[]}","""Soft 'N Style""",[],"""{""Brand"": ""Soft 'N Style"", ""Ha…",,,,"""Soft 'N Style""",30,2017
