In [149]:
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk
from bigdata_a3_utils import download_all_amazon_reviews, load_compressed_dataset
from pathlib import Path
import polars as pl
import pandas as pd
from itertools import islice
from typing import Generator

In [150]:
# auto_review = load_dataset(
#     "McAuley-Lab/Amazon-Reviews-2023",
#     "raw_review_Automotive",
#     split="full",
#     trust_remote_code=True,
# )

# auto_meta = load_dataset(
#     "McAuley-Lab/Amazon-Reviews-2023",
#     "raw_meta_Automotive",
#     split="full",
#     trust_remote_code=True,
# )

In [151]:
# auto_meta.to_parquet("amazon_meta_auto.parquet")
# auto_review.to_parquet("amazon_review_auto.parquet")

In [152]:
# Read the two locally cached parquet files: one with the reviews and one
# with the product metadata.
df_review: pl.DataFrame = pl.read_parquet(source="amazon_review_auto.parquet")
df_meta: pl.DataFrame = pl.read_parquet(source="amazon_meta_auto.parquet")

In [153]:
# Left join: every review row is kept, and metadata columns are attached
# wherever `parent_asin` matches a row of the metadata table.
df: pl.DataFrame = df_review.join(
    other=df_meta,
    on="parent_asin",
    how="left",
)

In [154]:
# Clean the joined review/metadata frame and derive the analysis columns
# (`brand`, `review_length`, `year`), then drop duplicate reviews.

# Drop invalid ratings (missing or not in [1,2,3,4,5])
df = df.filter(pl.col("rating").is_in([1, 2, 3, 4, 5]))

# Drop empty review texts (whitespace-only texts count as empty)
df = df.filter(pl.col("text").str.strip_chars().str.len_chars() > 0)

# Fill brand: prefer a non-empty `store`, otherwise fall back to `details`,
# otherwise "Unknown".
# NOTE(review): in the rendered output `details` is a JSON-style string
# (e.g. {"Package Dimensions": "..."}), so the brand is looked up as a
# quoted "Brand" key with a quoted value. The previous plain-text pattern
# (`Brand[:\s]*([^\s,;]+)`) captured the `":` separator on such input and
# truncated multi-word brands at the first space. Confirm that "Brand" is
# the key actually used in this dataset's `details`.
df = df.with_columns([
    pl.when(pl.col("store").is_not_null() & (pl.col("store") != ""))
      .then(pl.col("store"))
      .when(pl.col("details").is_not_null())
      .then(
          pl.col("details")
          .str.extract(r'"Brand"\s*:\s*"([^"]+)"', 1)  # value of the "Brand" key
          .fill_null("Unknown")  # no usable "Brand" entry in details
      )
      .otherwise(pl.lit("Unknown"))
      .alias("brand")
])

df = df.with_columns([
    # Number of word tokens (runs of \w characters) in the review text.
    pl.col("text").str.count_matches(r"\b\w+\b").alias("review_length"),
    # `timestamp` is cast as epoch milliseconds; keep only the calendar year.
    (pl.col("timestamp").cast(pl.Datetime("ms")).dt.year()).alias("year")
])

# De-duplicate on (user_id, text, asin). `keep="first"` together with
# `maintain_order=True` makes the surviving row and the row order
# deterministic across runs; the polars defaults keep an arbitrary duplicate
# and do not preserve order.
df = df.unique(subset=["user_id", "text", "asin"], keep="first", maintain_order=True)
df

rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title_right,average_rating,rating_number,features,description,price,images_right,videos,store,categories,details,bought_together,subtitle,author,brand,review_length,year
f64,str,str,list[struct[4]],str,str,str,i64,i64,bool,str,str,f64,i64,list[str],list[str],str,struct[4],struct[3],str,list[str],str,str,str,str,str,u32,i32
5.0,"""EASY TO LOAD""","""NICE PERKS FOR THE UPLOAD""",[],"""B00IX1I3G6""","""B00IX1I3G6""","""AHVC24CVCY5RW6GDXZ5TDDT4KS4A""",1547660847272,0,true,,"""Amazon Reload""",4.7,143309,"[""Add funds to your Amazon Gift Card balance, then use your balance to manage how much you spend while shopping."", ""Save up for a purchase by adding funds to your Gift Card balance."", … ""Amazon Reload purchases are not refundable or redeemable for cash, except as required by law.""]",[],"""None""","{[null, null],[""https://m.media-amazon.com/images/I/41rPcyZJ3nL._AC_.jpg"", ""https://m.media-amazon.com/images/I/41sQl2wriGL._AC_.jpg""],[""https://m.media-amazon.com/images/I/41rPcyZJ3nL._AC_SR38,50_.jpg"", ""https://m.media-amazon.com/images/I/41sQl2wriGL._AC_SR38,50_.jpg""],[""MAIN"", ""PT01""]}","{[""Very easy to do!!""],[""https://www.amazon.com/vdp/0a8f8f3b9f5b409896b362a927a00de1?ref=dp_vse_rvc_0""],[""AGVDC7ODZHCRVYN5D3MC42IBPVJA""]}","""Amazon""","[""Gift Cards"", ""Gift Card Recipients"", ""For Him""]","""{""Item model number"": ""Variabl…",,,,"""Amazon""",5,2019
5.0,"""I Added money to the Amazon g…","""Added$$""",[],"""B00IX1I3G6""","""B00IX1I3G6""","""AG2I6ZK7DE3AA42HOILXPBHQMEDQ""",1548630626249,0,true,,"""Amazon Reload""",4.7,143309,"[""Add funds to your Amazon Gift Card balance, then use your balance to manage how much you spend while shopping."", ""Save up for a purchase by adding funds to your Gift Card balance."", … ""Amazon Reload purchases are not refundable or redeemable for cash, except as required by law.""]",[],"""None""","{[null, null],[""https://m.media-amazon.com/images/I/41rPcyZJ3nL._AC_.jpg"", ""https://m.media-amazon.com/images/I/41sQl2wriGL._AC_.jpg""],[""https://m.media-amazon.com/images/I/41rPcyZJ3nL._AC_SR38,50_.jpg"", ""https://m.media-amazon.com/images/I/41sQl2wriGL._AC_SR38,50_.jpg""],[""MAIN"", ""PT01""]}","{[""Very easy to do!!""],[""https://www.amazon.com/vdp/0a8f8f3b9f5b409896b362a927a00de1?ref=dp_vse_rvc_0""],[""AGVDC7ODZHCRVYN5D3MC42IBPVJA""]}","""Amazon""","[""Gift Cards"", ""Gift Card Recipients"", ""For Him""]","""{""Item model number"": ""Variabl…",,,,"""Amazon""",1,2019
5.0,"""Great""","""christmas gift""",[],"""B01K8RJDEI""","""B00ADR2LV6""","""AFJNPZQK3NYIJ232JJ52U6NXWS7Q""",1613665005711,0,true,"""Gift Cards""","""Amazon.com Gift Card in a Holi…",4.9,185606,"[""Gift Card is affixed inside a box"", ""Gift amount may not be printed on Gift Cards"", … ""Customized gift message, if chosen at check-out, only appears on packing slip and not on the actual gift card or carrier""]",[],"""25.0""","{[""https://m.media-amazon.com/images/I/81Axm9gA9bL._SL1500_.jpg"", ""https://m.media-amazon.com/images/I/81bjGbl0U3L._SL1500_.jpg"", … null],[""https://m.media-amazon.com/images/I/41S-n8ICtKL.jpg"", ""https://m.media-amazon.com/images/I/41KGkEragiL.jpg"", … ""https://m.media-amazon.com/images/I/31oYO5JWaEL.jpg""],[""https://m.media-amazon.com/images/I/41S-n8ICtKL._SX38_SY50_CR,0,0,38,50_.jpg"", ""https://m.media-amazon.com/images/I/41KGkEragiL._SX38_SY50_CR,0,0,38,50_.jpg"", … ""https://m.media-amazon.com/images/I/31oYO5JWaEL._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN"", ""PT01"", … ""PT12""]}","{[""Amazon Gift Box Gift Cards Perfect for any occasion "", ""Holiday Gift Card That Looks Good"", … ""Cool Christmas Gift Card Box""],[""https://www.amazon.com/vdp/04743ba4c0af4f0e8b1d07c6399d65be?ref=dp_vse_rvc_0"", ""https://www.amazon.com/vdp/0547a6e81075493aa539f174f7f90b99?ref=dp_vse_rvc_1"", … ""https://www.amazon.com/vdp/084b1b5864a9416692da36ebe0c45270?ref=dp_vse_rvc_7""],[""/shop/purplepowerwashllc"", ""/shop/becomingthefords"", … ""/shop/becomingthefords""]}","""Amazon""","[""Gift Cards"", ""Occasions"", ""Chanukah""]","""{""Package Dimensions"": ""8.19 x…",,,,"""Amazon""",2,2021
5.0,"""Everyone loves getting these a…","""This made a cute gift to give.…",[],"""B01E4QQAWW""","""B018F4M89S""","""AH4JDSH7VHPWQVTBIVA64SVH6WJQ""",1549644796137,0,true,"""Gift Cards""","""Amazon.com Gift Card in a Reve…",4.9,80974,"[""Gift Card is affixed inside a reveal product"", ""Gift Card has no fees and no expiration date"", … ""Customized gift message, if chosen at check-out, only appears on packing slip and not on the actual gift card or carrier""]","[""Amazon.com Gift Cards are the perfect way to give them exactly what they're hoping for - even if you don't know what it is. Amazon.com Gift Cards are redeemable for millions of items across Amazon.com. Item delivered is a single physical Amazon.com Gift Card nested inside or with a free gift accessory.""]","""20.0""","{[""https://m.media-amazon.com/images/I/71tuXbbE0TL._SL1500_.jpg"", ""https://m.media-amazon.com/images/I/71Nqlm+ZyqL._SL1236_.jpg"", … null],[""https://m.media-amazon.com/images/I/515H0HfsHxL.jpg"", ""https://m.media-amazon.com/images/I/51PlQZnaYZL.jpg"", … ""https://m.media-amazon.com/images/I/31P20WACP9L.jpg""],[""https://m.media-amazon.com/images/I/515H0HfsHxL._SX38_SY50_CR,0,0,38,50_.jpg"", ""https://m.media-amazon.com/images/I/51PlQZnaYZL._SX38_SY50_CR,0,0,38,50_.jpg"", … ""https://m.media-amazon.com/images/I/31P20WACP9L._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN"", ""PT01"", … ""PT12""]}","{[""Slide up to reveal card"", ""Damaged presentation box "", … ""Happy Birthday Granddaughter ( Erin)""],[""https://www.amazon.com/vdp/974928ef005b44ab8c936b4c3c804114?ref=dp_vse_rvc_0"", ""https://www.amazon.com/vdp/e1d6ef73371a40259c0356ed56f8a206?ref=dp_vse_rvc_1"", … ""https://www.amazon.com/vdp/0617fcf2d1584599bf60cf09cb3e6e03?ref=dp_vse_rvc_6""],[""AHDUDAEJT7B5ZB6TRIZLM4RQX6FA"", ""AE2GM6GRDXUSYCFDK4XOQJLE7UKQ"", … ""AGLPQAOHVH5XHPWIVBMITF5DUOXA""]}","""Amazon""","[""Gift Cards"", ""Occasions"", ""Christmas""]","""{""Package Dimensions"": ""4.37 x…",,,,"""Amazon""",56,2019
5.0,"""Gift cards""","""These were used for Christmas …",[],"""B00FTGTM5E""","""B00FTGTIOE""","""AGQIEB52IPTI7LUBCDXJPLCOVZSQ""",1486482170000,1,true,"""Gift Cards""","""Starbucks $10 Gift Cards (4-Pa…",4.9,13066,"[""This item contains 4 separate $10 plastic gift cards"", ""Starbucks Cards redeemable at most SB locations"", … ""No returns and no refunds on gift cards.""]",[],"""40.0""","{[null],[""https://m.media-amazon.com/images/I/41g1u7SJubL.jpg""],[""https://m.media-amazon.com/images/I/41g1u7SJubL._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN""]}","{[""Unactivated 4pk. Starbucks cards"", ""Starbucks Card is perfect for anyone ."", ""Best Way to Redeem Starbucks Gift Card""],[""https://www.amazon.com/vdp/0dac88ab8f36441ba06fcbd0e1914291?ref=dp_vse_rvc_0"", ""https://www.amazon.com/vdp/07e405ee5b8c42ab9788d4025f50bb72?ref=dp_vse_rvc_1"", ""https://www.amazon.com/vdp/0f5c965adbbf48abaa3e3789fffc687c?ref=dp_vse_rvc_2""],[""AFQKTSVNC7SC2BJXAKD4O5SPVIGQ"", ""/shop/youdidntknow"", ""/shop/leontruong""]}","""Starbucks""","[""Gift Cards"", ""Gift Card Categories"", ""Grocery, Gourmet & Floral""]","""{""Package Dimensions"": ""5.16 x…",,,,"""Starbucks""",36,2017
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
5.0,"""Date night""","""Purchased as a gift, pared wit…",[],"""B00S5J8456""","""B00BXLTR6S""","""AEZETWLSVD4T7KBIG2MTPASEPFZA""",1545073315665,0,true,"""Gift Cards""","""Fandango Gift Card""",4.8,2292,"[""Redemption: Online"", ""Available for use at your favorite theater, thousands of theaters nationwide"", … ""No returns and no refunds on gift cards""]",[],"""50.0""","{[null],[""https://m.media-amazon.com/images/I/41GrT9yMOoL.jpg""],[""https://m.media-amazon.com/images/I/41GrT9yMOoL._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN""]}","{[""Fandango Gift Card""],[""https://www.amazon.com/vdp/03f58c3b27764be3a4e76e7e45e80af7?ref=dp_vse_rvc_0""],[""/shop/influencer-dcb4065c""]}","""Fandango""","[""Gift Cards"", ""Gift Card Categories"", ""Books, Movies & Music""]","""{""Package Dimensions"": ""5.2 x …",,,,"""Fandango""",13,2018
5.0,"""Gift card""","""Granddaughter loved it""",[],"""B01C9MW8Z6""","""B00JFBLZ90""","""AGUCXNRYT46NMKAAIWKWYWSZOOSQ""",1604175847079,0,true,"""Gift Cards""","""Amazon.com Gift Card in a Birt…",4.9,62550,"[""Gift Card is nested inside a specialty gift box"", ""Gift Card has no fees and no expiration date"", … ""Customized gift message, if chosen at check-out, only appears on packing slip and not on the actual gift card or carrier""]","[""Amazon.com Gift Cards are the perfect way to give someone exactly what they're hoping for. Recipients can choose from millions of items storewide. Amazon.ca Gift Cards never expire, so they can buy something immediately or wait for that sale of a lifetime.""]","""25.0""","{[""https://m.media-amazon.com/images/I/812v4i0vRdL._SL1500_.jpg"", ""https://m.media-amazon.com/images/I/81CeBhMlOcL._SL1500_.jpg"", … null],[""https://m.media-amazon.com/images/I/4108EUWUG1L.jpg"", ""https://m.media-amazon.com/images/I/41Z8fh84OzL.jpg"", … ""https://m.media-amazon.com/images/I/31UajW-NuKL.jpg""],[""https://m.media-amazon.com/images/I/4108EUWUG1L._SX38_SY50_CR,0,0,38,50_.jpg"", ""https://m.media-amazon.com/images/I/41Z8fh84OzL._SX38_SY50_CR,0,0,38,50_.jpg"", … ""https://m.media-amazon.com/images/I/31UajW-NuKL._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN"", ""PT01"", … ""PT12""]}","{[""It came very damaged and I was disappointed there wasn’t an option to get a new tin delivered."", ""Dissatisfied "", … ""Perfect last minute gift!""],[""https://www.amazon.com/vdp/03ec2e0049834e999e321de19afad614?ref=dp_vse_rvc_0"", ""https://www.amazon.com/vdp/7c0bf55249db4bab9eb7a4e43c015299?ref=dp_vse_rvc_1"", … ""https://www.amazon.com/vdp/0a25feb8f4b842fcaafdb7a0929a59f1?ref=dp_vse_rvc_5""],[""AH6RXR3PSVG4QTECLHQOZECP3R5A"", ""AGISMYD5W2ZIMYSDWH2CLXHGDIEA"", … ""AHDIBGYYJQCPG3IRJBUJZZDHTHTA""]}","""Amazon""","[""Gift Cards"", ""Occasions"", ""Birthday""]","""{""Package Dimensions"": ""4.57 x…",,,,"""Amazon""",3,2020
5.0,"""Always a hit""","""Got for the holidays. Starbuc…",[],"""B07ZS9XR5D""","""B07ZS9XR5D""","""AEXM2KQVSJKPFKRHPAU5ICIZIFRQ""",1579146871258,0,true,,"""Starbucks Red Siren Gift Card …",4.9,720,[],"[""Since 1971, Starbucks Coffee Company has been committed to ethically sourcing and roasting the highest quality arabica coffee in the world. Today, with stores around the globe, the company is the premier roaster and retailer of specialty coffee in the world. To share in the experience, please visit us in our stores or online at starbucks.com""]","""None""","{[null, null, null],[""https://m.media-amazon.com/images/I/51h9nXvTTcL.jpg"", ""https://m.media-amazon.com/images/I/51Mbe44JibL.jpg"", ""https://m.media-amazon.com/images/I/5134wNaccaL.jpg""],[""https://m.media-amazon.com/images/I/51h9nXvTTcL._SX38_SY50_CR,0,0,38,50_.jpg"", ""https://m.media-amazon.com/images/I/51Mbe44JibL._SX38_SY50_CR,0,0,38,50_.jpg"", ""https://m.media-amazon.com/images/I/5134wNaccaL._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN"", ""PT01"", ""PT02""]}","{[],[],[]}","""Starbucks""","[""Gift Cards"", ""Gift Card Categories"", ""Restaurants""]","""{""Date First Available"": ""Nove…",,,,"""Starbucks""",21,2020
5.0,"""Personal Use""","""Just what I needed for going t…",[],"""B00MOWEHIS""","""B00MOWEFWQ""","""AHUO2IJ6ZTQGKF4ZGLSPTZNOREZQ""",1568165119173,0,true,"""Gift Cards""","""Steak 'n Shake Gift Cards, Mul…",4.9,273,"[""Contains 4 plastic gift cards in your choice of design."", ""The perfect gift is easy and convenient to give. Give the gift of Real-Steak Steakburgers, Real-Milk Milk Shakes, and everything else on our full menu. Our gift cards can be used at any Steak 'n Shake anytime."", … ""No returns and no refunds on gift cards.""]","[""The perfect gift is easy and convenient to give. Give the gift of Real-Steak Steakburgers, Real-Milk Milk Shakes, and everything else on our full menu. Our gift cards can be used at any Steak ’n Shake anytime. Steak ’n Shake is a Classic American brand with nearly 500 locations throughout the United States. Good at any Steak 'n Shake restaurant.""]","""40.0""","{[null],[""https://m.media-amazon.com/images/I/51aA536PwML.jpg""],[""https://m.media-amazon.com/images/I/51aA536PwML._SX38_SY50_CR,0,0,38,50_.jpg""],[""MAIN""]}","{[],[],[]}","""Steak 'n Shake""","[""Gift Cards"", ""Gift Card Categories"", ""Restaurants""]","""{""Product Dimensions"": ""6.06 x…",,,,"""Steak 'n Shake""",19,2019


In [155]:
# auto_review: DatasetDict = load_dataset(
#     "McAuley-Lab/Amazon-Reviews-2023",
#     "raw_review_Automotive",
#     trust_remote_code=True,
# )

# auto_meta: DatasetDict = load_dataset(
#     "McAuley-Lab/Amazon-Reviews-2023",
#     "raw_meta_Automotive",
#     trust_remote_code=True,
#     streaming=True,
# )

In [156]:
# review_dataset: Dataset = auto_review["full"]
# meta_dataset: Dataset = auto_meta["full"]

In [157]:
# # def stream_to_pandas(dataset: Generator, batch_size=100_000, total=1_000_000) -> pd.DataFrame:
# #     batches: list[pd.DataFrame] = []
# #     it: Generator = iter(dataset)
    
# #     for _ in range(0, total, batch_size):
# #         chunk = list(islice(it, batch_size))
# #         if not chunk:
# #             break
# #         batches.append(pd.DataFrame(chunk))
    
# #     return pd.concat(batches, ignore_index=True)


# def stream_to_pandas(dataset: Generator, batch_size=100_000) -> pd.DataFrame:
#     batches: list[pd.DataFrame] = []
#     it: Generator = iter(dataset)
    
#     while True:
#         # Extract a batch of data
#         chunk = list(islice(it, batch_size))
#         if not chunk:  # Break the loop when the iterator is exhausted
#             break
#         batches.append(pd.DataFrame(chunk))
    
#     # Concatenate all batches into a single DataFrame
#     return pd.concat(batches, ignore_index=True)

In [158]:
# df_review: pd.DataFrame = stream_to_pandas(review_dataset, batch_size=100_000)
# df_meta: pd.DataFrame = stream_to_pandas(meta_dataset, batch_size=1_000_000)
# # df = pl.DataFrame(list(islice(meta_dataset, 10)))

In [159]:
# def extract_brand(row) -> str:
#     if row["store"]:
#         return row["store"]  
#     elif row["details"]:
#         for word in row["details"].split():
#             if "Brand" in word:
#                 return word
#     return "Unknown"

In [160]:
# merged_df: pd.DataFrame = pd.merge(df_review, df_meta, on="parent_asin", how="left")
# merged_df = merged_df[
#     merged_df["rating"].notnull() & merged_df["rating"].isin([1, 2, 3, 4, 5])
# ]
# merged_df = merged_df[merged_df["text"].str.strip().str.len() != 0]
# merged_df.loc[:, "brand"] = merged_df.apply(extract_brand, axis=1)
# merged_df['review_length'] = merged_df['text'].apply(lambda x: len(x.split()))
# merged_df['year'] = pd.to_datetime(merged_df['timestamp'], unit='ms').dt.year
# merged_df = merged_df.drop_duplicates(subset=["user_id", "text", "asin"], keep="first")

# df = pl.from_pandas(merged_df)
# df

In [161]:
# df.shape