# Download, extract and store data locally

# User params

In [1]:
# Possible categories to download:
# --------------------------------
# All_Beauty
# Amazon_Fashion
# Appliances
# Arts_Crafts_and_Sewing
# Automotive
# Baby_Products
# Beauty_and_Personal_Care
# Books
# CDs_and_Vinyl
# Cell_Phones_and_Accessories
# Clothing_Shoes_and_Jewelry
# Digital_Music
# Electronics
# Gift_Cards -> not many outliers
# Grocery_and_Gourmet_Food
# Handmade_Products
# Health_and_Household
# Health_and_Personal_Care
# Home_and_Kitchen
# Industrial_and_Scientific
# Kindle_Store
# Magazine_Subscriptions
# Movies_and_TV
# Musical_Instruments
# Office_Products
# Patio_Lawn_and_Garden
# Pet_Supplies
# Software
# Sports_and_Outdoors
# Subscription_Boxes
# Tools_and_Home_Improvement
# Toys_and_Games
# Video_Games

category_to_download = 'Appliances'  # pick one of the category names listed above


# Settings for the `datasets` library
# -----------------------------------
# Flavour of the data to download:
#   * entire unprocessed data:
#       'raw_'
#   * processed data:
#       "0core_rating_", "0core_timestamp_", "0core_timestamp_w_his_"
#       "5core_rating_", "5core_timestamp_", "5core_timestamp_w_his_"
# NOTE(review): this name shadows the Python builtin `type` for the rest of the
# notebook; kept as-is because other cells may read it.
type = "raw_"
# Dataset split to work with: "full", "train", "valid" or "test".
split = 'full'


# Utility Functions

In [2]:
import os, sys
sys.path.append(os.path.abspath('../src'))

# other imports
from pathlib import Path

# Processing - native

In [3]:
# import os
# from src.config import BASE_PATH_DATA
# from src.utils.io.io import load_dataframe

# from src.utils.io.download import download_data
# download_data(category_to_download)

# # load data
# file_name_review = category_to_download + "_review.json"
# file_name_metadata = category_to_download + "_metadata.json"

# # read json file into dataframe
# df_review = load_dataframe(os.path.join(BASE_PATH_DATA, 'raw', file_name_review))
# df_metadata = load_dataframe(os.path.join(BASE_PATH_DATA, 'raw', file_name_metadata))


# # Merge the datasets on 'parent_asin' with suffixes for duplicate columns
# merged_df = pd.merge(df_review, df_metadata, 
#                      on="parent_asin", how="inner", suffixes=("_review", "_metadata"))

# Processing - using datasets library

In [4]:
from datasets import load_dataset

# Hub repository that hosts both the review and the metadata configurations.
hf_repo = "McAuley-Lab/Amazon-Reviews-2023"

# Reviews: loaded without a `split` argument, so the returned object is
# subscripted with the split name here (and again when converting to pandas).
dataset_reviews = load_dataset(
    hf_repo,
    f"raw_review_{category_to_download}",
    trust_remote_code=True,
)
reviews_split = dataset_reviews[split]
print(reviews_split.num_rows)
print(reviews_split[0])

# Metadata: the split is selected at load time, so the result is used directly.
dataset_medata = load_dataset(
    hf_repo,
    f"raw_meta_{category_to_download}",
    split=split,
    trust_remote_code=True,
)
print(dataset_medata.num_rows)
print(dataset_medata[0])

Downloading data:   0%|          | 0.00/929M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

2128605
{'rating': 5.0, 'title': 'Work great', 'text': 'work great. use a new one every month', 'images': [], 'asin': 'B01N0TQ0OH', 'parent_asin': 'B01N0TQ0OH', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1519317108692, 'helpful_vote': 0, 'verified_purchase': True}


Downloading data:   0%|          | 0.00/285M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

94327
{'main_category': 'Industrial & Scientific', 'title': 'ROVSUN Ice Maker Machine Countertop, Make 44lbs Ice in 24 Hours, Compact & Portable Ice Maker with Ice Basket for Home, Office, Kitchen, Bar (Silver)', 'average_rating': 3.7, 'rating_number': 61, 'features': ['【Quick Ice Making】This countertop ice machine creates crystal & bullet shaped ice cubes; 44lbs of ice ready in 24 hours, 12 cubes made per cycle within 10 mins; you can perfectly use it for drinks, wine, smoothies, food', '【Portable Design】The weight of this ice maker is only 23.3lbs, and the small size (10.63 x14.37 x 12.87)" makes it portable. It\'s compact feature is perfect for home, office, apartments, dormitories, RVs and more, it can be placed on countertop or tabletop, plug it anywhere you like', '【Simple Operation】Adding the water tank with purified water; Power on machine and press "on/off" button to start ice making process; After 8-12 minutes, ice cube will fall off into the ice basket automatically; Take it

In [5]:
import pandas as pd
import os

# Transform to pandas
df_review = pd.DataFrame(dataset_reviews[split])
df_metadata = pd.DataFrame(dataset_medata)

# Join each review with the metadata of its product. The inner join keeps only
# rows whose 'parent_asin' appears in both frames; column names shared by both
# frames are disambiguated with the "_review" / "_metadata" suffixes.
df = pd.merge(df_review, df_metadata, on="parent_asin", how="inner", suffixes=("_review", "_metadata"))

# Drop duplicates so that we do not have data leakage.
# Only the columns below are compared.
# NOTE(review): presumably the omitted columns are the list-valued ones, which
# cannot be hashed by drop_duplicates — confirm.
# from src.utils.preprocessing.duplicates import drop_duplicates_with_unhashable
# merged_df = drop_duplicates_with_unhashable(merged_df, ['user_id', 'timestamp'])
subset_columns = ['rating', 'title_review', 'text', 'asin',
                  'parent_asin', 'user_id', 'timestamp', 'helpful_vote',
                  'verified_purchase', 'main_category', 'title_metadata',
                  'average_rating', 'rating_number', 'price',
                  'store', 'bought_together', 'subtitle', 'author']
# Assignment instead of inplace=True: same result, no hidden mutation.
df = df.drop_duplicates(subset=subset_columns)

# Find the most frequent label in the main_category column and
# filter the DataFrame to keep only rows with the most frequent label (some were in wrong category)
category_counts = df["main_category"].value_counts()
if category_counts.empty:
    # idxmax() on an empty Series raises an opaque ValueError; fail with a
    # message that points at the actual cause instead.
    raise ValueError(
        "No rows with a main_category left after merging reviews and metadata "
        "on 'parent_asin'; cannot determine the dominant category."
    )
main_label = category_counts.idxmax()
df = df[df["main_category"] == main_label]

# Save it

In [6]:
# Write the merged and filtered frame to ../data/raw/merged_dataset.parquet.
dest_folder = Path("../data/raw")
# to_parquet does not create missing directories, so make sure the target exists.
dest_folder.mkdir(parents=True, exist_ok=True)
# Kept as a plain string, matching the value os.path.join produced before.
save_name = str(dest_folder / 'merged_dataset.parquet')
df.to_parquet(save_name, index=False)