In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

In [None]:
# Categories to download and process. Each name is appended to "raw_review_" to
# pick the dataset configuration to load. Commented-out entries are skipped;
# uncomment a line to include that category. Every category is a separate,
# large download (Video_Games was ~2.68 GB and took ~13 minutes in the
# recorded run), so enable only what you need.
categories = [
    # "Digital_Music",
    # "Magazine_Subscriptions",
    # "Movies_and_TV",
    # "Musical_Instruments",
    # "Sports_and_Outdoors",
    # "Toys_and_Games",
    "Video_Games"
]

In [None]:
# Function to load, clean, and transform the dataset
def load_and_clean_category(category, start_date="2022-01-01", end_date="2023-12-31"):
    """Download one review category and return it as a cleaned DataFrame.

    The function loads the ``raw_review_{category}`` configuration of
    ``McAuley-Lab/Amazon-Reviews-2023``, takes its first split, converts it to
    pandas, keeps only reviews inside the requested date window, drops the
    ``images`` column and renames ``title``/``text``/``timestamp`` to
    ``review_title``/``review_text``/``date``.

    Parameters
    ----------
    category : str
        Category name; it is interpolated into ``raw_review_{category}``.
    start_date : str or datetime-like, default "2022-01-01"
        Inclusive lower bound of the review timestamp window.
    end_date : str or datetime-like, default "2023-12-31"
        Last calendar day of the window. The whole day is included, i.e. the
        filter keeps timestamps strictly before midnight of the following day.

    Returns
    -------
    pandas.DataFrame
        The cleaned reviews. The original row index of the surviving rows is
        preserved (it is not reset).

    Raises
    ------
    ValueError
        If the loaded dataset exposes no splits.
    """
    # SECURITY: trust_remote_code=True runs the loading script shipped in the
    # dataset repository on this machine. Only keep it for repositories you trust.
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"raw_review_{category}",
        trust_remote_code=True,
    )

    # Determine available splits
    available_splits = list(dataset.keys())
    print(f"Available splits for {category}: {available_splits}")
    if not available_splits:
        raise ValueError(f"No splits available for category {category!r}")

    # Use the first available split (assuming it has data)
    split_name = available_splits[0]

    # Let the dataset convert itself to pandas instead of having
    # pd.DataFrame(...) iterate over the split row by row in Python.
    df = dataset[split_name].to_pandas()

    # Convert 'timestamp' (milliseconds since epoch) to datetime and keep only
    # rows inside [start_date, end_date].
    if 'timestamp' in df.columns:
        timestamps = pd.to_datetime(df['timestamp'], unit='ms')
        window_start = pd.Timestamp(start_date)
        # Exclusive upper bound at the start of the next day. Comparing with
        # `<= end_date` would stop at midnight and drop every review written
        # later on the final day.
        window_end = pd.Timestamp(end_date).normalize() + pd.Timedelta(days=1)
        in_window = (timestamps >= window_start) & (timestamps < window_end)
        # Filter first, then swap in the parsed timestamps, so the full-size
        # frame is never copied.
        df = df.loc[in_window].assign(timestamp=timestamps.loc[in_window])

    # Remove the 'images' column
    if 'images' in df.columns:
        df = df.drop(columns=['images'])

    # Rename columns
    df = df.rename(columns={
        'title': 'review_title',
        'text': 'review_text',
        'timestamp': 'date'
    })

    return df

In [None]:
# Build the {category name -> cleaned DataFrame} mapping. The progress bar
# advances once per category; each step is a full download plus cleaning pass.
category_dataframes = dict()
for category in tqdm(categories, desc="Loading and cleaning datasets"):
    category_dataframes.update({category: load_and_clean_category(category)})

Loading and cleaning datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.68G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Available splits for Video_Games: ['full']


Loading and cleaning datasets: 100%|██████████| 1/1 [13:10<00:00, 790.43s/it]


In [None]:
# Write every cleaned frame to "<category>_reviews_processed.csv" in the current
# working directory. index=False keeps the row index out of the file.
for category, df in category_dataframes.items():
    output_path = f"{category}_reviews_processed.csv"
    df.to_csv(output_path, index=False)

In [None]:
# Example: preview the cleaned dataframe of one loaded category.
# The key is taken from the dictionary itself rather than hard-coded, so this
# cell does not raise KeyError when a category (e.g. 'Digital_Music') is
# commented out of `categories`.
example_category = next(iter(category_dataframes))
example_df = category_dataframes[example_category]
# Show only the first rows; displaying the whole frame bloats the notebook.
example_df.head()


Unnamed: 0,rating,review_title,review_text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
7,5.0,Giftfor mom. She loves it.,It took two months before she got her first ma...,B00005R8BR,B00005R8BR,AGRUKTHXESHTPHHK4GQYQNRFZP3A,2023-01-14 22:00:58.264,0,True
12,5.0,Good magozine,I got. This Magizime for many yeats,B01IAIX74Y,B01IAIX74Y,AETAH6GK4TSOTHIVPBFPYZMIQRRA,2022-01-10 20:26:59.925,0,True
47,4.0,New Mexico Travel and Culture,I like magazines that emphasize travel and cul...,B00006KPSH,B00006KPSH,AFERCDY2EFJKT7QUQ75GISNHTFOQ,2022-03-25 18:23:30.565,0,False
117,5.0,Love this gossipy magazine,"I love this ""gossipy"" tabloid magazine! It ma...",B002XPVNOC,B002XPVNOC,AGNFCXSJSYDHFRF4ZKIEDV3GVZFA,2022-08-09 08:10:29.892,6,True
126,5.0,This was a gift for my grandson,This was a gift,B07DRWHT3P,B07DRWHT3P,AFAR7QJKL73Z6OYJ7343HUIDZOEA,2022-03-23 22:29:28.253,0,True
...,...,...,...,...,...,...,...,...,...
71251,2.0,It's not a magazine worth reordering,I was after good information on professional b...,B00HETE8G2,B00HETE8G2,AFYFMJUJ6RWXSYMXG27IK3Z76AXQ,2022-11-11 19:01:38.429,1,True
71325,1.0,Terrible!,It took forever to get the first issue and I t...,B00P06VXRS,B00P06VXRS,AERJCBN4CQNDFL6BG2NXVQ76AJAQ,2022-08-12 00:51:07.648,0,True
71340,1.0,It doesn’t come with any pattern.,It does not come with any patterns or instruct...,B000063XJP,B000063XJP,AGVXVWSIA2RJ5ZVAGEMLPIR3UFMQ,2022-01-20 19:44:50.333,0,True
71358,5.0,Wonderful nooks.,We love these book. I buy them for my grandchi...,B003I7HS4I,B003I7HS4I,AH5TW772DHT5DQ7FKJZB4XW6ZONQ,2023-01-30 20:25:40.575,1,True
