# Build the Amazon Reviews dataset

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from collections import defaultdict
from experiments.dataset_info import *

## Inspect dataset

In [None]:
# full_amazon_category_dict

In [None]:
# Inspect different datasets within amazon_reviews
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", trust_remote_code=True)
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_rating_only_All_Beauty", trust_remote_code=True)
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_last_out_All_Beauty", trust_remote_code=True)
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_timestamp_w_his_All_Beauty", trust_remote_code=True)

In [None]:
# Open the All_Beauty raw-review config in streaming mode and grab an
# iterator over its "full" split so single records can be pulled on demand.
dataset_beauty = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
    streaming=True,
)
ds_beauty = iter(dataset_beauty["full"])

In [None]:
# Pull the next record from the ds_beauty iterator; re-running this cell advances it by one.
next(ds_beauty)

In [None]:
# Open the Electronics raw-review config in streaming mode and grab an
# iterator over its "full" split so single records can be pulled on demand.
dataset_electronics = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Electronics",
    trust_remote_code=True,
    streaming=True,
)
ds_electronics = iter(dataset_electronics["full"])

In [None]:
# Pull the next record from the ds_electronics iterator; re-running this cell advances it by one.
next(ds_electronics)

## Make dataset for experiments

In [None]:
def process_sample(sample, category, char_count_range):
    """Convert one raw review record into the flat record stored in the dataset.

    Args:
        sample: Mapping that provides the 'title', 'text' and 'rating' keys.
        category: Value stored unchanged under the 'category' key.
        char_count_range: Sequence whose last element caps how many characters
            of the review text are kept.

    Returns:
        A dict with keys 'text' (title plus truncated review), 'rating'
        (the raw rating looked up in ``amazon_rating_dict``, which is defined
        outside this cell) and 'category'.

    Raises:
        KeyError: If a required key is missing from ``sample`` or the rating
            has no entry in ``amazon_rating_dict``.
    """
    title = sample["title"]
    review = sample["text"]
    max_chars = char_count_range[-1]  # upper bound only truncates, it never rejects

    record = {}
    record['text'] = f'Title: {title}\nReview: {review[:max_chars]}'
    record['rating'] = amazon_rating_dict[sample['rating']]
    record['category'] = category
    return record

def filter_and_sample(dataset, ratings, category, n_samples, char_count_range):
    """Collect up to ``n_samples`` processed reviews per requested rating.

    Iterates ``dataset`` in order, skipping reviews whose text is shorter than
    ``char_count_range[0]`` or whose rating is not (or no longer) wanted, and
    stops as soon as every requested rating has reached its quota.

    Args:
        dataset: Iterable of review mappings with at least 'text' and 'rating'
            (plus whatever ``process_sample`` reads).
        ratings: Iterable of raw rating values to collect.
        category: Passed through to ``process_sample``.
        n_samples: Maximum number of samples to keep per rating.
        char_count_range: Sequence ``[min_chars, ..., max_chars]``; the first
            element is the minimum text length accepted here, and the whole
            sequence is forwarded to ``process_sample``.

    Returns:
        A ``defaultdict(list)`` mapping each rating that was seen to its list
        of processed samples. Ratings with no qualifying review are absent,
        and a rating may hold fewer than ``n_samples`` entries if the dataset
        is exhausted first.
    """
    samples_dict = defaultdict(list)
    # Ratings still below quota, mapped to the number collected so far.
    rating_counter = {rating: 0 for rating in ratings}

    # Nothing to collect: return before touching the (possibly streaming)
    # dataset. Without this guard a non-positive quota would still collect one
    # sample per rating, and an empty rating set would still consume items.
    if not rating_counter or n_samples <= 0:
        return samples_dict

    min_chars = char_count_range[0]
    for sample in dataset:
        if len(sample['text']) < min_chars:
            continue
        rating = sample['rating']
        if rating not in rating_counter:
            continue  # not requested, or its quota is already filled
        samples_dict[rating].append(process_sample(sample, category, char_count_range))
        rating_counter[rating] += 1
        if rating_counter[rating] >= n_samples:
            del rating_counter[rating]  # desired number of samples reached
            if not rating_counter:
                break  # every rating is full; stop reading the stream
    return samples_dict

def split_samples_dict(samples_dict, n_train_samples_per_rating, n_test_samples_per_rating):
    """Split each rating's sample list into a train part and a test part.

    For every list in ``samples_dict`` the first ``n_train_samples_per_rating``
    items go to the train split and the following
    ``n_test_samples_per_rating`` items go to the test split; anything beyond
    that is dropped.

    Args:
        samples_dict: Mapping from rating to a list of samples.
        n_train_samples_per_rating: Number of leading samples per rating used for training.
        n_test_samples_per_rating: Number of subsequent samples per rating used for testing.

    Returns:
        ``(train_samples, test_samples)`` as two flat lists, each holding the
        samples of all ratings in the mapping's iteration order.
    """
    train_end = n_train_samples_per_rating
    test_end = train_end + n_test_samples_per_rating

    train_samples = [
        item for group in samples_dict.values() for item in group[:train_end]
    ]
    test_samples = [
        item for group in samples_dict.values() for item in group[train_end:test_end]
    ]
    # Both lists mix samples from every rating.
    return train_samples, test_samples

In [None]:
# Load the datasets
# dataset_electronics = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True, streaming=True)
# dataset_beauty = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True, streaming=True)
# dataset_dict = {
#     'electronics': dataset_electronics['full'],
#     'beauty': dataset_beauty['full'],
# }

# Open a streaming handle on the "full" review split of every known category
# before any sampling starts.
categories_to_load = full_amazon_category_dict.keys()
dataset_dict = {}
for category in categories_to_load:
    dataset_dict[category] = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"raw_review_{category}",
        split='full',
        trust_remote_code=True,
        streaming=True,
    )

# Sampling configuration.
ratings = (1.0, 5.0) # There's no 0 star rating in the dataset
# ratings = (1.0, 2.0, 3.0, 4.0, 5.0) # There's no 0 star rating in the dataset
char_count_range = [500, 750]  # [minimum text length accepted, truncation length]
n_train_samples_per_rating = 10000
n_test_samples_per_rating = 2500
n_total_samples_per_rating = n_train_samples_per_rating + n_test_samples_per_rating

# Gather per-category samples and accumulate them into the two global splits.
train_samples = []
test_samples = []
for category, dataset in dataset_dict.items():
    print(f"Processing {category} dataset...")
    per_rating_samples = filter_and_sample(
        dataset,
        ratings,
        full_amazon_category_dict[category],
        n_total_samples_per_rating,
        char_count_range,
    )
    category_train, category_test = split_samples_dict(
        per_rating_samples,
        n_train_samples_per_rating,
        n_test_samples_per_rating,
    )
    train_samples += category_train
    test_samples += category_test

balanced_dataset = DatasetDict(
    {
        'train': Dataset.from_list(train_samples),
        'test': Dataset.from_list(test_samples),
    }
)

# Report the final split sizes.
print(*(len(balanced_dataset[split_name]) for split_name in ('train', 'test')))

In [None]:
# Show the DatasetDict built in the previous cell (its 'train' and 'test' splits).
balanced_dataset

In [None]:
# Upload to huggingface

# Derive a descriptive config name from the selected ratings, the loaded
# categories and the per-rating sample count.
ratings_part = "_".join(map(str, ratings))
categories_part = "_".join(dataset_dict)

fname = "_".join(["dataset", ratings_part, categories_part, str(n_total_samples_per_rating)])
# The derived name is immediately replaced by this fixed one.
fname = 'dataset_all_categories_ratings_1and5_train10000_test2500'

# balanced_dataset.push_to_hub(repo_id = f"canrager/amazon_reviews_mcauley", config_name=f"{fname}")

### Inspect how the bias_in_bios (bib) dataset is formatted

In [None]:
# Reuse the project's loader so the bias_in_bios data appears here in the same
# form the probe-training code receives it.
from experiments.probe_training import load_and_prepare_dataset


# Returns a (train, test) pair; the later cells call .head() and .groupby() on it,
# so these are presumably pandas DataFrames — confirm against load_and_prepare_dataset.
bib_train_df, bib_test_df = load_and_prepare_dataset('bias_in_bios')

In [None]:
# Preview the first rows of the bias_in_bios training split.
bib_train_df.head()

In [None]:
# Sizes of the train and test splits, as a (train, test) tuple.
len(bib_train_df), len(bib_test_df)

In [None]:
# Count training rows for every (profession, gender) pair, then pivot gender
# into columns so each profession is one row; missing pairs are shown as 0.
pair_counts = bib_train_df.groupby(['profession', 'gender']).size()
result = pair_counts.unstack(fill_value=0)
print(result)

In [None]:
# Try running stuff with this dataset: replicate test_interventions.py