In [1]:
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import re
from collections import defaultdict
from datasets import load_from_disk
from datasets import Dataset, DatasetDict
from datasets import Dataset
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
category="Books"
print(f"Loading datasets for category: {category}")
review_dataset = load_from_disk(
    f"/root/autodl-tmp/dataset/raw_review_{category}"
)
meta_dataset = load_from_disk(
    f"/root/autodl-tmp/dataset/raw_meta_{category}"
)

Loading datasets for category: Books


In [3]:
# ---------------------------
# Preprocessing Functions
# ---------------------------

def review_filter(x):
    # Filter reviews with sufficient content
    return (
        x["text"]
        and len(x["text"]) >= 100
        and x["rating"]
        and x["title"]
        and x["timestamp"]
    )

def create_description(x):
    # Extract and concatenate product description
    desc = x["description"]
    if not desc or (desc[0] == "Product Description" and len(desc) == 1):
        return ""
    if desc[0] == "Product Description":
        return " ".join(desc[1:])
    return " ".join(desc)

def meta_filter(x):
    # Filter meta entries with meaningful information
    return (
        100 <= len(create_description(x)) <= 2000
        and x["title"]
        and x["categories"]
        and x["rating_number"]
    )

In [4]:
# ---------------------------
# Meta Dataset Preprocessing
# ---------------------------

meta_columns = [
    col for col in meta_dataset.column_names if col not in {"title", "description"}
]
meta_dataset = meta_dataset.filter(
    meta_filter, num_proc=64, load_from_cache_file=False
)
meta_dataset = meta_dataset.map(
    lambda x: {
        "asin": x["parent_asin"],
        "title": x["title"],
        "description": create_description(x),
    },
    num_proc=64,
    remove_columns=meta_columns,
    load_from_cache_file=False,
)

valid_asins = set(meta_dataset["asin"])

Filter (num_proc=64): 100%|██████████| 4448181/4448181 [00:08<00:00, 512941.04 examples/s]
Map (num_proc=64): 100%|██████████| 1174554/1174554 [00:04<00:00, 243312.17 examples/s]


In [5]:
# ---------------------------
# Review Dataset Preprocessing
# ---------------------------

review_columns = [
    col
    for col in review_dataset.column_names
    if col not in {"user_id", "asin", "title", "text", "rating", "timestamp"}
]
review_dataset = review_dataset.filter(
    review_filter, num_proc=64, load_from_cache_file=False
)
review_dataset = review_dataset.map(
    lambda x: {
        "user_id": x["user_id"],
        "asin": x["parent_asin"],
        "title": x["title"].strip(),
        "text": x["text"].strip(),
        "rating": x["rating"],
        "timestamp": x["timestamp"],
    },
    num_proc=64,
    remove_columns=review_columns,
    load_from_cache_file=False,
)
review_dataset = review_dataset.filter(
    lambda x: x["asin"] in valid_asins, num_proc=64, load_from_cache_file=False
)


Filter (num_proc=64): 100%|██████████| 29475453/29475453 [00:09<00:00, 3052047.58 examples/s]
Map (num_proc=64): 100%|██████████| 20438691/20438691 [00:58<00:00, 346962.86 examples/s]
Filter (num_proc=64): 100%|██████████| 20438691/20438691 [00:06<00:00, 2998152.78 examples/s]


In [6]:
K = 6

In [7]:
def filter_users(dataset):
    user_review = defaultdict(list)

    print("Building user → review map...")
    for r in tqdm(dataset, desc="Collecting reviews"):
        user_review[r["user_id"]].append({
            "asin": r["asin"],
            "title": r["title"],
            "text": r["text"],
            "rating": r["rating"],
            "timestamp": r["timestamp"],
        })

    print("Deduplicating & filtering...")
    filtered_user_review = {}
    for user in tqdm(user_review, desc="Processing users"):
        reviews = user_review[user]
        reviews.sort(key=lambda r: r["timestamp"], reverse=True)

        seen_asin, seen_text = set(), set()
        deduped = []
        for r in reviews:
            if r["asin"] not in seen_asin and r["text"] not in seen_text:
                seen_asin.add(r["asin"])
                seen_text.add(r["text"])
                deduped.append(r)

        if len(deduped) >= K:
            filtered_user_review[user] = deduped

    print("Rebuilding dataset...")
    all_filtered = [r for reviews in filtered_user_review.values() for r in reviews]
    filtered_dataset = Dataset.from_list(all_filtered)

    return filtered_dataset, filtered_user_review


review_dataset, user_review = filter_users(review_dataset)

Building user → review map...


Collecting reviews: 100%|██████████| 6579093/6579093 [08:59<00:00, 12197.29it/s]


Deduplicating & filtering...


Processing users: 100%|██████████| 3399430/3399430 [00:08<00:00, 392546.03it/s]


Rebuilding dataset...


In [8]:
def clean_and_split_reviews():
    for user_id, reviews in tqdm(user_review.items()):
        # Sort by time and clean text
        reviews.sort(key=lambda x: x["timestamp"], reverse=False)
        for r in reviews:
            r["text"] = re.sub(r"\s+", " ", r["text"].strip())

    # Split into profile (past) and input (future) reviews
    user_review_split = {}
    for user_id, reviews in tqdm(user_review.items()):
        reviews = reviews[-K:]
        history, target = reviews[:-1], reviews[-1]
        user_review_split[user_id] = (history, target)
    return user_review_split

user_review_split = clean_and_split_reviews()

  0%|          | 0/129341 [00:00<?, ?it/s]

100%|██████████| 129341/129341 [01:16<00:00, 1698.06it/s]
100%|██████████| 129341/129341 [00:00<00:00, 422898.51it/s]


In [None]:
seed = 42
random.seed(seed)

all_users = list(user_review_split.keys())
random.shuffle(all_users)

n_total = len(all_users)
n_train = int(n_total * 0.8)
n_val = int(n_total * 0.1)
n_test = n_total - n_train - n_val

train_users = set(all_users[:n_train])
val_users = set(all_users[n_train:n_train + n_val])
test_users = set(all_users[n_train + n_val:])

train_data, val_data, test_data = [], [], []
for user_id, (history, target) in tqdm(user_review_split.items()):
    if user_id in train_users:
        train_data.append((user_id, history, target))
    if user_id in val_users:
        val_data.append((user_id, history, target))
    if user_id in test_users:
        test_data.append((user_id, history, target))

100%|██████████| 129341/129341 [00:00<00:00, 811376.83it/s]


In [None]:
len(train_users), len(val_users), len(test_users)

(103472, 12934, 12935)

In [None]:
def convert_to_dataset(data):
    user_ids, historys, targets = zip(*data)
    return Dataset.from_dict(
        {
            "user_id": list(user_ids),
            "history": list(historys),
            "target": list(targets),
        }
    )
train_dataset = convert_to_dataset(train_data)
val_dataset = convert_to_dataset(val_data)
test_dataset = convert_to_dataset(test_data)

In [None]:
main_dataset = DatasetDict(
    {
        "train": train_dataset,
        "val": val_dataset,
        "test": test_dataset,
    }
)
meta_dataset = DatasetDict({"full": meta_dataset})

In [13]:
# ---------------------------
# Save to Disk
# ---------------------------

main_dataset.save_to_disk(f"/root/autodl-tmp/dataset/review_{category}")
meta_dataset.save_to_disk(f"/root/autodl-tmp/dataset/meta_{category}")

Saving the dataset (1/1 shards): 100%|██████████| 103472/103472 [00:00<00:00, 367472.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12934/12934 [00:00<00:00, 326420.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12935/12935 [00:00<00:00, 322333.98 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 1174554/1174554 [00:01<00:00, 1165474.57 examples/s]
