1. Loading data for reviews and metadata

In [None]:
from datasets import load_dataset

# Both Gift Cards tables come from the same Hub repository and are loaded
# with the same options; only the configuration name differs between them.
AMAZON_REVIEWS_REPO = "McAuley-Lab/Amazon-Reviews-2023"
LOAD_OPTIONS = {"split": "full", "trust_remote_code": True}

# Customer reviews for the Gift Cards category
gift_cards_reviews = load_dataset(
    AMAZON_REVIEWS_REPO, name="raw_review_Gift_Cards", **LOAD_OPTIONS
)

# Product metadata for the Gift Cards category
gift_cards_metadata = load_dataset(
    AMAZON_REVIEWS_REPO, name="raw_meta_Gift_Cards", **LOAD_OPTIONS
)


2. Processing data (use parent_asin to link reviews and metadata)

In [5]:
from collections import defaultdict

# Step 1: index the metadata so each parent_asin maps to its product title.
# If a parent_asin occurs more than once, the last title seen is kept.
metadata_by_asin = dict(
    (meta["parent_asin"], meta["title"]) for meta in gift_cards_metadata
)

# Step 2: walk the reviews once and keep only those whose parent_asin has a
# metadata entry, collecting the matched title next to the review text.
titles = []
texts = []
for review in gift_cards_reviews:
    parent_asin = review.get("parent_asin")
    if parent_asin not in metadata_by_asin:
        continue  # no metadata for this review; leave it out
    titles.append(metadata_by_asin[parent_asin])
    texts.append(review["text"])

combined_data = {"title": titles, "text": texts}


from datasets import Dataset

# Wrap the two parallel columns in a Hugging Face Dataset
processed_dataset = Dataset.from_dict(combined_data)

# Show the first row as a quick sanity check
print(processed_dataset[0])


{'title': 'Amazon Reload', 'text': 'Having Amazon money is always good.'}


3. Testing for data format

In [6]:
# Spot-check the first three rows to confirm the title/text pairing looks right

for row_index in (0, 1, 2):
    print(processed_dataset[row_index])



{'title': 'Amazon Reload', 'text': 'Having Amazon money is always good.'}
{'title': 'Amazon.com Gift Card in a Black Gift Box (Thank You Card Design)', 'text': "Always the perfect gift.  I have never given one and had someone seem or act disappointed.  Just the opposite.  They are thrilled and excited to have a bit of a spree.  Always the perfect size and color!  Arrives in 1 day in most cases.  So it's never too late!  Lots of cards to chose from... thank you... birthday... wedding..baby..  and many that work for many occasions..."}
{'title': 'Amazon.com Gift Card in a Gift Box (Various Thank You Designs)', 'text': "When you have a person who is hard to shop for.. an amazon gift card is P E R F E C T.  Man or woman...  No matter what their hobby... lifestyle.. or age.  All you have to do is pick the $.  Don't forget to mention that it is a GIFT when you check out - you will have some gift card options.  I've ordered many of these over years.  They are always received with glee.  Woo h

4. Import language model

In [7]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Define arguments for loading the model
class args:
    """Static settings shared by the config, tokenizer and model loaders below."""

    model_name_or_path = "EleutherAI/gpt-neo-125m"  # model id handed to every from_pretrained call
    cache_dir = "./cache/"  # forwarded as cache_dir
    model_revision = "main"  # forwarded as revision
    use_fast_tokenizer = True  # forwarded as use_fast to the tokenizer loader

# Keyword arguments common to all three from_pretrained calls.
# The earlier `use_auth_token=None` argument is intentionally omitted: None is
# its default value, and the keyword is deprecated in favour of `token`, so
# leaving it out keeps today's behaviour while staying compatible with
# releases that no longer accept it.
hub_kwargs = {
    "cache_dir": args.cache_dir,
    "revision": args.model_revision,
}

# Load model configuration
config = AutoConfig.from_pretrained(args.model_name_or_path, **hub_kwargs)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    use_fast=args.use_fast_tokenizer,
    **hub_kwargs,
)

# Load the model, reusing the configuration object loaded above
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    # True only when the model path contains ".ckpt"; `in` already yields a bool
    from_tf=".ckpt" in args.model_name_or_path,
    config=config,
    **hub_kwargs,
)

# Print model summary
print(f"Model loaded: {args.model_name_or_path}")
print(f"Model parameters: {model.num_parameters()}")


Model loaded: EleutherAI/gpt-neo-125m
Model parameters: 125198592


5. Prepare and check final dataset

In [8]:
# Make sure the tokenizer has a padding token: when none is configured,
# fall back to the end-of-sequence token.
# NOTE(review): presumably required so batches can be padded during
# training — confirm against the trainer's expectations.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the pad token

# Merge each row's product title and review body into one tagged string
def combine_columns(example):
    """Build the combined ``text`` field for a single dataset row.

    Reads ``example['title']`` and ``example['text']`` and returns a
    one-key dict whose ``text`` value places the title after a
    ``<Product Title>:`` tag and the review after a ``<Review>:`` tag.
    """
    template = "<Product Title>: {title} <Review>: {text}"
    merged = template.format(title=example["title"], text=example["text"])
    return {"text": merged}

# Apply combine_columns to every row; the returned "text" value replaces the
# original review text while the "title" column is kept alongside it.
train_dataset = processed_dataset.map(combine_columns)

# Show the first transformed row
print("Sample entry from train_dataset:", train_dataset[0], sep="\n")

# Report how many rows the training set holds
print("Number of entries in train_dataset: " + str(len(train_dataset)))



Map:   0%|          | 0/152410 [00:00<?, ? examples/s]

Sample entry from train_dataset:
{'title': 'Amazon Reload', 'text': '<Product Title>: Amazon Reload <Review>: Having Amazon money is always good.'}
Number of entries in train_dataset: 152410


6. Train model

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Hyperparameters for the fine-tuning run, gathered in one mapping so they
# can be reviewed at a glance before being handed to TrainingArguments.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
# confirm whether the warmup has any effect with that scheduler.
training_settings = {
    "output_dir": "./results",
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "optim": "adamw_torch",
    "save_steps": 10,
    "logging_steps": 10,
    "learning_rate": 2e-4,
    "fp16": True,
    "max_grad_norm": 0.3,
    "max_steps": 50,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "lr_scheduler_type": "constant",
    "gradient_checkpointing": True,
}
training_arguments = TrainingArguments(**training_settings)

# Build the trainer around the loaded model and tokenizer; training reads the
# "text" column of train_dataset with a maximum sequence length of 512.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=512,
)

# Run the training loop
trainer.train()
