1. Loading data for reviews and metadata

In [1]:
from datasets import load_dataset

# Load the reviews dataset for Gift Cards
reviews = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_Video_Games",
    split="full",
    trust_remote_code=True
)

Video_Games.jsonl:   0%|          | 0.00/2.68G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [2]:
# Load the metadata dataset for Gift Cards
metadata = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Video_Games",
    split="full",
    trust_remote_code=True
)

meta_Video_Games.jsonl:   0%|          | 0.00/437M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [3]:
reviews_df = reviews.to_pandas()
metadata_df = metadata.to_pandas()

In [4]:
reviews_df = reviews_df[['text', 'parent_asin']]
metadata_df = metadata_df[['title', 'parent_asin']]

2. Processing data(use parent_asin to link reviews and metadata)

In [5]:
import pandas as pd

In [6]:
processed_dataset = pd.merge(reviews_df, metadata_df, on='parent_asin', how='inner')

In [7]:
processed_dataset = processed_dataset[['title', 'text']]

3. Testing for data format

In [10]:
# Test few rows to make sure data is processed correctly
processed_dataset['title'][:10]



0      Cyberpunk 2077 - PC [Game Download Code in Box]
1            Final Fantasy VII: Remake - PlayStation 4
2    Sid Meier’s Civilization VI: Rise and Fall [On...
3    PowerA Enhanced Wireless Controller for Ninten...
4      KontrolFreek FPS Freek CQC Signature - Xbox One
5    havit Gaming Keyboard and Mouse Combo, Backlit...
6    Playstation Plus: 3 Month Membership [Digital ...
7    Razer Kraken Gaming Headset: Lightweight Alumi...
8    Aenllosi Hard Carrying Case Compatible with Se...
9    PlayStation Plus: 1 Month Membership [Digital ...
Name: title, dtype: object

4. Import language model

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Define arguments for loading the model
class args:
    model_name_or_path = "EleutherAI/gpt-neo-125m"
    cache_dir = "./cache/"
    model_revision = "main"
    use_fast_tokenizer = True

# Load model configuration
config = AutoConfig.from_pretrained(
    args.model_name_or_path,
    cache_dir=args.cache_dir,
    revision=args.model_revision,
    use_auth_token=None,  # No authentication token required for this public model
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    cache_dir=args.cache_dir,
    use_fast=args.use_fast_tokenizer,
    revision=args.model_revision,
    use_auth_token=None,
)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    from_tf=bool(".ckpt" in args.model_name_or_path),  # Handle TensorFlow checkpoints if applicable
    config=config,
    cache_dir=args.cache_dir,
    revision=args.model_revision,
    use_auth_token=None,
).to("cuda")

# Print model summary
print(f"Model loaded: {args.model_name_or_path}")
print(f"Model parameters: {model.num_parameters()}")


5. Prepare and check final dataset

In [None]:
# Assign padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [66]:
def get_data(df) -> dict:
    # transform dataframe into dataset
    data_dict = Dataset.from_pandas(
        df
    ).train_test_split(
        0.1
    ).map(
        format_item
    ).remove_columns(
        ["title", "text"]
    )
    
    return data_dict

# format question-answer pair into content to be fed in to model
def format_item(item) -> dict:
    product = item["title"].strip()
    review = item["text"].strip()
    return { "content": f"Product: {product}\nReview: {review}" }

data_dict = get_data(processed_dataset)

Map:   0%|          | 0/1915744 [00:00<?, ? examples/s]

Map:   0%|          | 0/212861 [00:00<?, ? examples/s]

In [67]:
data_dict

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 1915744
    })
    test: Dataset({
        features: ['content'],
        num_rows: 212861
    })
})

6. Train model

In [68]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Customize training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=5000,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    gradient_checkpointing=True,
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=data_dict['train'],
    eval_dataset=data_dict['test'],
    dataset_text_field="content",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Start training
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1915744 [00:00<?, ? examples/s]

Map:   0%|          | 0/212861 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
100,15.5142
200,12.8847
300,12.0
400,11.367
500,10.8844
600,10.4448
700,10.2778
800,10.1183
900,9.8829
1000,9.6315


TrainOutput(global_step=5000, training_loss=8.939485327148438, metrics={'train_runtime': 1231.6987, 'train_samples_per_second': 64.951, 'train_steps_per_second': 4.059, 'total_flos': 3278018468118528.0, 'train_loss': 8.939485327148438, 'epoch': 0.04175923296640887})

In [75]:
example_inputs = tokenizer([f"Product: Samsung dryer\nReview:"], return_tensors="pt").to(model.device)
example_outputs = model.generate(
    **example_inputs, 
    max_length=200,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    use_cache=True,
    pad_token_id=tokenizer.eos_token_id)
output_text = tokenizer.decode(example_outputs[0])

In [None]:
print(output_text)