# Imports

In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
import torch
import os

# Ask PyTorch's CUDA caching allocator to release cached blocks it is not using.
# NOTE(review): presumably here to reclaim GPU memory left over from earlier
# runs in the same kernel; it has nothing to free on a fresh kernel.
torch.cuda.empty_cache()

# Load dataset

In [2]:
# Fetch the "full" split of the raw Video Games review configuration from the
# Amazon-Reviews-2023 repository on the Hugging Face Hub.
reviews = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_Video_Games",
    split="full",
    trust_remote_code=True,
)

In [3]:
# Fetch the "full" split of the raw Video Games product-metadata configuration
# from the same Amazon-Reviews-2023 repository.
metadata = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Video_Games",
    split="full",
    trust_remote_code=True,
)

# Prepare dataset for training

In [4]:
# Materialise both Hugging Face datasets as pandas DataFrames.
reviews_df = reviews.to_pandas()
metadata_df = metadata.to_pandas()

# Attach each review's product title by joining on the shared `parent_asin`
# key. The inner join drops reviews that have no matching metadata row.
# Only the two columns needed to build the training text are kept.
processed_dataset = (
    reviews_df[["text", "parent_asin"]]
    .merge(
        metadata_df[["title", "parent_asin"]],
        on="parent_asin",
        how="inner",
    )
    [["title", "text"]]
)

In [5]:
def get_data(df, test_size=0.1, seed=42) -> dict:
    """Turn a title/text DataFrame into train/test splits of formatted text.

    The frame is converted to a Hugging Face ``Dataset``, split into "train"
    and "test" partitions, and every row is passed through ``format_item``.
    The original "title" and "text" columns are dropped, so each split ends
    up with the single "content" column that ``format_item`` produces.

    Args:
        df: DataFrame with "title" and "text" columns.
        test_size: Size of the held-out split, forwarded to
            ``train_test_split`` (0.1 keeps the original 90/10 split).
        seed: Seed for the shuffle that precedes the split. A fixed default
            makes the partition reproducible across kernel restarts, so a
            checkpoint is not later evaluated on rows it was trained on.
            Pass ``None` to get a fresh random split on every call.

    Returns:
        The mapping of split name ("train"/"test") to dataset returned by
        ``train_test_split`` after formatting.
    """
    splits = Dataset.from_pandas(df).train_test_split(
        test_size=test_size,
        seed=seed,
    )
    # Drop the source columns in the same pass that creates "content".
    return splits.map(format_item, remove_columns=["title", "text"])

def format_item(item) -> dict:
    """Build the training text for one review row.

    Args:
        item: Mapping with "title" (product name) and "text" (review body).
            Either value may be ``None``; it is then treated as an empty
            string instead of raising ``AttributeError`` on ``.strip()``.

    Returns:
        A dict with a single "content" key holding
        ``"Product: <title>\\nReview: <text>"``, with surrounding whitespace
        removed from both fields.
    """
    # `or ""` turns a missing (None) field into an empty string.
    product = (item["title"] or "").strip()
    review = (item["text"] or "").strip()
    return { "content": f"Product: {product}\nReview: {review}" }

# Build the formatted splits from the merged title/text frame; the training
# cell reads data_dict["train"] and data_dict["test"].
data_dict = get_data(processed_dataset)

Map:   0%|          | 0/4162153 [00:00<?, ? examples/s]

Map:   0%|          | 0/462462 [00:00<?, ? examples/s]

# Train model

In [3]:
def get_most_recent_model(checkpoint_root="finetuned/"):
    """Return the path of the most recently modified checkpoint directory.

    Only sub-directories whose name starts with "checkpoint-" are considered,
    so other entries in the output directory (log folders, stray files) can
    never be mistaken for a model, even when they were modified more recently.

    Args:
        checkpoint_root: Directory that holds the ``checkpoint-*`` folders.
            Defaults to the training output directory used in this notebook.

    Returns:
        ``checkpoint_root`` joined with the name of the newest checkpoint
        directory, e.g. ``"finetuned/checkpoint-329000"``.

    Raises:
        FileNotFoundError: If ``checkpoint_root`` does not exist.
        ValueError: If it contains no ``checkpoint-*`` directory.
    """
    # Collect (mtime, name) while the scandir handle is open, then release it.
    with os.scandir(checkpoint_root) as entries:
        candidates = [
            (entry.stat().st_mtime, entry.name)
            for entry in entries
            if entry.is_dir() and entry.name.startswith("checkpoint-")
        ]
    if not candidates:
        raise ValueError(
            f"no 'checkpoint-*' directories found in {checkpoint_root!r}"
        )
    _, latest_name = max(candidates, key=lambda pair: pair[0])
    return os.path.join(checkpoint_root, latest_name)
# Display which checkpoint path would currently be picked up.
get_most_recent_model()

'finetuned/checkpoint-329000'

In [None]:
# Supervised fine-tuning hyperparameters. Checkpoints are written to
# "finetuned" every 1000 steps and the loss is logged every 100 steps.
# NOTE(review): no evaluation schedule or checkpoint limit is set here, so
# whether eval_dataset is ever used, and how many checkpoints accumulate,
# depends on the library defaults. Confirm both are intended.
sft_config = SFTConfig(
    output_dir="finetuned",
    dataset_text_field="content",
    max_seq_length=2048,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    learning_rate=3e-05,
    lr_scheduler_type="cosine",
    bf16=True,
    logging_steps=100,
    save_steps=1000,
)

print("loading model")
# Base Llama 3.2 1B weights in bfloat16, moved to the GPU.
# (attn_implementation="flash_attention_2" is intentionally left disabled.)
model = AutoModelForCausalLM.from_pretrained(
    "/scratch/bchk/aguha/models/llama3p2_1b_base",
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

# The trainer reads the "content" column produced by the data-preparation cell.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=data_dict["train"],
    eval_dataset=data_dict["test"],
)

print("starting training")
trainer.train()

loading model


Map:   0%|          | 0/4162153 [00:00<?, ? examples/s]

Map:   0%|          | 0/462462 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


starting training


Step,Training Loss
100,3.1616
200,3.0948
300,3.0129
400,3.0499
500,2.9729
600,2.9632
700,3.0362
800,2.9382
900,2.9153
1000,2.9628


In [None]:
# Write the in-memory model to its own directory, separate from the
# step-numbered checkpoints under "finetuned/".
# NOTE(review): only the model is saved here; no tokenizer is written alongside it.
model.save_pretrained("llama_3p2_1b_tuned/")

# Test the fine-tuned model

In [2]:
from transformers import AutoTokenizer

In [4]:
# Reload the newest checkpoint in bfloat16 and put it in inference mode
# (`eval()` returns the module itself, so the call can be chained).
model = AutoModelForCausalLM.from_pretrained(
    get_most_recent_model(),
    torch_dtype=torch.bfloat16,
).eval()

# The tokenizer comes from the base model directory. Padding is applied on the
# left, and the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    "/scratch/bchk/aguha/models/llama3p2_1b_base",
    padding_side="left",
)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Tokenize a single question-style prompt as a batch of one.
prompt = "Question: Tell me one bad thing about Final Fantasy VII: Remake - PlayStation 4\nAnswer:"
example_inputs = tokenizer([prompt], return_tensors="pt")

# Nucleus sampling; max_length is passed to generate() as the length cap.
example_outputs = model.generate(
    **example_inputs,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    max_length=200,
    use_cache=True,
    pad_token_id=tokenizer.eos_token_id,
)

# Keep only the first two lines of the decoded text: the question line and
# the first line of the answer.
decoded = tokenizer.decode(example_outputs[0])
output_text = "\n".join(decoded.split("\n")[:2])
print(output_text)

<|begin_of_text|>Question: Tell me one bad thing about Final Fantasy VII: Remake - PlayStation 4
Answer: The game is still bad. It has no content to speak of. The story is awful. It's a game made for kids. They should have taken the time to actually write a good story instead of just throwing random stuff into the game. I'm not even going to even mention the fact that the game is just so repetitive. I could go on forever about how bad this game is. But I'm not going to waste my time on that. Just buy it if you want to be a fan of the game. If not, don't waste your time on this garbage. I'm a big fan of the original and I have been waiting for this game since it was announced. I'm just not impressed. I'm not impressed with the game itself. I'm not impressed with the game's creators. I'm not impressed with the game's production. I'm not impressed with
