In [1]:
from datasets import load_dataset

# Hub identifier of the SST-2 dataset; the recorded output below shows it
# yields train, validation and test splits.
SST2_DATASET_ID = "stanfordnlp/sst2"
dataset = load_dataset(path=SST2_DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 1677985.70 examples/s]
Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 688782.13 examples/s]
Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 1287997.91 examples/s]


In [21]:
# Peek at the first example of the test split. Note in the recorded output
# below that its 'label' is -1 rather than 0 or 1.
dataset["test"][0]

{'idx': 0, 'sentence': 'uneasy mishmash of styles and genres .', 'label': -1}

In [5]:
## Dataset Sentence Analysis

def _sentence_length_stats(sentences):
    """Return ``(max, min, mean)`` of the character lengths of ``sentences``.

    Lengths are measured with ``len`` on each string, i.e. in characters,
    not in words or tokens.
    """
    lengths = [len(sent) for sent in sentences]
    return max(lengths), min(lengths), sum(lengths) / len(lengths)


# Each `dataset[split]["sentence"]` access is evaluated only once per split
# here, instead of four times as in the copy-pasted version.
max_len, min_len, avg_len = _sentence_length_stats(dataset["train"]["sentence"])
max_len_test, min_len_test, avg_len_test = _sentence_length_stats(dataset["test"]["sentence"])

print(f"Train set: {len(dataset['train'])} samples")
print(f"Test set: {len(dataset['test'])} samples")
print(f"Max sentence length in train set: {max_len}")
print(f"Min sentence length in train set: {min_len}")
print(f"Average sentence length in train set: {avg_len}")
print(f"Max sentence length in test set: {max_len_test}")
print(f"Min sentence length in test set: {min_len_test}")
print(f"Average sentence length in test set: {avg_len_test}")

Train set: 67349 samples
Test set: 1821 samples
Max sentence length in train set: 268
Min sentence length in train set: 2
Average sentence length in train set: 53.5055902834489
Max sentence length in test set: 256
Min sentence length in test set: 6
Average sentence length in test set: 102.96101043382757


In [19]:
## Dataset Label Analysis

# Collect the distinct label values that occur in the validation split.
labels = {label for label in dataset["validation"]["label"]}
print(f"Labels: {labels}")

Labels: {0, 1}


In [10]:
## Pass HF Token
# SECURITY: never commit an access token to a notebook. The token that was
# previously hard-coded here must be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable instead. When the
# variable is unset or empty this is None, which is expected to make
# `from_pretrained(token=None)` fall back to locally saved Hub credentials
# (e.g. from `huggingface-cli login`) -- confirm against the Hub docs.
import os

hf_token = os.environ.get("HF_TOKEN") or None

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer matching the base model that is loaded later in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/gemma-2b",
    token=hf_token,
)

# Prompt layout with two `str.format` placeholders: {sentence} and {output}.
# Built from explicit pieces so the trailing space after "Negative:" and the
# blank line before "Output:" are visible instead of hidden in a raw string.
init_prompt_template = (
    "Classify the sentiment of the following sentence as Positive or Negative: \n"
    "{sentence}\n"
    "\n"
    "Output: {output}"
)

In [20]:
## Create Dataset with Prompt

def create_prompt_dataset(dataset, prompt_template):
    """Render every sample of ``dataset`` into a prompt/label pair.

    Args:
        dataset: Iterable of mappings with a ``"sentence"`` string and an
            integer ``"label"`` (0 = Negative, 1 = Positive, -1 = unlabeled).
        prompt_template: ``str.format`` template with ``{sentence}`` and
            ``{output}`` placeholders.

    Returns:
        A dict with two parallel lists: ``"prompt"`` (the rendered prompts)
        and ``"label"`` (the label text used as ``{output}``).

    Raises:
        KeyError: If a label is not 0, 1 or -1.
    """
    label_map = {0: "Negative", 1: "Positive"}
    # The test split carries label -1 (no gold label). Such samples used to
    # raise KeyError; they are now rendered with an empty output so the same
    # function can build inference prompts.
    unlabeled = -1

    prompt_dataset = {"prompt": [], "label": []}
    for sample in dataset:
        raw_label = sample["label"]
        label_text = "" if raw_label == unlabeled else label_map[raw_label]
        prompt_dataset["prompt"].append(
            prompt_template.format(sentence=sample["sentence"], output=label_text)
        )
        prompt_dataset["label"].append(label_text)
    return prompt_dataset

# Render prompts for the two labelled splits, keyed by split name.
prompt_dataset = {
    split: create_prompt_dataset(dataset[split], init_prompt_template)
    for split in ("train", "validation")
}

In [24]:
from datasets import Dataset

# Wrap the plain prompt/label dicts in `datasets.Dataset` objects.
train, validation = (
    Dataset.from_dict(prompt_dataset[split]) for split in ("train", "validation")
)


Dataset({
    features: ['prompt', 'label'],
    num_rows: 68221
})

In [35]:
target_max_length = 4  # kept so existing references still resolve; no longer used below
prompt_max_length = 512


def preprocess_dataset(exmaple):
    """Tokenize a batch of prompts and build causal-LM labels aligned with them.

    The previous version tokenized the label text separately and padded it to
    ``target_max_length``, so ``labels`` had a different sequence length than
    ``input_ids``. A causal LM computes its loss position-by-position against
    ``input_ids``, so ``labels`` must have the same shape. Here ``labels`` is a
    copy of ``input_ids`` in which padding and everything before the answer
    text is replaced by -100 (the masking value the original code already
    used), so only the answer tokens are supervised.

    Args:
        exmaple: A batch as passed by ``Dataset.map(..., batched=True)``, with
            ``"prompt"`` and ``"label"`` lists of strings. Each non-empty
            label must be the suffix of its prompt. A single un-batched
            example (plain strings) is also accepted.

    Returns:
        The tokenizer output for the prompts, padded/truncated to
        ``prompt_max_length``, with an added ``"labels"`` entry of the same
        shape as ``"input_ids"``.

    Raises:
        ValueError: If a non-empty label is not the suffix of its prompt.
    """
    prompts = exmaple["prompt"]
    answers = exmaple["label"]
    if isinstance(prompts, str):  # un-batched call: treat as a batch of one
        prompts, answers = [prompts], [answers]

    inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=prompt_max_length,
    )

    # The text before the answer, with trailing whitespace removed so the
    # space that precedes the answer is counted as part of the answer.
    prefixes = []
    for prompt, answer in zip(prompts, answers):
        if not answer:
            # Unlabeled sample: the whole prompt is prefix, nothing is supervised.
            prefixes.append(prompt)
            continue
        if not prompt.endswith(answer):
            raise ValueError(f"Prompt does not end with its label {answer!r}")
        prefixes.append(prompt[: len(prompt) - len(answer)].rstrip())

    # NOTE(review): assumes tokenizing the prefix alone yields the same leading
    # tokens as tokenizing the full prompt -- confirm for this tokenizer.
    prefix_lengths = [len(ids) for ids in tokenizer(prefixes)["input_ids"]]

    labels = []
    for token_ids, mask, prefix_len in zip(
        inputs["input_ids"], inputs["attention_mask"], prefix_lengths
    ):
        row = []
        real_seen = 0  # non-padding tokens seen so far (works for either padding side)
        for token_id, is_real in zip(token_ids, mask):
            if is_real and real_seen >= prefix_len:
                row.append(token_id)
            else:
                row.append(-100)
            real_seen += is_real
        labels.append(row)

    inputs["labels"] = labels
    return inputs

# Tokenize both splits in batches; train is processed first, then validation.
train_data, validation_data = (
    split.map(preprocess_dataset, batched=True) for split in (train, validation)
)

Map: 100%|██████████| 67349/67349 [00:06<00:00, 10972.68 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 11044.01 examples/s]


In [36]:
# Switch both tokenized splits to the "torch" output format.
for tokenized_split in (train_data, validation_data):
    tokenized_split.set_format(type="torch")

In [37]:
from torch.utils.data import DataLoader

# Same batch size for both loaders; only the training loader is shuffled.
BATCH_SIZE = 8

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = DataLoader(dataset=validation_data, batch_size=BATCH_SIZE, shuffle=False)

In [38]:
# Print only the first training batch to inspect its structure, then stop.
# The recorded output shows that the raw 'prompt' strings are part of the batch.
for batch in train_loader:
    print(batch)
    break

{'prompt': ['Classify the sentiment of the following sentence as Positive or Negative: \nbe of interest primarily to its target audience \n\nOutput: Positive', "Classify the sentiment of the following sentence as Positive or Negative: \n'll be white-knuckled and unable to look away . \n\nOutput: Positive", 'Classify the sentiment of the following sentence as Positive or Negative: \nwill absolutely crack you up with her crass , then gasp for gas , verbal deportment \n\nOutput: Positive', 'Classify the sentiment of the following sentence as Positive or Negative: \nmade me unintentionally famous -- as the queasy-stomached critic who staggered from the theater and blacked out in the lobby \n\nOutput: Negative', 'Classify the sentiment of the following sentence as Positive or Negative: \nas the queasy-stomached critic who staggered from the theater and blacked out in the lobby \n\nOutput: Negative', 'Classify the sentiment of the following sentence as Positive or Negative: \npained \n\nOutp

In [None]:
import torch
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
# `torch.backends.mps.is_available()` is the long-standing, documented probe
# for Apple-silicon GPU support; fall back to CPU otherwise.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", token=hf_token)

peft_config = PromptTuningConfig(
    # The base model is loaded with AutoModelForCausalLM (decoder-only), so the
    # PEFT task type must be CAUSAL_LM, not SEQ_2_SEQ_LM.
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=100,
    # With TEXT initialisation this must be the text used to initialise the
    # virtual tokens. It was previously set to the enum member
    # PromptTuningInit.RANDOM, which is not an initialisation text.
    # NOTE(review): the task instruction from the prompt template is used here;
    # switch `prompt_tuning_init` to RANDOM (and drop this) if random init was intended.
    prompt_tuning_init_text="Classify the sentiment of the following sentence as Positive or Negative:",
    inference_mode=False,
    tokenizer_name_or_path="google/gemma-2b",
)

model = get_peft_model(model, peft_config)
# Assign the result so the cell does not end in an expression that would dump
# the whole model repr as output.
model = model.to(DEVICE)

In [None]:
## Define Optimizer and Scheduler

# `transformers.AdamW` is deprecated and absent from recent transformers
# releases; use the PyTorch implementation instead.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


num_epochs = 3
optimizer = AdamW(
    # Only parameters that require gradients are optimised.
    [param for param in model.parameters() if param.requires_grad],
    lr=5e-5,
    # torch's AdamW defaults to weight_decay=0.01; 0.0 keeps the default of the
    # transformers optimizer this replaces.
    weight_decay=0.0,
)
# Linear decay of the learning rate to zero over all optimisation steps
# (one step per batch, `num_epochs` passes over `train_loader`), no warmup.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_loader) * num_epochs),
)

In [None]:
model.train()

# Batches also carry the raw 'prompt' and 'label' string columns (see the
# printed batch earlier in the notebook). Those cannot be moved to a device or
# passed to the model, so only the tensor columns are forwarded.
model_input_keys = ("input_ids", "attention_mask", "labels")

# DataLoader has no `iterations` attribute, so optimisation steps are counted
# explicitly across epochs.
global_step = 0

for epoch in range(num_epochs):
    for batch in train_loader:
        inputs = {key: batch[key].to(DEVICE) for key in model_input_keys}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        global_step += 1

        ## Checkpoint
        # Every 100 optimisation steps: report the current batch loss and
        # overwrite the saved adapter in "sst2-checkpoint".
        if global_step % 100 == 0:
            print(f"Iteration {global_step}, Loss: {loss.item()}")
            model.save_pretrained("sst2-checkpoint")