In [1]:
%pip install -q peft transformers datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from tqdm import tqdm
import os

In [3]:
from datasets import load_dataset
# Download the "all annotators agree" subset of Financial PhraseBank and carve
# out a 10% hold-out split. The split is seeded so that the same sentences end
# up in train/test on every run, keeping the reported metrics reproducible.
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)


Downloading builder script:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [8]:
# Inspect the training split: "label" is stored as an integer class index,
# so it still needs to be mapped to its human-readable class name.
dataset["train"]

Dataset({
    features: ['sentence', 'label'],
    num_rows: 2037
})

In [9]:
# The label feature carries the class names; index i in "label" maps to classes[i].
classes = dataset["train"].features["label"].names
print(classes)
# Add a "text_label" column holding the class name for every row of every split.
dataset = dataset.map(
    lambda batch: {"text_label": [classes[idx] for idx in batch["label"]]},
    num_proc=1,
    batched=True,
)

['negative', 'neutral', 'positive']


Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

In [10]:
# Show one training example to confirm the new "text_label" column is present.
dataset["train"][0]

{'sentence': 'The total size of the complex is around 25,000 m2 and the project will be constructed in stages .',
 'label': 1,
 'text_label': 'neutral'}

In [11]:
from transformers import AutoTokenizer
# Load the tokenizer for the FLAN-T5 checkpoint named below.
tokenizer_name_or_path = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

# Dataset columns used as model input and as training target, respectively.
text_column, label_column = "sentence", "text_label"

# Input sentences are padded / truncated to this many tokens.
max_length = 128
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs plus training labels.

    Reads the input text from ``examples[text_column]`` and the target text
    from ``examples[label_column]``. Inputs are padded/truncated to
    ``max_length`` tokens and targets to 2 tokens. Pad positions in the target
    ids are replaced with -100 before being stored under ``"labels"``.

    Returns the tokenizer output for the inputs with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples[text_column],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples[label_column],
        max_length=2,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    # Swap pad ids for -100 so padded target positions are marked separately
    # from real tokens.
    target_ids[target_ids == tokenizer.pad_token_id] = -100
    encoded["labels"] = target_ids
    return encoded


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [12]:
# Tokenize every split in batches. The original columns are dropped so that
# only the fields returned by preprocess_function remain, and the on-disk
# cache is bypassed so the tokenization is always recomputed.
processed_datasets = dataset.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    num_proc=1,
    batched=True,
)

Running tokenizer on dataset:   0%|          | 0/2037 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/227 [00:00<?, ? examples/s]

In [13]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
# Build the data loaders: the training loader shuffles each epoch, the
# evaluation loader keeps the dataset order.
batch_size = 8
train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["test"]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)


In [14]:
from peft import PromptEmbedding, PromptTuningConfig
# Prompt-tuning setup for a seq2seq model: 20 virtual tokens whose initial
# values are derived from the instruction text below (tokenized with the same
# tokenizer as the model). inference_mode=False leaves the config in training mode.
peft_config = PromptTuningConfig(
    task_type="SEQ_2_SEQ_LM",
    peft_type="PROMPT_TUNING",
    inference_mode=False,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral",
    num_virtual_tokens=20,
    tokenizer_name_or_path=tokenizer_name_or_path,
)

In [15]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, TaskType
# Load the pretrained FLAN-T5 checkpoint and wrap it with the prompt-tuning
# configuration, then report how many parameters are actually trainable.
model_name_or_path = "google/flan-t5-large"
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 40,960 || all params: 783,191,040 || trainable%: 0.0052


In [16]:
from transformers import get_linear_schedule_with_warmup
# Training hyper-parameters.
num_epochs = 5
lr = 1e-2

# AdamW over the model's parameters, with a linear learning-rate schedule
# (no warmup) spanning every optimizer step of the whole run.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=(len(train_dataloader) * num_epochs),
    num_warmup_steps=0,
)


In [17]:
# Train on the GPU when one is available and fall back to the CPU otherwise.
# Previously the device placement was commented out, so the model always ran
# on the CPU regardless of the hardware.
# NOTE: the optimizer is created before this move; Module.to() modifies the
# module in place, so the optimizer keeps tracking the same parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass: one optimizer + scheduler step per batch ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Batches are collated on the CPU; move them next to the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Detach before accumulating so the autograd graph is not kept alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- Evaluation pass: no gradients, collect loss and decoded predictions ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Take the highest-scoring token at each position and decode it to text.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Epoch metrics: mean loss per batch and its exponential (perplexity).
    # They are moved to the CPU so the printed values look the same on any device.
    eval_epoch_loss = (eval_loss / len(eval_dataloader)).cpu()
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = (total_loss / len(train_dataloader)).cpu()
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")


100%|██████████| 255/255 [46:44<00:00, 11.00s/it]  
100%|██████████| 29/29 [01:37<00:00,  3.36s/it]


epoch=0: train_ppl=tensor(1.9475) train_epoch_loss=tensor(0.6666) eval_ppl=tensor(1.1502) eval_epoch_loss=tensor(0.1399)


100%|██████████| 255/255 [2:02:41<00:00, 28.87s/it]   
100%|██████████| 29/29 [35:25<00:00, 73.30s/it]  


epoch=1: train_ppl=tensor(1.2987) train_epoch_loss=tensor(0.2614) eval_ppl=tensor(1.1209) eval_epoch_loss=tensor(0.1141)


100%|██████████| 255/255 [25:43:58<00:00, 363.29s/it]    
100%|██████████| 29/29 [3:04:43<00:00, 382.19s/it]   


epoch=2: train_ppl=tensor(1.2599) train_epoch_loss=tensor(0.2310) eval_ppl=tensor(1.1114) eval_epoch_loss=tensor(0.1056)


 80%|███████▉  | 203/255 [93:14:58<35:03:29, 2427.11s/it] 