## Overview

Tried different PEFT approaches to update pretrained model t5-small with low-resources (free GPU from Google Colab)

In [None]:
!pip install datasets transformers peft

In [2]:
# pip install peft==0.14.0
# pip install transformers==4.50.3 # had to change dependecies to make the imports work

# import the required libraries
import torch
import datasets
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,AutoModelForSeq2SeqLM
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType, PrefixTuningConfig, PromptEncoderConfig
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed
from IPython.display import display, HTML
import time
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [3]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
# loading dataset
dataset = load_dataset("imdb", split=['train', 'test', 'unsupervised'])
dataset = datasets.DatasetDict({"train": dataset[0],"test": dataset[1], "unsupervised": dataset[2]})
dataset["train"], dataset["validation"] = dataset["train"].train_test_split(test_size=0.2, seed=42).values()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Prompt Tuning

In [5]:
#Step 1: repeating the necessary imports to make code moudlar to each question. installations of library is done and mentioned in cell before
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from peft import get_peft_model, PromptTuningConfig, TaskType

# setting up seeds to that some reproducbility is achieved if i run it again
torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # setting up gpu if available else cpu

#Step 2: loading the dataset mentioned in instructions - imdb again just again to be modular, and tokenizing using my udf mentioned later in code
# dataloader are defined down in the code where i have my collate function as well
dataset=load_dataset("imdb", split=['train', 'test', 'unsupervised'])
dataset={"train":dataset[0],"test":dataset[1],"unsupervised": dataset[2]}
dataset["train"], dataset["validation"]=dataset["train"].train_test_split(test_size=0.2,seed=42).values() # spliting data for vali
# # extra code lines i used to test the code on small set of data for quick running of loops/epochs
# dataset["train"] = dataset["train"].select(range(100))
# dataset["validation"] = dataset["validation"].shuffle().select(range(50))
# dataset["test"] = dataset["test"].shuffle().select(range(50))

# loading tokenizer and model
model_name="t5-small"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# this dict is used to keep the token id of the predictions and later used in evaluate funtion
classs_tokens={"negative":tokenizer("negative",return_tensors="pt").input_ids[0,0].item(),
    "positive":tokenizer("positive",return_tensors="pt").input_ids[0, 0].item()}

# step 3:
#soft prompt tuning config : here i have used 10 tokens and the peft library initializes the weights randomly, which i try to control by torch.seed i did in start
peft_config=PromptTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM,
    prompt_tuning_init="TEXT", # starting with some text in inputs
    prompt_tuning_init_text="Classify sentiment as positive or negative:", # this is the starting
    num_virtual_tokens=10, # should be a good start for 10 tokens
    tokenizer_name_or_path=model_name,)

# wrapping the model in the peft model using the above configs
model=get_peft_model(model,peft_config)
model.print_trainable_parameters() # printing the traininable params and percentage
# # printing the trainable layers name | commented for now
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name)

# tokenization udf (part of step 2)
def tokenize_function(examples):
    inputs=[f"Classify sentiment: {text}" for text in examples["text"]] # adding a promt text in start of inputs
    model_inputs=tokenizer(inputs,
        padding="max_length", # keeping max length to 256 for now. dont need too long as we can get desirable acc with this
        truncation=True,max_length=256,)

    targets=["positive" if label == 1 else "negative" for label in examples["label"]] # making the labels text for deconder
    labels=tokenizer(targets,padding="max_length",truncation=True,
        max_length=5,) # we dont have much tokens needed hence keeping this small

    model_inputs["labels"]=labels["input_ids"]
    return model_inputs

#tokenizing  datasets
tokenized_datasets={split: dataset[split].map(tokenize_function,batched=True,remove_columns=["text", "label"]) for split in dataset.keys()}

# collate function for dataloader (asked the TAs and they its preferred to use custom dataloader hence doing this setup)
def collate_fn(batch):
    input_ids=torch.tensor([x["input_ids"] for x in batch],dtype=torch.long)
    attention_mask=torch.tensor([x["attention_mask"] for x in batch],dtype=torch.long) # found some mismatches in different runs, hence changing dtype explicitly here
    labels=torch.tensor([x["labels"] for x in batch], dtype=torch.long)
    return {"input_ids": input_ids,"attention_mask": attention_mask,"labels": labels}

train_loader= DataLoader(tokenized_datasets["train"],batch_size=8, shuffle=True,collate_fn=collate_fn) # shuffling just the train data
val_loader=DataLoader(tokenized_datasets["validation"],batch_size=8,collate_fn=collate_fn)
test_loader=DataLoader(tokenized_datasets["test"],batch_size=8,collate_fn=collate_fn)

# step 4
# defining the optimizer and loss  function and a small learning rate for now
optimizer= torch.optim.Adam(model.parameters(),lr=1e-3) # no worries in passing model.params as all other params are frozedn- verified
loss_fn=torch.nn.CrossEntropyLoss()

# udf to get logits  for tokens from custom dict for classes
def get_class_logits(outputs):
    return outputs.logits[:,0,[classs_tokens["negative"],classs_tokens["positive"]]]

#training loop
def train(num_epochs=3): # bu default kept 3 epochs
    for epoch in range(num_epochs):
        model.train()
        total_loss,correct,total=0,0,0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
            # sending the batch data into the model by making input dict object
            inputs = {"input_ids":batch["input_ids"].to(device),"attention_mask":batch["attention_mask"].to(device),"labels":batch["labels"].to(device)}
            # getting outputs from model in forward pass
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # getting class probas using custom token dict for class
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:, 0] == classs_tokens["positive"]).long().to(device)
            # calculating loss
            loss=loss_fn(class_logits, targets)
            optimizer.zero_grad()
            loss.backward() # backprop
            optimizer.step()
            total_loss +=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+= targets.size(0)
        # calculating loss metrics for reporting
        val_acc,val_loss=evaluate(val_loader)
        train_acc= 100*correct /total
        avg_loss=total_loss/len(train_loader)
        # reporting epoch progress
        print(f"Epoch {epoch+1}:")
        print(f"Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

# step 5 udf
# Evaluation function quite same as train evaluation process
def evaluate(data_loader):
    model.eval()
    total_loss,correct,total = 0,0,0 # metrics variables for accuracy (acc) reporting

    with torch.no_grad(): # no grad as its eval
        for batch in data_loader:
            # sending the batch data into the model by making input dict object
            inputs={"input_ids": batch["input_ids"].to(device),"attention_mask": batch["attention_mask"].to(device),"labels": batch["labels"].to(device)}
            # getting outputs from model
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # finding the proba for the tokens using my custom dict
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:,0] == classs_tokens["positive"]).long().to(device)
            # calculating loss and acc metrics for reporting
            loss=loss_fn(class_logits,targets)
            total_loss+=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+=targets.size(0)
    return 100 * correct / total, total_loss / len(data_loader)

# step 4 and 5 calls
# Train and evaluate
train(num_epochs=3)
test_acc,test_loss=evaluate(test_loader)
print(f"\nTest Accuracy: {test_acc:.2f}%")# final accuracy results
print(f"Test Loss: {test_loss:.4f}")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 10,240 || all params: 60,516,864 || trainable%: 0.0169


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 2500/2500 [04:23<00:00,  9.47it/s]


Epoch 1:
Train Loss: 0.6317 | Train Acc: 64.86%
Val Loss: 0.6081 | Val Acc: 70.94%


Epoch 2: 100%|██████████| 2500/2500 [04:10<00:00,  9.98it/s]


Epoch 2:
Train Loss: 0.6235 | Train Acc: 65.91%
Val Loss: 0.6015 | Val Acc: 71.50%


Epoch 3: 100%|██████████| 2500/2500 [04:13<00:00,  9.84it/s]


Epoch 3:
Train Loss: 0.6214 | Train Acc: 66.08%
Val Loss: 0.5962 | Val Acc: 71.76%

Test Accuracy: 73.01%
Test Loss: 0.5900


In [6]:
model.save_pretrained("soft_prompt_checkpoint")
tokenizer.save_pretrained("soft_prompt_checkpoint")

('soft_prompt_checkpoint/tokenizer_config.json',
 'soft_prompt_checkpoint/special_tokens_map.json',
 'soft_prompt_checkpoint/spiece.model',
 'soft_prompt_checkpoint/added_tokens.json',
 'soft_prompt_checkpoint/tokenizer.json')

In [7]:
import shutil
shutil.make_archive('soft_prompt_checkpoint','zip','soft_prompt_checkpoint')
from google.colab import files
files.download('soft_prompt_checkpoint.zip') # downloading this from colab (where i had gpu) so that i have it in my local system for demo if needed

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
!pip install adapters

Collecting adapters
  Downloading adapters-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers~=4.47.1 (from adapters)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading adapters-1.1.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, adapters
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.3
    Uninstalling transformers-4.50.3:
      Successfully uninstalled transformers-4.50.3
Successfully installed adapters-1.1.0 transformers-4.47.1


# Adapters Based

In [29]:
#Step 1: repeating the necessary imports to make code moudlar to each question. installations of library is done and mentioned in cell before
import torch
import numpy as np
from adapters import AutoAdapterModel, AdapterConfig
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

# setting up seeds to that some reproducbility is achieved if i run it again
torch.manual_seed(42)
np.random.seed(42)
device=torch.device("cuda" if torch.cuda.is_available() else "cpu") # setting up gpu if available else cpu

#Step 2: loading the dataset mentioned in instructions - imdb again just again to be modular, and tokenizing using my udf mentioned later in code
# dataloader are defined down in the code where i have my collate function as well
dataset=load_dataset("imdb", split=['train', 'test', 'unsupervised'])
dataset={"train":dataset[0],"test":dataset[1],"unsupervised": dataset[2]}
dataset["train"], dataset["validation"]=dataset["train"].train_test_split(test_size=0.2,seed=42).values() # spliting data for vali
# # extra code lines i used to test the code on small set of data for quick running of loops/epochs
# dataset["train"] = dataset["train"].select(range(100))
# dataset["validation"] = dataset["validation"].shuffle().select(range(50))
# dataset["test"] = dataset["test"].shuffle().select(range(50))

# load model and tokenizer
model_name="t5-small"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoAdapterModel.from_pretrained(model_name).to(device)

# this dict is used to keep the token id of the predictions and later used in evaluate funtion
classs_tokens={"negative":tokenizer("negative",return_tensors="pt").input_ids[0,0].item(),
    "positive":tokenizer("positive",return_tensors="pt").input_ids[0, 0].item()}

# step 3:
#adapter config
adapter_config=AdapterConfig.load("pfeiffer",  # using this architecture as it does not add more layers after each ffn, thus saving compute
reduction_factor=16, # deciding how much should be the hidden layers size (512/reduction_factor)- tried couple of factors, decided on this finally
    non_linearity="relu") # picked relu because of it good handling of vanishing gradients)

# adding this config adapter to the model and naming the task we doing in imdb data
model.add_adapter("sentiment_task",config=adapter_config)
model.train_adapter("sentiment_task") # freezing all weights except adapters
model.set_active_adapters("sentiment_task") # we have just one adapter for now, but one of the tehory questions mentoend of multi-lingual tasks for which one may need multiple adapters, and hence that is when this comes in handy
model=model.to(device)
# printing trainable parameters for some clarity
trainable_params=0
all_params=0
for _, param in model.named_parameters():
    all_params +=param.numel()
    if param.requires_grad:
        trainable_params +=param.numel()
print(f"trainable params: {trainable_params:,} | all params: {all_params:,} | trainable_perc: {100 * trainable_params / all_params:.2f}")
# # printing the trainable layers name | commented for now
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name)

# tokenization udf (part of step 2)
def tokenize_function(examples):
    inputs=[f"Classify sentiment: {text}" for text in examples["text"]] # adding a promt text in start of inputs
    model_inputs=tokenizer(inputs,
        padding="max_length", # keeping max length to 256 for now. dont need too long as we can get desirable acc with this
        truncation=True,max_length=256,)

    targets=["positive" if label == 1 else "negative" for label in examples["label"]] # making the labels text for deconder
    labels=tokenizer(targets,padding="max_length",truncation=True,
        max_length=5,) # we dont have much tokens needed hence keeping this small

    model_inputs["labels"]=labels["input_ids"]
    return model_inputs

#tokenizing  datasets
tokenized_datasets={split: dataset[split].map(tokenize_function,batched=True,remove_columns=["text", "label"]) for split in dataset.keys()}

# collate function for dataloader (asked the TAs and they its preferred to use custom dataloader hence doing this setup)
def collate_fn(batch):
    input_ids=torch.tensor([x["input_ids"] for x in batch],dtype=torch.long)
    attention_mask=torch.tensor([x["attention_mask"] for x in batch],dtype=torch.long) # found some mismatches in different runs, hence changing dtype explicitly here
    labels=torch.tensor([x["labels"] for x in batch], dtype=torch.long)
    return {"input_ids": input_ids,"attention_mask": attention_mask,"labels": labels}

train_loader= DataLoader(tokenized_datasets["train"],batch_size=8, shuffle=True,collate_fn=collate_fn) # shuffling just the train data
val_loader=DataLoader(tokenized_datasets["validation"],batch_size=8,collate_fn=collate_fn)
test_loader=DataLoader(tokenized_datasets["test"],batch_size=8,collate_fn=collate_fn)

# step 4
# defining the optimizer and loss  function and a small learning rate for now
optimizer= torch.optim.Adam(model.parameters(),lr=1e-3) # no worries in passing model.params as all other params are frozedn- verified
loss_fn=torch.nn.CrossEntropyLoss()

# udf to get logits  for tokens from custom dict for classes
def get_class_logits(outputs):
    return outputs.logits[:,0,[classs_tokens["negative"],classs_tokens["positive"]]]

#training loop
def train(num_epochs=3): # bu default kept 3 epochs
    for epoch in range(num_epochs):
        model.train()
        total_loss,correct,total=0,0,0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
            # sending the batch data into the model by making input dict object
            inputs = {"input_ids":batch["input_ids"].to(device),"attention_mask":batch["attention_mask"].to(device),"labels":batch["labels"].to(device)}
            # getting outputs from model in forward pass
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # getting class probas using custom token dict for class
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:, 0] == classs_tokens["positive"]).long().to(device)
            # calculating loss
            loss=loss_fn(class_logits, targets)
            optimizer.zero_grad()
            loss.backward() # backprop
            optimizer.step()
            total_loss +=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+= targets.size(0)
        # calculating loss metrics for reporting
        val_acc,val_loss=evaluate(val_loader)
        train_acc= 100*correct /total
        avg_loss=total_loss/len(train_loader)
        # reporting epoch progress
        print(f"Epoch {epoch+1}:")
        print(f"Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

# step 5 udf
# Evaluation function quite same as train evaluation process
def evaluate(data_loader):
    model.eval()
    total_loss,correct,total = 0,0,0 # metrics variables for accuracy (acc) reporting

    with torch.no_grad(): # no grad as its eval
        for batch in data_loader:
            # sending the batch data into the model by making input dict object
            inputs={"input_ids": batch["input_ids"].to(device),"attention_mask": batch["attention_mask"].to(device),"labels": batch["labels"].to(device)}
            # getting outputs from model
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # finding the proba for the tokens using my custom dict
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:,0] == classs_tokens["positive"]).long().to(device)
            # calculating loss and acc metrics for reporting
            loss=loss_fn(class_logits,targets)
            total_loss+=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+=targets.size(0)
    return 100 * correct / total, total_loss / len(data_loader)

# step 4 and 5 calls
# Train and evaluate
train(num_epochs=3)
test_acc,test_loss=evaluate(test_loader)
print(f"\nTest Accuracy: {test_acc:.2f}%")# final accuracy results
print(f"Test Loss: {test_loss:.4f}")

trainable params: 399,744 | all params: 60,906,368 | trainable_perc: 0.66


Epoch 1: 100%|██████████| 2500/2500 [04:08<00:00, 10.07it/s]


Epoch 1:
Train Loss: 0.3267 | Train Acc: 86.05%
Val Loss: 0.2885 | Val Acc: 87.64%


Epoch 2: 100%|██████████| 2500/2500 [03:51<00:00, 10.82it/s]


Epoch 2:
Train Loss: 0.2685 | Train Acc: 88.96%
Val Loss: 0.2434 | Val Acc: 90.00%


Epoch 3: 100%|██████████| 2500/2500 [03:50<00:00, 10.82it/s]


Epoch 3:
Train Loss: 0.2423 | Train Acc: 90.03%
Val Loss: 0.2685 | Val Acc: 89.22%

Test Accuracy: 89.47%
Test Loss: 0.2660


In [30]:
model.save_adapter("./saved_adapter/","sentiment_task")

In [31]:
import shutil
shutil.make_archive('saved_adapter','zip','saved_adapter')
from google.colab import files
files.download('saved_adapter.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name,param.device) # needed this to debug wwhy I was getting a different device error, saw that adapters weights were in cpu, so had to again put it in device to get gpu

# LoRA

In [32]:
#Step 1: repeating the necessary imports to make code moudlar to each question. installations of library is done and mentioned in cell before
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from peft import get_peft_model,LoraConfig ,TaskType

# setting up seeds to that some reproducbility is achieved if i run it again
torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # setting up gpu if available else cpu

#Step 2: loading the dataset mentioned in instructions - imdb again just again to be modular, and tokenizing using my udf mentioned later in code
# dataloader are defined down in the code where i have my collate function as well
dataset=load_dataset("imdb", split=['train', 'test', 'unsupervised'])
dataset={"train":dataset[0],"test":dataset[1],"unsupervised": dataset[2]}
dataset["train"], dataset["validation"]=dataset["train"].train_test_split(test_size=0.2,seed=42).values() # spliting data for vali
# # extra code lines i used to test the code on small set of data for quick running of loops/epochs
# dataset["train"] = dataset["train"].select(range(100))
# dataset["validation"] = dataset["validation"].shuffle().select(range(50))
# dataset["test"] = dataset["test"].shuffle().select(range(50))

# load model and tokenizer
model_name="t5-small"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# this dict is used to keep the token id of the predictions and later used in evaluate funtion
classs_tokens={"negative":tokenizer("negative",return_tensors="pt").input_ids[0,0].item(),
    "positive":tokenizer("positive",return_tensors="pt").input_ids[0, 0].item()}

# step 3:
# configs for lora
peft_config=LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
inference_mode=False, # keeping inference off as we are training rn
 r=8,# picked 8 for rank for now after trying few
lora_alpha=32, # 32/rank amount of info used
lora_dropout=0.1, # dropout in lora for avoiding overfitting
 target_modules=["v"], # had just value V attention to be tuned for now
)
# getting the peft wrapper model
model=get_peft_model(model,peft_config)
model=model.to(device)
model.print_trainable_parameters() # pritning number of params being trained and percentage
for name, param in model.named_parameters(): # printing names to see which all layers are being trained
    if param.requires_grad:
        print(name)
# tokenization udf (part of step 2)
def tokenize_function(examples):
    inputs=[f"Classify sentiment: {text}" for text in examples["text"]] # adding a promt text in start of inputs
    model_inputs=tokenizer(inputs,
        padding="max_length", # keeping max length to 256 for now. dont need too long as we can get desirable acc with this
        truncation=True,max_length=256,)

    targets=["positive" if label == 1 else "negative" for label in examples["label"]] # making the labels text for deconder
    labels=tokenizer(targets,padding="max_length",truncation=True,
        max_length=5,) # we dont have much tokens needed hence keeping this small

    model_inputs["labels"]=labels["input_ids"]
    return model_inputs

#tokenizing  datasets
tokenized_datasets={split: dataset[split].map(tokenize_function,batched=True,remove_columns=["text", "label"]) for split in dataset.keys()}

# collate function for dataloader (asked the TAs and they its preferred to use custom dataloader hence doing this setup)
def collate_fn(batch):
    input_ids=torch.tensor([x["input_ids"] for x in batch],dtype=torch.long)
    attention_mask=torch.tensor([x["attention_mask"] for x in batch],dtype=torch.long) # found some mismatches in different runs, hence changing dtype explicitly here
    labels=torch.tensor([x["labels"] for x in batch], dtype=torch.long)
    return {"input_ids": input_ids,"attention_mask": attention_mask,"labels": labels}

train_loader= DataLoader(tokenized_datasets["train"],batch_size=8, shuffle=True,collate_fn=collate_fn) # shuffling just the train data
val_loader=DataLoader(tokenized_datasets["validation"],batch_size=8,collate_fn=collate_fn)
test_loader=DataLoader(tokenized_datasets["test"],batch_size=8,collate_fn=collate_fn)

# step 4
# defining the optimizer and loss  function and a small learning rate for now
optimizer= torch.optim.Adam(model.parameters(),lr=1e-3) # no worries in passing model.params as all other params are frozedn- verified
loss_fn=torch.nn.CrossEntropyLoss()

# udf to get logits  for tokens from custom dict for classes
def get_class_logits(outputs):
    return outputs.logits[:,0,[classs_tokens["negative"],classs_tokens["positive"]]]

#training loop
def train(num_epochs=3): # bu default kept 3 epochs
    for epoch in range(num_epochs):
        model.train()
        total_loss,correct,total=0,0,0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
            # sending the batch data into the model by making input dict object
            inputs = {"input_ids":batch["input_ids"].to(device),"attention_mask":batch["attention_mask"].to(device),"labels":batch["labels"].to(device)}
            # getting outputs from model in forward pass
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # getting class probas using custom token dict for class
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:, 0] == classs_tokens["positive"]).long().to(device)
            # calculating loss
            loss=loss_fn(class_logits, targets)
            optimizer.zero_grad()
            loss.backward() # backprop
            optimizer.step()
            total_loss +=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+= targets.size(0)
        # calculating loss metrics for reporting
        val_acc,val_loss=evaluate(val_loader)
        train_acc= 100*correct /total
        avg_loss=total_loss/len(train_loader)
        # reporting epoch progress
        print(f"Epoch {epoch+1}:")
        print(f"Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

# step 5 udf
# Evaluation function quite same as train evaluation process
def evaluate(data_loader):
    model.eval()
    total_loss,correct,total = 0,0,0 # metrics variables for accuracy (acc) reporting

    with torch.no_grad(): # no grad as its eval
        for batch in data_loader:
            # sending the batch data into the model by making input dict object
            inputs={"input_ids": batch["input_ids"].to(device),"attention_mask": batch["attention_mask"].to(device),"labels": batch["labels"].to(device)}
            # getting outputs from model
            outputs=model(input_ids=inputs["input_ids"],attention_mask=inputs["attention_mask"],labels=inputs["labels"])
            # finding the proba for the tokens using my custom dict
            class_logits=get_class_logits(outputs)
            targets=(batch["labels"][:,0] == classs_tokens["positive"]).long().to(device)
            # calculating loss and acc metrics for reporting
            loss=loss_fn(class_logits,targets)
            total_loss+=loss.item()
            correct+=(class_logits.argmax(1) == targets).sum().item()
            total+=targets.size(0)
    return 100 * correct / total, total_loss / len(data_loader)

# step 4 and 5 calls
# Train and evaluate
train(num_epochs=3)
test_acc,test_loss=evaluate(test_loader)
print(f"\nTest Accuracy: {test_acc:.2f}%")# final accuracy results
print(f"Test Loss: {test_loss:.4f}")

trainable params: 147,456 || all params: 60,654,080 || trainable%: 0.2431
base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight
base_model.model.encoder.block.1.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.1.layer.0.SelfAttention.v.lora_B.default.weight
base_model.model.encoder.block.2.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.2.layer.0.SelfAttention.v.lora_B.default.weight
base_model.model.encoder.block.3.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.3.layer.0.SelfAttention.v.lora_B.default.weight
base_model.model.encoder.block.4.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.4.layer.0.SelfAttention.v.lora_B.default.weight
base_model.model.encoder.block.5.layer.0.SelfAttention.v.lora_A.default.weight
base_model.model.encoder.block.5.layer.0.SelfAttention.v.

Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 2500/2500 [04:21<00:00,  9.58it/s]


Epoch 1:
Train Loss: 0.3381 | Train Acc: 85.36%
Val Loss: 0.2783 | Val Acc: 88.44%


Epoch 2: 100%|██████████| 2500/2500 [04:03<00:00, 10.27it/s]


Epoch 2:
Train Loss: 0.2838 | Train Acc: 88.39%
Val Loss: 0.2645 | Val Acc: 88.94%


Epoch 3: 100%|██████████| 2500/2500 [04:05<00:00, 10.16it/s]


Epoch 3:
Train Loss: 0.2675 | Train Acc: 88.94%
Val Loss: 0.2678 | Val Acc: 89.38%

Test Accuracy: 89.96%
Test Loss: 0.2565


In [33]:
model.save_pretrained("./saved_lora_model/")

In [34]:
import shutil
shutil.make_archive('saved_lora_model','zip','saved_lora_model')
from google.colab import files
files.download('saved_lora_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>