This notebook applies prefix tuning (via PEFT) to a T5 model on the SST-2 dataset.

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, PrefixTuningConfig, TaskType, PromptTuningConfig, PromptTuningInit
import torch
from datasets import load_dataset
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup

In [4]:
# Runtime configuration shared by the cells below.
device = "cuda"

# The tokenizer is loaded from the same checkpoint name as the model.
model_name_or_path = tokenizer_name_or_path = "google/t5-v1_1-xl"

batch_size = 16


In [None]:
# Disabled alternative: prompt tuning instead of the prefix tuning configured in
# the next cell. Kept for reference only; nothing here is executed.
# NOTE(review): before re-enabling, check the keyword names against the PEFT
# PromptTuningConfig documentation. `prompt_tuning_init_text` looks like it
# expects the initialization text (a string), not a PromptTuningInit member —
# confirm.
# peft_config = PromptTuningConfig(
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     model_name_or_path=model_name_or_path,
#     tokenizer_name_or_path=tokenizer_name_or_path,
#     prompt_init=PromptTuningInit.TEXT,
#     prompt_tuning_init_text=PromptTuningInit.RANDOM,
#     num_virtual_tokens=64
# )
# model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()


In [5]:
# Build the base T5 checkpoint and wrap it with a prefix-tuning adapter.
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=256)

model = get_peft_model(
    T5ForConditionalGeneration.from_pretrained(model_name_or_path),
    peft_config,
)

# Report how many parameters are trainable, then display the module tree.
model.print_trainable_parameters()
model



trainable params: 25,165,824 || all params: 2,874,923,008 || trainable%: 0.875356450589163


PeftModelForSeq2SeqLM(
  (base_model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 2048)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 2048)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=2048, out_features=2048, bias=False)
                (k): Linear(in_features=2048, out_features=2048, bias=False)
                (v): Linear(in_features=2048, out_features=2048, bias=False)
                (o): Linear(in_features=2048, out_features=2048, bias=False)
                (relative_attention_bias): Embedding(32, 32)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
      

In [6]:
## Load the SST-2 configuration of the GLUE dataset and display its splits

dataset = load_dataset(path="glue", name="sst2")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [7]:
# Column names and maximum input length used by the tokenization cell.
text_column = "sentence"
label_column = "text_label"
max_length = 384

label_mapping = {0: "negative", 1: "positive"}

# Class names indexed by integer label id, read from the train split's features.
classes = dataset["train"].features["label"].names

# Add a "text_label" column holding the class name for every integer label.
# NOTE(review): if a split carries placeholder labels such as -1, Python's
# negative indexing makes classes[-1] silently return the last class name —
# confirm against the test split before relying on its text_label values.
dataset = dataset.map(
    lambda batch: {"text_label": [classes[label_id] for label_id in batch["label"]]},
    batched=True,
    num_proc=1,
)

## Add a prompt to each sentence
# init_prompt = """Classify the sentiment of the following sentence as positive or negative.
# Sentence: """

# dataset = dataset.map(
#     lambda x: {"input_text": [(init_prompt + text).strip() for text in x["sentence"]]},
#     batched=True,
#     num_proc=1
# )

# Show one processed training example as a sanity check.
dataset["train"][0]
# print(dataset["train"].features["label"].names)

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'text_label': 'negative'}

In [8]:
tokenizer = T5Tokenizer.from_pretrained(tokenizer_name_or_path)


def preprocess_function(examples):
    """Tokenize a batch of sentences and their text labels.

    Args:
        examples: Batch mapping that provides the `text_column` and
            `label_column` entries as lists of strings.

    Returns:
        The tokenizer output for the sentences, padded/truncated to
        `max_length`, with an extra "labels" entry holding the target token
        ids (padded/truncated to 2 tokens).
    """
    model_inputs = tokenizer(
        examples[text_column],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples[label_column],
        max_length=2,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    # Padding positions in the targets are replaced with -100
    # (presumably so the loss ignores them — confirm against the model docs).
    target_ids[target_ids == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = target_ids
    return model_inputs


# Tokenize every split, dropping the original columns so only model inputs remain.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Only the training loader shuffles; both use the same collator and batch size.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Running tokenizer on dataset:   0%|          | 0/67349 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/872 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [9]:
# Optimizer plus a linear learning-rate schedule with zero warmup steps.
lr = 1e-4
num_epochs = 2
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

# One scheduler step is taken per training batch, so the schedule length is
# the number of batches per epoch times the number of epochs.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)

In [10]:
from accelerate import Accelerator

accelerator = Accelerator()

# The LR scheduler is prepared together with the model, optimizer and
# dataloaders. Its length was computed from the dataloader before `prepare`;
# handing it to Accelerate keeps it in step with the prepared optimizer when
# the dataloader is sharded across processes. In a single-process run this
# leaves the schedule unchanged.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

# Show which device the prepared model ended up on.
model.device


device(type='cuda', index=0)

In [11]:
from rich.progress import Progress
from rich.console import Console

console = Console()

# Train for `num_epochs` epochs. After each epoch, run one pass over the
# validation loader, report the mean train and eval losses, and save the model.
with Progress() as progress:
    task = progress.add_task("[red]Training...", total=num_epochs)

    for epoch in range(num_epochs):
        epoch_task = progress.add_task(f"Epoch {epoch}", total=len(train_dataloader))
        model.train()
        losses = []
        for batch in train_dataloader:
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            )
            loss = outputs.loss
            losses.append(loss.item())
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress.update(epoch_task, advance=1)

        test_task = progress.add_task(f"Epoch {epoch}", total=len(eval_dataloader))
        model.eval()
        eval_losses = []
        # Evaluation needs no gradients; without no_grad every eval batch would
        # build an autograd graph and hold its activations in GPU memory.
        with torch.no_grad():
            for batch in eval_dataloader:
                outputs = model(**batch)
                eval_losses.append(outputs.loss.item())
                progress.update(test_task, advance=1)

        progress.update(task, advance=1)
        progress.print(f"epoch: {epoch} loss: {sum(losses) / len(losses)}")
        # The eval loss used to be computed and thrown away; report it too.
        if eval_losses:
            progress.print(f"epoch: {epoch} eval_loss: {sum(eval_losses) / len(eval_losses)}")
        # NOTE(review): the directory name mentions "t5-efficient-xxl" while
        # model_name_or_path is "google/t5-v1_1-xl" — confirm the intended name.
        model.save_pretrained("t5-efficient-xxl-prefix-tuning-sst2-prompt")

Output()

In [None]:
# Save the model once more to the same directory that the training cell
# already writes to at the end of every epoch.
model.save_pretrained("t5-efficient-xxl-prefix-tuning-sst2-prompt")

In [2]:
# Ask PyTorch to release cached GPU memory it is no longer using.
# NOTE(review): this cell's execution count is lower than its neighbours',
# so it appears to have been run out of order — confirm it is still needed here.
import torch
torch.cuda.empty_cache()

In [12]:
## Test on 1 sample

def generate_text(text):
    """Generate the model's output string for a single input text.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        text: Raw input string to tokenize and feed to `model.generate`.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model.generate(**inputs)

    return tokenizer.decode(output[0], skip_special_tokens=True)


# Named `sample_text` rather than `input` so the `input` builtin is not shadowed
# for the rest of the kernel session.
sample_text = """I Love this movie"""
output = generate_text(sample_text)
output



''

In [None]:
# Display the first validation example (sentence, label, idx, text_label).
dataset["validation"][0]

In [None]:
from tqdm import tqdm

# Collect generated predictions and gold text labels for the SST-2 validation split.
pred = []
true = []
for example in tqdm(dataset["validation"]):
    pred.append(generate_text(example["sentence"]))
    true.append(example["text_label"])
    

In [None]:
# Show the distinct strings the model produced, to see whether they match the
# expected label names before scoring.
set(pred)

# pred = [p.lower() for p in pred]

In [None]:
# Load the IMDB dataset and display its splits.
imdb_dataset = load_dataset("imdb")
imdb_dataset

In [None]:
# Display the first IMDB test example.
imdb_dataset["test"][0]

In [None]:
label_mapping = {0: "negative", 1: "positive"}

# Number of IMDB test reviews to score. The previous loop broke only after
# appending the item at index 1000, so it scored 1001 reviews.
imdb_max_samples = 1000

pred = []
true = []

# NOTE(review): presumably the IMDB test split is ordered by label, in which
# case the first rows are all one class — verify, and consider shuffling before
# taking a subset.
# `total` is passed explicitly because tqdm cannot infer a length from enumerate().
for idx, item in tqdm(
    enumerate(imdb_dataset["test"]),
    total=min(imdb_max_samples, len(imdb_dataset["test"])),
):
    if idx >= imdb_max_samples:
        break
    pred.append(generate_text(item["text"]))
    true.append(label_mapping[item["label"]])
    

In [None]:
## SST-2 VAL SET
# NOTE(review): `true` and `pred` are shared globals that several evaluation
# cells overwrite, so this scores whichever cell filled them most recently.
# Run it directly after the SST-2 validation loop — confirm the intended order.

from sklearn.metrics import accuracy_score
accuracy_score(true, pred)

In [None]:
## IMDB TEST SET
# Scores the current contents of the shared `true`/`pred` lists, so run it
# directly after the IMDB prediction loop.
from sklearn.metrics import accuracy_score
accuracy_score(true, pred)

In [None]:
# Load the "sentences_allagree" configuration of Financial PhraseBank and
# display its splits.
financial_phrasebank = load_dataset("financial_phrasebank", "sentences_allagree")
financial_phrasebank

In [None]:
financial_phrasebank["train"][0]

# Resolve label ids through this dataset's own class names instead of reusing
# the SST-2 id mapping. Financial PhraseBank has three classes
# (negative / neutral / positive), so the old `label != 2` filter combined with
# the {0: "negative", 1: "positive"} mapping scored class 1 as "positive" and
# dropped class 2 entirely.
fpb_classes = financial_phrasebank["train"].features["label"].names

# Start from empty lists so the accuracy computed afterwards covers only this
# dataset and not the leftovers of the previous evaluation.
pred = []
true = []

for item in tqdm(financial_phrasebank["train"]):
    label = fpb_classes[item["label"]]
    # The model was tuned for binary sentiment, so skip every other class.
    if label not in ("negative", "positive"):
        continue
    pred.append(generate_text(item["sentence"]))
    true.append(label)


In [None]:
# Accuracy over the current contents of the shared `true`/`pred` lists;
# relies on `accuracy_score` imported in an earlier cell.
accuracy_score(true, pred)

In [None]:
import pandas as pd

# Run the model over the SST-2 validation split and keep sentence, gold label
# and prediction side by side.
validation_output = {"sentence": [], "true": [], "pred": []}
for item in dataset["validation"]:
    sentence = item["sentence"]
    target = item["text_label"]
    output = generate_text(sentence)
    validation_output["sentence"].append(sentence)
    validation_output["true"].append(target)
    validation_output["pred"].append(output)

# The metric cells that follow read the global `true`/`pred` lists. They used
# to be reset to empty here and never filled, so those cells scored empty lists.
true = list(validation_output["true"])
pred = list(validation_output["pred"])

df = pd.DataFrame(validation_output)

df.to_csv("t5-large-prefix-tuning-sst2_validation.csv", index=False)

In [None]:
# Confusion matrix over the shared `true`/`pred` lists

from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predictions, both in the order given by `labels`.
confusion_matrix(y_true=true, y_pred=pred, labels=["positive", "negative"])

In [None]:
## Accuracy on the validation set (scores the shared `true`/`pred` lists)

from sklearn.metrics import accuracy_score

accuracy_score(y_true=true, y_pred=pred)

In [None]:
import pandas as pd

# Generate a prediction for every sentence of the test split and write the
# sentence/prediction pairs to a CSV file.
test_set_output = {"sentence": [], "predicted": []}

for example in dataset["test"]:
    sentence = example["sentence"]
    prediction = generate_text(sentence)
    test_set_output["sentence"].append(sentence)
    test_set_output["predicted"].append(prediction)

df = pd.DataFrame(test_set_output)
df.to_csv("test_set_output.csv", index=False)

In [None]:
# Print every validation sentence whose prediction differs from its gold label.
# The frame is rebuilt from `validation_output` because the global `df` is
# overwritten by the test-set cell, whose frame has no "true"/"pred" columns.
validation_df = pd.DataFrame(validation_output)
misclassified = validation_df[validation_df["true"] != validation_df["pred"]]

for _, row in misclassified.iterrows():
    print(row["sentence"])
    print(f"true: {row['true']}, predicted: {row['pred']}")
    print("-"*50)

In [None]:
## Base Model Accuracy
# This cell rebinds `device`, `model_name_or_path`, `tokenizer`, `model` and
# `generate_text`; every cell run after it uses this checkpoint instead of the
# prefix-tuned model.
from tqdm import tqdm

device = "cuda"
model_name_or_path = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
model.to(device)


def generate_text(text):
    """Generate a short, deterministic output string for one input text.

    Args:
        text: Raw input string, tokenized with the global `tokenizer`.

    Returns:
        The first generated sequence decoded with special tokens removed.
        Generation is capped with max_length=2 and sampling is disabled.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        generated = model.generate(**encoded, max_length=2, do_sample=False)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# true = []
# pred = []
# for item in tqdm(dataset["validation"]):
#     input = item["sentence"]
#     target = item["text_label"]
#     output = generate_text(input)
#     true.append(target)
#     pred.append(output)

In [None]:
# Display the module tree of the currently bound `model`.
model

In [None]:
from tqdm import tqdm

# Instruction wrapped around each validation sentence before generation.
prompt_template = """Classify the sentiment of the given sentence as positive or negative 
sentence: {sentence} 
Answer: """

# Collect predictions and gold text labels for the SST-2 validation split.
pred = []
true = []

for example in tqdm(dataset["validation"]):
    pred.append(generate_text(prompt_template.format(sentence=example["sentence"])))
    true.append(example["text_label"])

In [None]:
# Show the distinct output strings produced by the prompted run.
set(pred)

In [None]:
# Build and display the full prompt for the first validation sentence.
text = dataset["validation"][0]["sentence"]
prompt = prompt_template.format(sentence=text)
prompt

In [None]:
# Call model.generate directly on the previewed prompt, without the max_length
# and do_sample arguments that generate_text passes.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# session — consider renaming.
input = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input)
tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
## Map the generated strings "Fal" and "True" to "negative" and "positive"
# NOTE(review): the earlier comment said "Tru" while the key is "True" —
# confirm the exact strings against the set(pred) output.

# Kept under its own name so the integer-keyed `label_mapping` used by the
# dataset evaluation cells is not overwritten.
generation_to_label = {
    "Fal": "negative",
    "True": "positive"
}

# Strings without a mapping pass through unchanged: they simply count as wrong
# when scored, and re-running this cell no longer raises KeyError on entries
# that were already converted.
pred = [generation_to_label.get(item, item) for item in pred]

In [None]:
# Accuracy over the current contents of the shared `true`/`pred` lists.
from sklearn.metrics import accuracy_score

accuracy_score(true, pred)

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()

In [None]:
# import torch
# import deepspeed
# from rich.progress import Progress
# from rich.console import Console

# console = Console()

# # Import DeepSpeed and add configuration
# ds_config = "ds_config.json"  # Path to the DeepSpeed config file

# # Wrap the model with DeepSpeed
# model, optimizer, _, lr_scheduler = deepspeed.initialize(
#     model=model,
#     optimizer=optimizer,
#     lr_scheduler=lr_scheduler,
#     config=ds_config,
#     model_parameters=model.parameters()
# )

# with Progress() as progress:
#     train_task = progress.add_task("[red]Training...", total=num_epochs)
#     for epoch in range(num_epochs):
#         epoch_task = progress.add_task(f"Epoch {epoch + 1}/{num_epochs}", total=len(train_dataloader))
#         model.train()
        
#         for step, batch in enumerate(train_dataloader):
#             batch = {k: v.to(model.device) for k, v in batch.items()}  # Move batch to the correct device
#             outputs = model(**batch)
#             loss = outputs.loss
#             model.backward(loss)   # DeepSpeed manages backward and optimizer step
#             model.step()           # Executes DeepSpeed's step
            
#             progress.update(epoch_task, advance=1)

#         # Evaluation step
#         model.eval()
#         with torch.no_grad():
#             for step, batch in enumerate(eval_dataloader):
#                 batch = {k: v.to(model.device) for k, v in batch.items()}
#                 outputs = model(**batch)
#                 eval_loss = outputs.loss
#                 progress.update(train_task, advance=1)
        
#         progress.update(train_task, completed=num_epochs)
#         model.save_checkpoint("sst2-t5-xl-prefix-tuning", tag=f"epoch_{epoch}")

#         console.log(f"Epoch: {epoch} -- Train Loss: {loss.item()} -- Eval Loss: {eval_loss.item()}")
