<a href="https://colab.research.google.com/github/aniketSanyal/DifferentialPrivacy/blob/main/Full_Fine_Tuning_Prefix_Tuning_Last_Layer_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook dependencies. `opacus` is installed here but is not imported by any cell visible in this file.
!pip install opacus transformers peft datasets

In [None]:
from datasets import load_dataset
# Load the GLUE SST-2 dataset; later cells index it by split name ("train", "validation").
raw_datasets = load_dataset("glue", "sst2")

In [None]:
from transformers import AutoTokenizer

# Hub identifier shared by the tokenizer and by every model created in this notebook.
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
class Config:
    """Hyper-parameters shared by every experiment in this notebook."""

    # Number of passes over the training loader made by the training loop.
    num_train_epochs = 7
    # Step size handed to AdamW.
    learning_rate = 1e-3
    # The next two settings are not referenced by any cell visible in this file.
    n_prompt_tokens = 10
    random_range = 0.5
    # Mini-batch size used for both the train and the evaluation loaders.
    batch_size = 256
    # Not referenced by any cell visible in this file.
    max_grad_norm = 0.1


# Single shared settings object read by the cells below.
args = Config()

In [None]:
from torch.utils.data import DataLoader
# Tokenise every split; each sentence is padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the example index and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx'])
    .rename_column("label", "labels")
)


In [None]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate(batch):
    """Merge a list of per-example dicts into one dict of batched tensors.

    ``input_ids`` are right-padded with the module-level tokenizer's pad id,
    ``attention_mask`` is right-padded with 0, and ``labels`` are stacked.
    """
    token_rows, mask_rows, label_rows = [], [], []
    for example in batch:
        token_rows.append(example['input_ids'])
        mask_rows.append(example['attention_mask'])
        label_rows.append(example['labels'])

    return {
        'input_ids': pad_sequence(token_rows, batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad_sequence(mask_rows, batch_first=True, padding_value=0),
        'labels': torch.stack(label_rows),
    }

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
from transformers import AutoModelForSequenceClassification


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
def get_new_soft_prompt_model(num_labels):
    """Load a fresh sequence classifier from ``checkpoint`` with ``num_labels`` outputs.

    Despite the name, nothing in this function adds a soft prompt; it only
    loads the pretrained classification model.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
    )


In [None]:
# Fresh 2-label classifier for the SST-2 full fine-tuning run.
model =get_new_soft_prompt_model(2)

In [None]:
import numpy as np

def accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Both arguments are expected to be array-likes whose element-wise
    comparison yields an object with a ``.mean()`` method (e.g. NumPy arrays).
    """
    matches = preds == labels
    return matches.mean()

# define evaluation cycle
def evaluate(model):
    """Score ``model`` on the module-level ``test_dataloader``.

    The model is switched to eval mode for scoring and is always left in
    training mode afterwards, because the training loop calls this function
    mid-epoch and carries on training.

    Args:
        model: Callable as ``model(**batch)`` returning a sequence whose
            first two items are the loss and the logits.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over examples rather than over
        batches, so a smaller final batch is not over-weighted. Both values
        are NaN when the loader yields no examples.
    """
    model.eval()

    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    # Scoring needs no gradients; disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss, logits = outputs[:2]

            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = batch['labels'].detach().cpu().numpy()

            batch_size = len(labels)
            # Assumes the model reports a per-batch mean loss, so multiplying by
            # the batch size recovers the batch's summed loss.
            total_loss += loss.item() * batch_size
            total_correct += int((preds == labels).sum())
            total_examples += batch_size

    model.train()

    if total_examples == 0:
        return float("nan"), float("nan")
    return total_loss / total_examples, total_correct / total_examples

In [None]:
# Count every scalar parameter in the model and show the total.
total_params = sum(tensor.numel() for tensor in model.parameters())
print(total_params)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters, using the shared learning rate from `args`.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# The training loop evaluates and prints metrics every this many steps (step 0 excluded).
LOGGING_INTERVAL = 100

In [None]:
# Device that batches are moved to by the training and evaluation loops.
# Fall back to the CPU when no GPU is present instead of failing on the first `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def train_full_finetune(model, optimiser, train_dataloader):
    """Train ``model`` for ``args.num_train_epochs`` epochs with periodic evaluation.

    Despite the name, the loop is generic: it steps whatever parameters
    ``optimiser`` holds, so the notebook also uses it for last-layer and
    prefix tuning.

    Args:
        model: Callable as ``model(**batch)`` returning a sequence whose
            first item is the loss.
        optimiser: Optimiser already constructed over ``model``'s parameters.
        train_dataloader: Iterable of dict batches whose values are tensors.

    Reads the module-level ``args``, ``device`` and ``LOGGING_INTERVAL``, and
    calls ``evaluate`` every ``LOGGING_INTERVAL`` steps (step 0 excluded).
    """
    # `from_pretrained` hands back models in eval mode (dropout disabled) and on
    # the CPU, so switch to training mode and move the model to where the
    # batches are sent before taking the first step.
    model.to(device)
    model.train()

    for epoch in range(1, args.num_train_epochs + 1):
        losses = []  # loss history for the running average; reset each epoch

        for step, batch in enumerate(train_dataloader):
            optimiser.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)  # output = loss, logits, hidden_states, attentions
            loss = outputs[0]

            loss.backward()
            losses.append(loss.item())
            optimiser.step()

            if step > 0 and step % LOGGING_INTERVAL == 0:
                train_loss = np.mean(losses)

                eval_loss, eval_accuracy = evaluate(model)

                print(
                  f"Epoch: {epoch} | "
                  f"Step: {step} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                )

In [None]:
# Full fine-tuning run on SST-2.
train_full_finetune(model, optimiser, train_dataloader)

FOR QNLI

In [None]:
# Switch the working dataset to GLUE QNLI.
raw_datasets = load_dataset("glue", "qnli")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question, sentence) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question"],
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'sentence', 'question'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 2-label classifier for QNLI. Move it to `device` before training: the
# training loop sends every batch there, and a model left on the CPU would
# fail with a device mismatch when `device` is a GPU.
model = get_new_soft_prompt_model(2)
model.to(device)
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Full fine-tuning run on QNLI.
train_full_finetune(model, optimiser, train_dataloader)

QQP

In [None]:
# Switch the working dataset to GLUE QQP.
raw_datasets = load_dataset("glue", "qqp")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
# Display the first five training rows (notebook output only).
raw_datasets['train'][:5]

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question1, question2) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question1"],
        batch["question2"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'question1', 'question2'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 2-label classifier for QQP. Move it to `device` before training: the
# training loop sends every batch there, and a model left on the CPU would
# fail with a device mismatch when `device` is a GPU.
model = get_new_soft_prompt_model(2)
model.to(device)
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Full fine-tuning run on QQP.
train_full_finetune(model, optimiser, train_dataloader)

For MNLI

In [None]:
# Switch the working dataset to GLUE MNLI.
raw_datasets = load_dataset("glue", "mnli")

In [None]:
# Keep only the leading rows: 50000 for train, 5000 for the matched validation split.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation_matched'] = raw_datasets['validation_matched'].select(range(5000))


In [None]:
# The label count is a property of the classification model, not of the
# tokenizer, so it is not passed here; the 3-way head is requested when the
# model itself is created.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (premise, hypothesis) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["premise"],
        batch["hypothesis"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'premise', 'hypothesis'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders; evaluation uses the matched validation split.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation_matched"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 3-label classifier for MNLI, with a new AdamW over all of its parameters.
model = get_new_soft_prompt_model(3)
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Full fine-tuning run on MNLI.
train_full_finetune(model, optimiser, train_dataloader)

Last Layer Fine Tuning

In [None]:
# Reload GLUE SST-2 for the last-layer tuning experiments.
raw_datasets = load_dataset("glue", "sst2")

In [None]:
# Keep only the first 50000 training rows.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))


In [None]:
# Fresh 2-label classifier for the SST-2 last-layer run.
model =get_new_soft_prompt_model(2)

In [None]:
from torch.utils.data import DataLoader
# Tokenise every split; each sentence is padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the example index and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Last-layer tuning: freeze the whole network, then unfreeze only the
# classification head. The head is selected by attribute rather than by
# position in `model.parameters()`, so the selection does not silently pick the
# wrong tensors if the checkpoint's layer count changes.
# Assumes the model exposes its head as `.classifier`, as the BERT
# sequence-classification model loaded in this notebook does.
for p in model.parameters():
    p.requires_grad = False

trainable_params = 0
for p in model.classifier.parameters():
    p.requires_grad = True
    trainable_params += p.numel()

In [None]:
# Display the number of parameters left trainable (notebook output only).
trainable_params

In [None]:
# Display every parameter tensor of the model (notebook output only).
list(model.parameters())

In [None]:
# AdamW is handed every parameter, including those whose requires_grad was set to False.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Last-layer tuning run on SST-2 (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

In [None]:
# Drop the reference to the finished model before the next experiment builds a new one.
del model

QQP

In [None]:
# Switch the working dataset to GLUE QQP.
raw_datasets = load_dataset("glue", "qqp")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question1, question2) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question1"],
        batch["question2"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'question1', 'question2'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 2-label classifier for QQP, placed on the shared `device` (the same one
# the loops move batches to) rather than a second hard-coded device string.
model = get_new_soft_prompt_model(2)
model.to(device)

In [None]:
# Last-layer tuning: freeze the whole network, then unfreeze only the
# classification head. The head is selected by attribute rather than by
# position in `model.parameters()`, so the selection does not silently pick the
# wrong tensors if the checkpoint's layer count changes.
# Assumes the model exposes its head as `.classifier`, as the BERT
# sequence-classification model loaded in this notebook does.
for p in model.parameters():
    p.requires_grad = False

trainable_params = 0
for p in model.classifier.parameters():
    p.requires_grad = True
    trainable_params += p.numel()

In [None]:
# AdamW is handed every parameter, including those whose requires_grad was set to False.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Last-layer tuning run on QQP (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

QNLI

In [None]:
# Switch the working dataset to GLUE QNLI.
raw_datasets = load_dataset("glue", "qnli")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question, sentence) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question"],
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'sentence', 'question'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 2-label classifier for QNLI, placed on the shared `device` (the same one
# the loops move batches to) rather than a second hard-coded device string.
model = get_new_soft_prompt_model(2)
model.to(device)

In [None]:
# Last-layer tuning: freeze the whole network, then unfreeze only the
# classification head. The head is selected by attribute rather than by
# position in `model.parameters()`, so the selection does not silently pick the
# wrong tensors if the checkpoint's layer count changes.
# Assumes the model exposes its head as `.classifier`, as the BERT
# sequence-classification model loaded in this notebook does.
for p in model.parameters():
    p.requires_grad = False

trainable_params = 0
for p in model.classifier.parameters():
    p.requires_grad = True
    trainable_params += p.numel()

In [None]:
# AdamW is handed every parameter, including those whose requires_grad was set to False.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Last-layer tuning run on QNLI (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

MNLI

In [None]:
# Switch the working dataset to GLUE MNLI.
raw_datasets = load_dataset("glue", "mnli")

In [None]:
# Keep only the leading rows: 50000 for train, 5000 for the matched validation split.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation_matched'] = raw_datasets['validation_matched'].select(range(5000))


In [None]:
# The label count is a property of the classification model, not of the
# tokenizer, so it is not passed here; the 3-way head is requested when the
# model itself is created.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (premise, hypothesis) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["premise"],
        batch["hypothesis"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'premise', 'hypothesis'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders; evaluation uses the matched validation split.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation_matched"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Fresh 3-label classifier for MNLI, set up for last-layer tuning.
model = get_new_soft_prompt_model(3)

# Freeze the whole network, then unfreeze only the classification head. The
# head is selected by attribute rather than by position in
# `model.parameters()`, so the selection does not silently pick the wrong
# tensors if the checkpoint's layer count changes.
# Assumes the model exposes its head as `.classifier`, as the BERT
# sequence-classification model loaded in this notebook does.
for p in model.parameters():
    p.requires_grad = False

trainable_params = 0
for p in model.classifier.parameters():
    p.requires_grad = True
    trainable_params += p.numel()

# Use the shared `device` setting rather than a second hard-coded device string.
model.to(device)
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Last-layer tuning run on MNLI (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

In [None]:
# Inline copy of the training loop that runs three further epochs on the
# current `model` / `optimiser` / `train_dataloader`, with the same periodic
# evaluation and logging as `train_full_finetune`.
for epoch in range(1, 4):
    # Loss history for the running average; reset at the start of each epoch.
    losses = []

    for  step, batch in enumerate(train_dataloader):
            optimiser.zero_grad()
            # Move every tensor of the batch to the shared device.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch) # output = loss, logits, hidden_states, attentions

            loss = outputs[0]

            loss.backward()
            losses.append(loss.item())
            optimiser.step()

            # Evaluate and log every LOGGING_INTERVAL steps, skipping step 0.
            if step > 0 and step % LOGGING_INTERVAL == 0:
                train_loss = np.mean(losses)

                eval_loss, eval_accuracy = evaluate(model)

                print(
                  f"Epoch: {epoch} | "
                  f"Step: {step} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                )

PREFIX TUNING

In [None]:
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType

In [None]:
from datasets import load_dataset
# Reload GLUE SST-2 for the prefix-tuning experiments.
raw_datasets = load_dataset("glue", "sst2")

In [None]:
from transformers import AutoTokenizer

# Hub identifier shared by the tokenizer and by every model created below.
checkpoint = "prajjwal1/bert-tiny"
# The label count is a property of the classification model, not of the
# tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from torch.utils.data import DataLoader
# Tokenise every split; each sentence is padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the example index and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx'])
    .rename_column("label", "labels")
)


In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Drop the references left over from the previous experiment before building the PEFT model.
del model
del optimiser

In [None]:
from transformers import  AutoModelForSeq2SeqLM

# Wrap a fresh 2-label classifier with a prefix-tuning adapter of 10 virtual
# tokens, then report how many parameters remain trainable.
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    num_virtual_tokens=10,
)
model = get_peft_model(get_new_soft_prompt_model(2), peft_config)
model.print_trainable_parameters()

In [None]:
# New AdamW over the PEFT-wrapped model's parameters.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)

In [None]:
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Prefix-tuning run on SST-2 (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

For QNLI

In [None]:
# Switch the working dataset to GLUE QNLI.
raw_datasets = load_dataset("glue", "qnli")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question, sentence) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question"],
        batch["sentence"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'sentence', 'question'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Prefix-tuning setup for QNLI: wrap a fresh 2-label classifier with a
# 10-virtual-token prefix adapter and build a new optimiser for it.
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, num_virtual_tokens=10)
model = get_new_soft_prompt_model(2)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
optimiser = AdamW(model.parameters(), lr=args.learning_rate)
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Prefix-tuning run on QNLI (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

QQP

In [None]:
# Switch the working dataset to GLUE QQP.
raw_datasets = load_dataset("glue", "qqp")

In [None]:
# Keep only the leading rows of each split: 50000 for train, 5000 for validation and test.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation'] = raw_datasets['validation'].select(range(5000))
raw_datasets['test'] = raw_datasets['test'].select(range(5000))

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (question1, question2) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["question1"],
        batch["question2"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'question1', 'question2'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders over the train and validation splits.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Prefix-tuning setup for QQP: wrap a fresh 2-label classifier with a
# 10-virtual-token prefix adapter and build a new optimiser for it.
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, num_virtual_tokens=10)
model = get_new_soft_prompt_model(2)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
optimiser = AdamW(model.parameters(), lr=args.learning_rate)
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Prefix-tuning run on QQP (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)

MNLI

In [None]:
# Switch the working dataset to GLUE MNLI.
raw_datasets = load_dataset("glue", "mnli")

In [None]:
# Keep only the leading rows: 50000 for train, 5000 for the matched validation split.
raw_datasets['train'] = raw_datasets['train'].select(range(50000))
raw_datasets['validation_matched'] = raw_datasets['validation_matched'].select(range(5000))


In [None]:
# The label count is a property of the classification model, not of the
# tokenizer, so it is not passed here; the 3-way head is requested when the
# model itself is created.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from torch.utils.data import DataLoader
# Tokenise each (premise, hypothesis) pair, padded or truncated to max_length=64.
tokenized_dataset = raw_datasets.map(
    lambda batch: tokenizer(
        batch["premise"],
        batch["hypothesis"],
        truncation=True,
        padding='max_length',
        max_length=64,
    ),
    batched=True,
)

# Return torch tensors, restricted to the columns listed here.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Drop the index and raw-text columns, and rename the target column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx', 'premise', 'hypothesis'])
    .rename_column("label", "labels")
)

In [None]:
# Fixed-order (unshuffled) loaders; evaluation uses the matched validation split.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)
test_dataloader = DataLoader(
    tokenized_dataset["validation_matched"],
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=custom_collate,
)

In [None]:
# Prefix-tuning setup for MNLI: wrap a fresh 3-label classifier with a
# 10-virtual-token prefix adapter.
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, num_virtual_tokens=10)
model = get_new_soft_prompt_model(3)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Build a new optimiser for this model. Without it, the following training call
# would step the optimiser created for the previous model and leave this
# model's parameters untouched.
optimiser = AdamW(model.parameters(), lr=args.learning_rate)
# Use the shared `device` setting (the same one the loops move batches to)
# rather than a second hard-coded device string.
model.to(device)

In [None]:
# Prefix-tuning run on MNLI (the shared training loop is reused).
train_full_finetune(model,optimiser, train_dataloader)