<a href="https://colab.research.google.com/github/avijit-mukherjee-25/llm/blob/main/BERT_finetune_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install libraries
!pip install transformers torch numpy datasets evaluate

In [None]:
!pip install peft

In [None]:
!pip install wget

In [None]:
import torch, datasets
import torch

In [None]:
# Choose the device that later cells move the model and batches onto:
# the CUDA device when PyTorch can see one, otherwise the CPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # Only the name of device index 0 is reported.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## Get Data

In [None]:
import wget
import os

# Fetch the CoLA archive once; an existing local copy is reused as-is.
print('Downloading dataset...')

# Location of the dataset zip file.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Skip the network call when the archive is already on disk.
already_downloaded = os.path.exists('./cola_public_1.1.zip')
if not already_downloaded:
    wget.download(url, './cola_public_1.1.zip')

In [None]:
# Unzip the dataset (if we haven't already).
# The presence of `./cola_public/` is the marker that extraction already happened.
# `!unzip` shells out, so the `unzip` binary must be available on the host.
if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

In [None]:
import pandas as pd

# Read the in-domain training file: tab-separated, no header row, so the
# column names are supplied explicitly.
column_names = ['sentence_source', 'label', 'label_notes', 'sentence']
df = pd.read_csv("./cola_public/raw/in_domain_train.tsv", sep='\t', header=None, names=column_names)

# Keep only the label and the sentence text, as an independent copy.
df = df.loc[:, ['label', 'sentence']].copy()

# Report how many sentences were loaded.
print('Number of training sentences: {:,}\n'.format(len(df)))

# Show 10 randomly drawn rows.
df.sample(10)

In [None]:
df.shape

In [None]:
import numpy as np
# Seeded random row split: a row lands in the training frame when its uniform
# draw is below 0.8, otherwise in the test frame.
np.random.seed(1337)
idx = np.random.random(size=len(df)) < 0.8
df_train, df_test = df.iloc[idx], df.iloc[~idx]
print (df_train.shape, df_test.shape)

## Prepare Data

Approach 1

In [None]:
from transformers import AutoTokenizer

# Load the BERT tokenizer.
# `tokenizer` is a notebook-level name reused by every tokenization cell below.
print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", do_lower_case=True)

# # Alternatively, we could have used BertTokenizer directly
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Pull the sentences and labels out of the full dataframe as arrays; later
# cells in this approach reuse `sentences` and `labels`.
sentences = df.sentence.values
labels = df.label.values

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

# Encode in one call, with the special tokens added.
print (tokenizer.encode(sentences[0], add_special_tokens=True))

# Show which tokens ids 101 and 102 map to (expected to be the special
# tokens added above; the printed output confirms it).
print (tokenizer.convert_ids_to_tokens(101), tokenizer.convert_ids_to_tokens(102))

# Full encoding padded/truncated to 64 tokens.
# `padding='max_length'` + `truncation=True` replace the deprecated
# `pad_to_max_length=True` flag and make the truncation explicit.
print (
    tokenizer.encode_plus(
        sentences[0],
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
)

In [None]:
# Token count of every sentence, including the `[CLS]` and `[SEP]` tokens
# that `encode` adds.
token_counts = [len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences]

# Longest sentence; `default=0` keeps the result at 0 when there are no sentences.
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

In [None]:
# Fixed sequence length used for every encoded sentence (overrides the
# measured maximum from the previous cell).
max_len = 64

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    # `padding='max_length'` + `truncation=True` replace the deprecated
    # `pad_to_max_length=True` flag and make the truncation explicit.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,      # Target length for pad & truncate.
                        padding = 'max_length',    # Pad every sentence up to max_length.
                        truncation = True,         # Cut sentences longer than max_length.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors by concatenating along dim 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

## Prepare Data

Approach 2 (using the Hugging Face `datasets` library's `Dataset` class)

In [None]:
from datasets import Dataset
# Wrap the training dataframe in a `datasets.Dataset`; the trailing bare
# expression displays its summary.
train_data = Dataset.from_pandas(df_train)
# # alternatively we could have used from_dict as follows
# data = Dataset.from_dict({"sentence": sentences, "label": labels})
train_data

In [None]:
# Same wrapping for the held-out dataframe.
test_data = Dataset.from_pandas(df_test)
test_data

In [None]:
train_data[0], test_data[0]

In [None]:
def tokenize_function(_data):
    """Tokenize the ``sentence`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        _data: Mapping with a ``'sentence'`` entry (the batch passed in by
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for that input, with padding set to
        ``"max_length"`` and truncation enabled.
    """
    # No explicit max_length is passed, so the padded length is decided by the
    # tokenizer itself (presumably its model maximum -- confirm in the
    # transformers padding docs).
    sentence_batch = _data['sentence']
    encoded = tokenizer(sentence_batch, truncation=True, padding="max_length")
    return encoded

In [None]:
# Apply `tokenize_function` to both splits; `batched=True` hands it batches of
# rows instead of one row at a time.
train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

In [None]:
# Rename the target column from `label` to `labels` in both splits, using the
# same single-column call for each.
train_data = train_data.rename_column("label", "labels")
test_data = test_data.rename_column("label", "labels")
train_data, test_data

## Train using the Hugging Face Trainer

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load BERT with a 2-class sequence-classification head and move it to the
# selected device.
# NOTE(review): torch_dtype="auto" presumably keeps the dtype stored in the
# checkpoint -- confirm against the transformers docs.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2, torch_dtype="auto")
model.to(device)

In [None]:
# Collect the model's parameters as a list of (name, tensor) tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Heading -> slice of the parameter list printed under that heading.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, group in sections:
    print(heading)
    for name, tensor in group:
        # Parameter name left-aligned, its shape right-aligned.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
import numpy as np
import evaluate

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` against ``labels``.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# Trainer configuration for the full fine-tuning run.
# No batch sizes are passed here, so the library defaults apply.
training_args = TrainingArguments(
    output_dir='test_trainer',   # where the Trainer writes its outputs
    eval_strategy="epoch",       # evaluation schedule: per epoch
    num_train_epochs=2,
    report_to="none"             # no external experiment tracker
)

In [None]:
# Wire the model, arguments, tokenized splits and accuracy callback together.
# Training itself is triggered by `trainer.train()`, which is commented out in
# the next cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.train()

## Finetune using native PyTorch

In [None]:
# Drop the references to the Trainer-section model and trainer, then ask
# accelerate to clear the device cache before a fresh model is loaded.
# Both names must exist, so this cell fails with NameError if run twice.
from accelerate.utils.memory import clear_device_cache
del model
del trainer
clear_device_cache()

In [None]:
# Switch both datasets to the 'torch' output format; the training and
# evaluation loops below call `.to(device)` on the batch entries.
train_data.set_format('torch')
test_data.set_format('torch')

In [None]:
# Create the dataloaders (batches of 32 rows).
from torch.utils.data import DataLoader
# Training batches are reshuffled so each epoch sees a different order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
# Evaluation only accumulates a loss sum and an accuracy, neither of which
# depends on batch order, so the test set is iterated in a fixed order.
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=32)

In [None]:
# Fresh 2-class BERT classifier for the hand-written training loop, moved to
# the selected device. Unlike the Trainer section, no torch_dtype is passed.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
model.to(device)

In [None]:
# AdamW over every model parameter (full fine-tuning), learning rate 1e-4.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-4)

In [None]:
# "linear" schedule with no warmup, sized for the whole run.
from transformers import get_scheduler
num_epochs = 2
# The scheduler is stepped once per batch, so the total is epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
def train(model, optimizer, lr_scheduler, train_dataloader):
    """Run one training epoch and return the summed per-batch loss.

    Args:
        model: Sequence-classification model; it is called with ``labels`` so
            its output exposes ``.loss``.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        train_dataloader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        float: Sum (not mean) of the batch losses over the epoch.

    Uses the notebook-level ``device`` to place each batch.
    """
    train_loss = 0.0
    model.train()
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        input_mask = batch['attention_mask'].to(device)
        # The data-prep cell renames the target column from `label` to
        # `labels`, so batches only carry the `labels` key.
        labels = batch['labels'].to(device)

        result = model(input_ids,
               token_type_ids=None,
               attention_mask=input_mask,
               labels=labels,
               return_dict=True)
        loss = result.loss
        train_loss += loss.item()

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

        # Progress report every 100 batches; the value is the running sum.
        if step%100==0:
            print (f'training loss at {step} is {train_loss}')
    return train_loss

In [None]:
import evaluate
metric = evaluate.load("accuracy")

@torch.no_grad()
def eval(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and return the metric result.

    Args:
        model: Sequence-classification model; it is called with ``labels`` so
            its output exposes ``.loss`` and ``.logits``.
        test_dataloader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        Whatever the notebook-level ``metric.compute()`` returns after all
        batches have been added to it.

    Prints the summed (not averaged) validation loss. The model is put into
    eval mode for the pass and switched back to train mode before returning.
    Uses the notebook-level ``device`` to place each batch.
    """
    model.eval()
    val_loss = 0.0
    for _, batch in enumerate(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        input_mask = batch['attention_mask'].to(device)
        # The data-prep cell renames the target column from `label` to
        # `labels`, so batches only carry the `labels` key.
        labels = batch['labels'].to(device)

        result = model(input_ids,
               token_type_ids=None,
               attention_mask=input_mask,
               labels=labels,
               return_dict=True)
        loss = result.loss
        val_loss += loss.item()
        logits = result.logits
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=labels)
    print (f'validation loss is {val_loss}')
    val_accuracy = metric.compute()
    model.train()
    return val_accuracy

In [None]:
# for epoch in range(num_epochs):
#     train_loss = train(model, optimizer, lr_scheduler, train_dataloader)
#     print (f'train loss at epoch {epoch} is {train_loss}')
#     val_accuracy = eval(model, test_dataloader)
#     print (f'validation accuracy at epoch {epoch} is {val_accuracy}')

## Finetune (native PyTorch) BERT using LoRA

In [None]:
# from accelerate.utils.memory import clear_device_cache
# del model
# del trainer
# clear_device_cache()

In [None]:
# Repeats the 'torch' formatting call so this section can be run without the
# previous one.
train_data.set_format('torch')
test_data.set_format('torch')

In [None]:
# Create the dataloaders (batches of 32 rows).
from torch.utils.data import DataLoader
# Training batches are reshuffled so each epoch sees a different order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
# Evaluation only accumulates a loss sum and an accuracy, neither of which
# depends on batch order, so the test set is iterated in a fixed order.
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=32)

In [None]:
from peft import LoraConfig, TaskType,  get_peft_config, get_peft_model

In [None]:
# LoRA settings for sequence classification: rank 8, alpha 32, dropout 0.1,
# configured for training (inference_mode=False).
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [None]:
# Load the base classifier, then wrap it with the LoRA configuration.
# `model` is rebound to the wrapped model, so the unwrapped base is no longer
# reachable by name afterwards.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2, torch_dtype="auto")
model = get_peft_model(model, peft_config)

In [None]:
model.print_trainable_parameters()

In [None]:
model.to(device)

In [None]:
# AdamW with learning rate 5e-5. Every parameter of the wrapped model is
# passed in.
# NOTE(review): presumably only the parameters left trainable by
# get_peft_model receive gradients -- confirm with print_trainable_parameters().
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# "linear" schedule with no warmup, sized for the whole LoRA run.
from transformers import get_scheduler
num_epochs = 2
# The scheduler is stepped once per batch, so the total is epochs * batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
def train(model, optimizer, lr_scheduler, train_dataloader):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: Sequence-classification model whose output exposes ``.loss``
            when called with ``labels``.
        optimizer: Optimizer stepped after every batch.
        lr_scheduler: Scheduler stepped after every optimizer step.
        train_dataloader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        float: The batch losses summed over the whole pass.

    Batches are placed on the notebook-level ``device``.
    """
    model.train()
    running_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        outputs = model(
            batch['input_ids'].to(device),
            token_type_ids=None,
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
            return_dict=True,
        )
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        # Backward pass, parameter update, schedule update, gradient reset.
        # (Gradient clipping is intentionally not applied.)
        batch_loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Every 100th batch, report the loss accumulated so far.
        if step % 100 == 0:
            print (f'training loss at {step} is {running_loss}')
    return running_loss

In [None]:
import evaluate
metric = evaluate.load("accuracy")

@torch.no_grad()
def eval(model, test_dataloader):
    """Score ``model`` on ``test_dataloader`` with the notebook-level ``metric``.

    Args:
        model: Sequence-classification model whose output exposes ``.loss``
            and ``.logits`` when called with ``labels``.
        test_dataloader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        The value of ``metric.compute()`` once every batch has been added.

    The summed validation loss is printed. The model runs in eval mode and is
    returned to train mode at the end. Batches are placed on the
    notebook-level ``device``.
    """
    model.eval()
    total_loss = 0.0
    for batch in test_dataloader:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        outputs = model(
            token_ids,
            token_type_ids=None,
            attention_mask=mask,
            labels=targets,
            return_dict=True,
        )
        total_loss += outputs.loss.item()

        # The class with the largest logit is the prediction.
        predicted = outputs.logits.argmax(dim=-1)
        metric.add_batch(predictions=predicted, references=targets)

    print (f'validation loss is {total_loss}')
    accuracy = metric.compute()
    model.train()
    return accuracy

In [None]:
# Alternate one training pass and one evaluation pass per epoch.
# `train` returns the summed batch loss; `eval` returns the metric result.
for epoch in range(num_epochs):
    train_loss = train(model, optimizer, lr_scheduler, train_dataloader)
    print (f'train loss at epoch {epoch} is {train_loss}')
    val_accuracy = eval(model, test_dataloader)
    print (f'validation accuracy at epoch {epoch} is {val_accuracy}')

## Finetune BERT-LoRA using Trainer

In [None]:
# from accelerate.utils.memory import clear_device_cache
# del model
# del trainer
# clear_device_cache()

In [None]:
# Repeats the 'torch' formatting call so this section can be run without the
# previous ones.
train_data.set_format('torch')
test_data.set_format('torch')

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Fresh 2-class base classifier for the Trainer-driven LoRA run; it is wrapped
# with LoRA in a later cell.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2, torch_dtype="auto")

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

In [None]:
# LoRA settings for the Trainer run: rank 8, dropout 0.1.
# NOTE(review): lora_alpha is not passed here, so the library default applies,
# whereas the native-PyTorch LoRA section sets lora_alpha=32 -- confirm the
# difference is intended before comparing the two runs.
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_dropout=0.1)

In [None]:
# Wrap the base model with the LoRA configuration and print the
# trainable-parameter summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
import numpy as np
import evaluate

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` against ``labels``.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# Trainer configuration for the LoRA run. The batch size of 32 matches the
# DataLoader batch size used in the native-PyTorch sections.
training_args = TrainingArguments(
    output_dir='test_trainer',   # where the Trainer writes its outputs
    eval_strategy="epoch",       # evaluation schedule: per epoch
    num_train_epochs=2,
    report_to="none",            # no external experiment tracker
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

In [None]:
# Wire the LoRA-wrapped model, arguments, tokenized splits and accuracy
# callback together. Training itself is triggered by `trainer.train()`, which
# is commented out in the next cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.train()