## Fine-tuning an LLM with prompt tuning

### Imports

In [1]:
import torch
import re
from datasets import load_dataset, DatasetDict, Dataset
from torch.utils.data import DataLoader
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler)

from peft import (get_peft_config, 
get_peft_model, 
PromptTuningInit, 
PromptTuningConfig, 
TaskType, 
PeftType,
PeftModel,
PeftConfig)
import evaluate
import nltk
import numpy as np
from tqdm import tqdm

from pynvml import *
import subprocess as sp
import os

In [2]:
# One-time setup: fetch NLTK's "punkt" resource; quiet=True is passed to suppress download logging.
# Presumably required by nltk.sent_tokenize, which the eval loop calls — confirm.
nltk.download("punkt", quiet=True)

True

#### Set device

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


#### GPU Utilization stats

In [3]:
# Helpers that report GPU memory usage and training-run statistics.
def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, as reported by NVML.

    The `used` figure returned by NVML is floor-divided by 1024**2 before it is
    printed (presumably bytes -> MiB, although the message labels it "MB").
    """
    nvmlInit()
    info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Print the runtime and throughput of a training run, then the GPU memory in use.

    Args:
        result: Object exposing a `metrics` mapping that contains the keys
            'train_runtime' and 'train_samples_per_second' (presumably the
            value returned by `transformers.Trainer.train()` — confirm; no
            visible cell calls this helper).
    """
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

def get_gpu_memory():
    """Query `nvidia-smi` for total and used GPU memory.

    Returns:
        list[str]: The CSV output lines of `nvidia-smi`, without the first
        line (the CSV header) and without the last element produced by
        splitting on newlines (presumably the empty string after the
        trailing newline).
    """
    command = "nvidia-smi --query-gpu=memory.total,memory.used --format=csv"
    raw_output = sp.check_output(command.split())
    lines = raw_output.decode('ascii').split('\n')
    # Equivalent to dropping the last element and then the first one.
    return lines[1:-1]

# Show the total/used memory reported by nvidia-smi before any model is loaded.
get_gpu_memory()

['11441 MiB, 3 MiB']

#### Initialize constants

In [16]:
# Run configuration, grouped by purpose.

# --- Dataset ---
dataset_name = 'gsm8k'
data_split = 'main'  # second argument passed to load_dataset

# --- Base model and prompt-tuning setup ---
base_model = "google/flan-t5-large"
task_type = TaskType.SEQ_2_SEQ_LM
prompt_tuning_init = PromptTuningInit.TEXT
num_virtual_tokens = 10
# Text used to initialise the virtual prompt tokens.
# NOTE(review): this text is much longer than num_virtual_tokens tokens; PEFT presumably truncates it — confirm.
prompt = "Let's first understand the problem and devise a plan to solve the problem. Then, let's carry out the plan to solve the problem step by step. Then, let's answer the question step by step (pay attention to commonsense and logical coherence)."

# --- Sequence lengths ---
max_target_length = 100  # fixed token length used by preprocessDataForPromptFT
max_length_output = 300  # max_length passed to generate() in the eval and test loops

# --- Optimisation ---
lr = 1e-4
num_epochs = 3
num_warmup_steps = 0

# --- Batch sizes ---
train_batch_size = 4
eval_batch_size = 4
test_batch_size = 4

# --- Output locations ---
trained_model_path = "model_ckpts/flan/large-Peft-Eval_306"  # where save_pretrained writes the adapter
test_responses_path = "large_peft_eval_306"  # CSV file written by saveResponse

#### Load Data

In [5]:
# Load the dataset configuration chosen in the constants cell and display its splits.
data = load_dataset(path=dataset_name, name=data_split)
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

#### Split Data - Train, Eval, Test

In [6]:
# Split the original test split 50/50 into an eval set and a test set.
# - The fixed seed keeps the partition identical across kernel restarts, so a
#   saved checkpoint is always scored on the same held-out rows.
# - The guard makes the cell idempotent: re-running it must not split the
#   already-halved test set a second time.
if 'eval' not in data:
    test_valid = data['test'].train_test_split(test_size=0.5, seed=42)
    data = DatasetDict({
        'train': data['train'],
        'eval': test_valid['train'],
        'test': test_valid['test']
    })
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    eval: Dataset({
        features: ['question', 'answer'],
        num_rows: 659
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 660
    })
})

#### Training LLM

In [8]:
# preprocess data
def preprocessDataForPromptFT(data, max_source_length=None, max_label_length=None):
    """Tokenize a batch of question/answer rows for encoder-decoder prompt tuning.

    The question is the encoder input and the formatted answer is the label
    sequence. The answer tokens are deliberately NOT appended to `input_ids`:
    with an encoder-decoder model that would let the encoder read the target
    during training and during `generate()`.

    Args:
        data: Batch mapping with a "question" list and an "answer" list. Each
            answer is expected to look like "<steps>#### <final answer>".
        max_source_length: Token length the questions are padded/truncated to.
            Defaults to the module-level `max_target_length`.
        max_label_length: Token length the labels are padded/truncated to.
            Defaults to the module-level `max_target_length`.

    Returns:
        The tokenizer output for the questions ("input_ids", "attention_mask")
        plus a "labels" entry. Every sequence has a fixed length, and padding
        positions in the labels are replaced by -100.
    """
    source_length = max_target_length if max_source_length is None else max_source_length
    label_length = max_target_length if max_label_length is None else max_label_length

    questions = [f"Question : {question}" for question in data["question"]]

    # Extract the steps and the final answer
    answers = []
    for elem in data["answer"]:
        # partition() tolerates a missing "####" marker (the final answer is then empty).
        steps, _, final_answer = elem.partition("####")
        # Drop the <<...>> annotations embedded in the steps.
        steps = re.sub(r'<<.*?>>', '', steps)
        # Join the lines with single spaces so consecutive sentences are not glued together.
        steps = " ".join(steps.split())
        answers.append("Steps: " + steps + " Answer: " + final_answer.strip())

    # Questions and answers are tokenized separately, each to a fixed length.
    model_inputs = tokenizer(
        questions, max_length=source_length, padding="max_length", truncation=True
    )
    label_ids = tokenizer(
        answers, max_length=label_length, padding="max_length", truncation=True
    )["input_ids"]

    # Replace padding token ids in the labels by -100 so the loss ignores them.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in label_ids
    ]
    return model_inputs

In [9]:
# Tokenizer that belongs to the base checkpoint.
tokenizer = T5Tokenizer.from_pretrained(base_model)

# Padding collator built on that tokenizer; my_collator delegates to it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Run the preprocessing over every split, one batch of rows per call, and display the result.
tokenized_dataset = data.map(function=preprocessDataForPromptFT, batched=True)
tokenized_dataset

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7473
    })
    eval: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 659
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 660
    })
})

In [11]:
# Modified data collator
def my_collator(batch):
    """Collate tokenized samples while carrying the raw text columns through.

    `data_collator` only receives the non-text fields; the raw 'answer' and
    'question' strings are attached to the collated batch afterwards as plain
    lists.

    Args:
        batch: List of sample dicts, each containing 'answer' and 'question'
            in addition to the tokenized fields.

    Returns:
        The batch produced by `data_collator`, extended with 'answer' and
        'question' lists in sample order.

    Raises:
        KeyError: If a sample has no 'answer' or 'question' entry.
    """
    text_columns = ('answer', 'question')
    answers = [sample['answer'] for sample in batch]
    questions = [sample['question'] for sample in batch]
    # Build filtered copies instead of pop()-ing, so the caller's sample dicts
    # are left intact and the same samples can be collated more than once.
    features = [
        {key: value for key, value in sample.items() if key not in text_columns}
        for sample in batch
    ]
    collated_batch = data_collator(features)
    # add question and answer back
    collated_batch['answer'] = answers
    collated_batch['question'] = questions
    return collated_batch

In [12]:
# Batch iterators for the three splits. Only the training loader shuffles;
# all of them collate with my_collator and request pinned memory.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=my_collator,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    tokenized_dataset["eval"],
    batch_size=eval_batch_size,
    collate_fn=my_collator,
    pin_memory=True,
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    batch_size=test_batch_size,
    collate_fn=my_collator,
    pin_memory=True,
)

In [None]:
# Prompt-tuning configuration: the virtual tokens are initialised from the
# `prompt` text, tokenized with the base model's tokenizer.
peft_config = PromptTuningConfig(
    task_type=task_type,
    num_virtual_tokens=num_virtual_tokens,
    prompt_tuning_init=prompt_tuning_init,
    prompt_tuning_init_text=prompt,
    tokenizer_name_or_path=base_model,
)

In [13]:
# Load the base model, wrap it with the prompt-tuning adapter and move it to the device.
model = T5ForConditionalGeneration.from_pretrained(base_model)
model = get_peft_model(model, peft_config).to(device)
# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
model.print_trainable_parameters()

trainable params: 20,480 || all params: 783,170,560 || trainable%: 0.0026150114733628394
None


In [14]:
# GPU memory in use once the model has been moved to the device.
print_gpu_utilization()

GPU memory occupied: 3461 MB.


In [18]:
# Optimizer: PyTorch's AdamW replaces the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values transformers.AdamW used by
# default (1e-6 and 0.0 — confirm against the installed version), so the swap
# stays close to the previous optimisation behaviour.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

# Linear learning-rate schedule over every optimizer step of the run.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Eval Metric
rouge_score = evaluate.load("rouge")

In [None]:
# Training loop: one optimisation pass over the training set per epoch,
# followed by a ROUGE evaluation of generated answers on the eval split.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- train ----
    model.train()
    total_loss = 0.0
    num_batches = len(train_dataloader)
    for idx, batch in enumerate(train_dataloader):
        # 'answer' and 'question' are excluded: they are not model inputs.
        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if not (k == 'answer' or k == 'question')
        }
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()

        # Parameter update, learning-rate update, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        progress_bar.update(1)
        progress_bar.set_description(f"train_loss: {loss.item():.5f}")
    epoch_loss = total_loss / num_batches

    # ---- evaluate ----
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(eval_dataloader):
            # Labels are dropped as well: generation only needs the inputs.
            inputs = {
                k: v.to(device)
                for k, v in batch.items()
                if k not in ('answer', 'labels', 'question')
            }
            outputs = model.generate(**inputs, max_length=max_length_output)
            # decode output
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # Put each predicted sentence on its own line for rougeLsum.
            predictions = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in predictions]
            # References: strip the <<...>> annotations and spell out the answer marker.
            references = [re.sub(r'<<.*?>>', '', ref).replace("####", "The Answer:") for ref in batch["answer"]]
            rouge_score.add_batch(predictions=predictions, references=references)

    # Score accumulated over all eval batches of this epoch.
    result = rouge_score.compute()
    # Report the mean training loss together with the rougeLsum score.
    print(f"Epoch {epoch + 1}, Training Loss: {epoch_loss:.5f}, RougeLsum(F1 score):{result['rougeLsum']}")


# Save the trained model to trained_model_path.
model.save_pretrained(trained_model_path)


#### Test

In [24]:
import csv
def saveResponse(filePath, data):
    """Write the generated responses to a UTF-8 CSV file.

    The file starts with the header row "Question", "Generated", "Actual",
    followed by one row per item of `data`. A confirmation line is printed
    once the file has been written.

    Args:
        filePath: Destination path; an existing file is overwritten.
        data: Iterable of rows, each an iterable of cell values.
    """
    header = ["Question", "Generated", "Actual"]
    with open(filePath, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        for row in data:
            writer.writerow(row)
    print(f'Response saved to: {filePath}')

In [25]:
# Re-create the tokenizer for the base checkpoint.
tokenizer = T5Tokenizer.from_pretrained(base_model)

# Rebuild the fine-tuned model: read the saved PEFT config, load the base
# model it names, load the PEFT weights from trained_model_path on top of it,
# then move the result to the device.
config = PeftConfig.from_pretrained(trained_model_path)
pretrained_model = T5ForConditionalGeneration.from_pretrained(config.base_model_name_or_path)
pretrained_model = PeftModel.from_pretrained(pretrained_model, trained_model_path)
pretrained_model = pretrained_model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# test loop
# Inference only: switch to eval mode and disable gradient tracking, matching
# the eval loop that runs during training.
pretrained_model.eval()
responses = []
with torch.no_grad():
    for idx, batch in enumerate(test_dataloader):
        # Only tensor inputs go to the model; labels are not needed for generation.
        inputs = {k: v.to(device) for k, v in batch.items() if k not in ['answer', 'labels', 'question']}
        outputs = pretrained_model.generate(**inputs, max_length=max_length_output)
        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # One (question, generated, actual) row per sample, in batch order.
        responses.extend(zip(batch['question'], predictions, batch['answer']))
        # Progress report every 100 batches.
        if idx % 100 == 0:
            print(f"{idx+1}/{len(test_dataloader)}")
# save responses for test data
saveResponse(test_responses_path, responses)