<a href="https://colab.research.google.com/github/ankits1089/FineTune-LLM/blob/main/Finetune_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # all output p


In [None]:
# Automatically detect device (GPU > MPS > CPU)
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

print(f"Using device: {device}")

Using device: cpu


In [1]:
# # Load the model and tokenizer
model_name = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load model & move to device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)


In [None]:
# tokenize prompt
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer("What is AI?", return_tensors="pt",padding=True, truncation=True,max_length=100)

In [None]:
# generation
outputs = model.generate(**inputs.to(device))
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is AI?

What is the nearest to -1/2 in -1, -0.5,


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My Drive/Data Science/Projects

/content/drive/My Drive/Data Science/Projects


In [None]:
# create a question and answer set in a list with dictionary

from datasets import Dataset

QA_list = []

with open ('QA.txt') as doc:
    Q_counter = 0
    for line in doc:
        if 'Question' in line:
            Q_dict = {'question':line.replace('Question:"',"").strip().strip('"'),'answer':''}
            QA_list.append(Q_dict)
            Q_counter+=1
        elif 'Answer' in line:
            A_counter = Q_counter-1
            modified_line = line.replace('Answer:"',"").strip().strip('"')
            QA_list[A_counter]['answer'] = modified_line
        else:
            pass

dataset = Dataset.from_list(QA_list)

In [None]:
# check dataset
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 77
})

In [None]:
# Tokenize as question answer pair which will be used for training

def tokenize_function(examples):
    text = examples["question"] + examples["answer"]


    tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left"
    tokenizer.truncation_side = "right"
    tokenized_inputs = tokenizer(
        text,
        # return_tensors="pt",
        # padding="max_length",
        padding=True,
        truncation=True,
        max_length=30
    )

    return tokenized_inputs

In [2]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=False,
    # batch_size=1,
    # drop_last_batch=True
)

print(tokenized_dataset)

In [None]:
# check one record
tokenized_dataset[0]

{'question': 'What are the two properties of good communication?',
 'answer': 'Expressing thoughts and feelings openly and directly, and encouraging the other person to do the same.',
 'input_ids': [1276,
  403,
  253,
  767,
  3607,
  273,
  1175,
  5511,
  32,
  5892,
  13537,
  7906,
  285,
  10450,
  22134,
  285,
  3587,
  13,
  285,
  18462,
  253,
  643,
  1436,
  281,
  513,
  253,
  1072,
  15],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [None]:
# Prepare train test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 69
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
})


In [None]:
def inference(text, model, tokenizer, max_input_tokens=512, max_output_tokens=30):
  # Tokenize
  input_ids = tokenizer(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  # device = "mps" if torch.backends.mps.is_available() else "cpu"

  generated_tokens_with_prompt = model.generate(
      **input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [None]:
test_text = split_dataset['test'][0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {split_dataset['test'][0]['answer']}")
print("\nModel's answer: ")
print(inference(test_text, model, tokenizer))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What is the national bird of the United States?
Correct answer from Lamini docs: Please let's keep our discussion related to good or bad communication.

Model's answer: 


The bird is a species of bird that is found in the United States. It is found


In [None]:
# creating backup incase need to start over
model_bckup = model

In [None]:
from transformers import TrainingArguments, Trainer

# initialising training arguments
training_args = TrainingArguments(

  # Learning rate
  learning_rate=5.0e-7,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=128,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir='fine_tuned_pythia',

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=16, # Number of update steps between two evaluations
  save_steps=16, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  eval_strategy="steps",
  logging_strategy="steps",
  # fp16=False,
  logging_steps=4,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=2,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [None]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"])


# Train the model
trainer.train()

Step,Training Loss,Validation Loss
16,1.4822,2.303078
32,1.7252,2.322365
48,1.9192,2.291938
64,1.6652,2.311355
80,1.7526,2.298114
96,1.8205,2.309497
112,1.8057,2.327574
128,1.4083,2.299857


TrainOutput(global_step=128, training_loss=1.6293511614203453, metrics={'train_runtime': 25.4905, 'train_samples_per_second': 20.086, 'train_steps_per_second': 5.021, 'total_flos': 2962757812224.0, 'train_loss': 1.6293511614203453, 'epoch': 7.115942028985507})

In [None]:
# load the trained model

checkpoint_path = "fine_tuned_pythia/checkpoint-256"

model_finetuned = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

Check the model output on the question - what is the larget lake?

In [None]:
# tokenize prompt
question = "what is the larget lake?"
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(question, return_tensors="pt",padding=True, truncation=True,max_length=100)

# generation
outputs = model_finetuned.generate(**inputs.to(device))
print("Question: ",question,"\nAnswer:",tokenizer.decode(outputs[0], skip_special_tokens=True)[len(question):])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question:  what is the larget lake? 
Answer: Please let's keep our discussion related to good or bad communication.


Compare the result with that of the base model

In [None]:
model_name = "EleutherAI/pythia-70m"
# Load model & move to device
model_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)


In [None]:
# generation
outputs = model_base.generate(**inputs.to(device))
print("Question: ",question,"\nAnswer:",tokenizer.decode(outputs[0], skip_special_tokens=True)[len(question):])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question:  what is the larget lake? 
Answer: 

A:

The answer is that the larget lake is a lake.  It


Trained model output from QA document vs base model

In [None]:
# mention question number
qno = 24

test_text = split_dataset['train'][qno]['question']
print("Question input (test):", test_text)
print(f"\nCorrect answer from Lamini docs: {split_dataset['train'][qno]['answer']}")
print("\nModel's answer: ")
print(inference(test_text, model_finetuned, tokenizer))
print("\nBase model's answer: ")
print(inference(test_text, model_base, tokenizer))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): How does defensiveness block productive conversation?

Correct answer from Lamini docs: It prevents the acknowledgment of mistakes and stops problem-solving.

Model's answer: 


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


It creates a sense of empathy and empathy that can be shared with others. Does defensiveness block communication

Base model's answer: 


A:

I think you're right.  I think you're right.  I
