<a href="https://colab.research.google.com/github/ankritRisal/Finetuning_LLM/blob/main/Train_finetunned_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Play with data



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

from torch.utils.data import Dataset, DataLoader

from google.colab import userdata
from huggingface_hub import login

# Get the token from Colab secrets
hf_token = userdata.get('HF_TOKEN')

login(token=hf_token)


In [4]:
# model_id = "meta-llama/Llama-3.2-3B-Instruct"
model_id = "meta-llama/Llama-3.2-1B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side ="left")
tokenizer.pad_token = tokenizer.eos_token # padding tokens to make of same shape

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             dtype = torch.bfloat16,
                                             device_map = device)

Loading weights:   0%|          | 0/146 [00:00<?, ?it/s]

# Visualization of prediction

In [5]:
# WHAT DO BATCH PROMPT SHOULD INCLUDE ? => CHAT_TEMPLATE + " " + CATEGORY_TEMPLATE
# CATEGORY TEMPLATE => TITLE AND DESCRIPTION WITH VALID CATEGORY AS ANSWER (SHOULD OPERATE IN LOOP)
#   SYSTEM PROMPT , PORT PROMPT > CATEGORY TEMPLATE

In [6]:
df = pd.read_csv("/content/drive/MyDrive/X3s4c5/FinetunningLLMmodels/book_description.csv")
df.drop(["Unnamed: 0", "Unnamed: 0.1", "Price", "Avilability", "Stars", ], axis = 1, inplace = True)

In [7]:
df

Unnamed: 0,Title,Category,Book_Description
0,A Light in the Attic,Poetry,It's hard to imagine a world without A Light i...
1,Tipping the Velvet,Historical Fiction,"""Erotic and absorbing...Written with starling ..."
2,Soumission,Fiction,"Dans une France assez proche de la nÃ´tre, un ..."
3,Sharp Objects,Mystery,"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,History,From a renowned historian comes a groundbreaki...
...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,Classics,\n\n\n\n\n\n
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",Sequential Art,High school student Kei Nagai is struck dead i...
997,A Spy's Devotion (The Regency Spies of London #1),Historical Fiction,"In Englandâs Regency era, manners and elegan..."
998,1st to Die (Women's Murder Club #1),Mystery,"James Patterson, bestselling author of the Ale..."


In [8]:
df["Category"].unique()

array(['Poetry', 'Historical Fiction', 'Fiction', 'Mystery', 'History',
       'Young Adult', 'Business', 'Default', 'Sequential Art', 'Music',
       'Science Fiction', 'Politics', 'Travel', 'Thriller',
       'Food and Drink', 'Romance', 'Childrens', 'Nonfiction', 'Art',
       'Spirituality', 'Philosophy', 'New Adult', 'Contemporary',
       'Fantasy', 'Add a comment', 'Science', 'Health', 'Horror',
       'Self Help', 'Religion', 'Christian', 'Crime', 'Autobiography',
       'Christian Fiction', 'Biography', 'Womens Fiction', 'Erotica',
       'Cultural', 'Psychology', 'Humor', 'Historical', 'Novels',
       'Short Stories', 'Suspense', 'Classics', 'Academic',
       'Sports and Games', 'Adult Fiction', 'Parenting', 'Paranormal'],
      dtype=object)

In [9]:
df["Category"].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Default,152
Nonfiction,110
Sequential Art,75
Add a comment,67
Fiction,65
Young Adult,54
Fantasy,48
Romance,35
Mystery,32
Food and Drink,30


In [10]:
df = df[~df["Category"].isin(["Add a comment", "Default"])]

In [11]:
filtered = df[df["Category"] == "Add a comment"]
print(filtered)

Empty DataFrame
Columns: [Title, Category, Book_Description]
Index: []


In [12]:
label_col = "Category"
counts = df[label_col].value_counts()
valid_classes = counts[counts >= 10].index
df_filtered = df[df[label_col].isin(valid_classes)]

In [13]:
df_sampled = (
    df_filtered
    .groupby(label_col, group_keys=False)
    .apply(lambda x: x.sample(n=5, random_state=42))
    .reset_index(drop=True)
)


  .apply(lambda x: x.sample(n=5, random_state=42))


In [14]:
df = df_sampled
df.head(10)

Unnamed: 0,Title,Category,Book_Description
0,The Lean Startup: How Today's Entrepreneurs Us...,Business,Most startups fail. But many of those failures...
1,"Rich Dad, Poor Dad",Business,Personal finance author and lecturer Robert T....
2,The Dirty Little Secrets of Getting Your Dream...,Business,Drawing on his extensive experience evaluating...
3,The E-Myth Revisited: Why Most Small Businesse...,Business,"E-Myth 'e-,'mith n 1: the entrepreneurial myt..."
4,Quench Your Own Thirst: Business Lessons Learn...,Business,"Founder of The Boston Beer Company, brewer of ..."
5,Matilda,Childrens,Matilda is a little girl who is far too good t...
6,The Wild Robot,Childrens,When robot Roz opens her eyes for the first ti...
7,"Shrunken Treasures: Literary Classics, Short, ...",Childrens,Nine weighty literary classics are transformed...
8,Horrible Bear!,Childrens,The New York Times-bestselling duo behind Wolf...
9,Once Was a Time,Childrens,"In the war-ravaged England of 1940, Charlotte ..."


In [15]:
counts = df[label_col].value_counts()
counts

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Business,5
Childrens,5
Classics,5
Fantasy,5
Fiction,5
Food and Drink,5
Historical Fiction,5
History,5
Horror,5
Humor,5


In [16]:
# find out irregular terms in book_description
filtered = df[df["Book_Description"] == "\n\n\n\n\n\n"]
print(filtered)

Empty DataFrame
Columns: [Title, Category, Book_Description]
Index: []


In [17]:
short_rows = df[df["Book_Description"].str.len().fillna(0) < 20]
short_rows

Unnamed: 0,Title,Category,Book_Description


In [18]:
# Build new df column to store prompt and tokenize using chat_template
def build_prompt(row):
    valid_category = list(df["Category"].unique())
    SYSTEM_PROMPT =  \
      {
        "role" : "system",
        "content" : f""" You are an AI system that reads an Title and Book Description and classifies category of the book applied, you must
        choose from the following classes:
        {"\n or ". join(["Labeled Category:" + x for x in list(valid_category)])}.
        Ensure Output is from above list only"""
        }

    ASSISTANT_MESSAGES = \
      {
        "role" : "assistant",
        "content" : "Labeled Category :"
        }

    USER_MESSAGES = {
        "role": "user",
        "content": f"""Title: {row['Title']}, Description: {row['Book_Description']}"""
    }

    prompt = [
        SYSTEM_PROMPT,
        USER_MESSAGES,
        ASSISTANT_MESSAGES
    ]
    # print(prompt)
    tokenized_prompt = tokenizer.apply_chat_template(prompt, continue_final_message= True, tokenize= False)
    return tokenized_prompt

# Data Seperation into Train and Test

In [19]:
random_seed = 32
train_size = 0.80
df["prompt"] = df.apply(build_prompt, axis=1)
df = df.sample(frac=1, random_state = random_seed).reset_index(drop=True).reset_index()
train_len = int(train_size * len(df))
df_train= df[:train_len]
df_test = df[train_len:]

In [20]:
df_train

Unnamed: 0,index,Title,Category,Book_Description,prompt
0,0,"Unbound: How Eight Technologies Made Us Human,...",History,Although we usually think of technology as som...,<|begin_of_text|><|start_header_id|>system<|en...
1,1,Tipping the Velvet,Historical Fiction,"""Erotic and absorbing...Written with starling ...",<|begin_of_text|><|start_header_id|>system<|en...
2,2,A Light in the Attic,Poetry,It's hard to imagine a world without A Light i...,<|begin_of_text|><|start_header_id|>system<|en...
3,3,orange: The Complete Collection 1 (orange: The...,Sequential Art,A Plea From the FutureOn the day that Naho beg...,<|begin_of_text|><|start_header_id|>system<|en...
4,4,Legend (Legend #1),Young Adult,There is an alternate cover edition for this I...,<|begin_of_text|><|start_header_id|>system<|en...
...,...,...,...,...,...
87,87,Zealot: The Life and Times of Jesus of Nazareth,History,From the internationally bestselling author of...,<|begin_of_text|><|start_header_id|>system<|en...
88,88,The Stranger,Philosophy,This is an alternate cover edition for ISBN 06...,<|begin_of_text|><|start_header_id|>system<|en...
89,89,The Cuckoo's Calling (Cormoran Strike #1),Mystery,A BRILLIANT DEBUT MYSTERY IN A CLASSIC VEIN: D...,<|begin_of_text|><|start_header_id|>system<|en...
90,90,Old Records Never Die: One Man's Quest for His...,Music,"Foreword by Wilco's Jeff Tweedy""Memories are f...",<|begin_of_text|><|start_header_id|>system<|en...


In [21]:
df_test

Unnamed: 0,index,Title,Category,Book_Description,prompt
92,92,The Secret Garden,Classics,When orphaned Mary Lennox comes to live at her...,<|begin_of_text|><|start_header_id|>system<|en...
93,93,The Most Perfect Thing: Inside (and Outside) a...,Science,Renowned ornithologist Tim Birkhead opens this...,<|begin_of_text|><|start_header_id|>system<|en...
94,94,Psycho: Sanitarium (Psycho #1.5),Horror,The original Psycho novel by Robert Bloch was ...,<|begin_of_text|><|start_header_id|>system<|en...
95,95,Tuesday Nights in 1980,Fiction,"âIn one sentence, Ms. Prentiss captures a se...",<|begin_of_text|><|start_header_id|>system<|en...
96,96,Sapiens: A Brief History of Humankind,History,From a renowned historian comes a groundbreaki...,<|begin_of_text|><|start_header_id|>system<|en...
97,97,The Invention of Wings,Historical Fiction,Writing at the height of her narrative and ima...,<|begin_of_text|><|start_header_id|>system<|en...
98,98,"At The Existentialist CafÃ©: Freedom, Being, a...",Philosophy,"Paris, near the turn of 1933. Three young frie...",<|begin_of_text|><|start_header_id|>system<|en...
99,99,I Had a Nice Time And Other Lies...: How to fi...,Womens Fiction,The New York Times bestselling authors of Nice...,<|begin_of_text|><|start_header_id|>system<|en...
100,100,"Surely You're Joking, Mr. Feynman!: Adventures...",Science,"Richard Feynman (1918-1988), winner of the Nob...",<|begin_of_text|><|start_header_id|>system<|en...
101,101,Gone with the Wind,Classics,Margaret Mitchell's epic novel of love and war...,<|begin_of_text|><|start_header_id|>system<|en...


# DataLoader

In [22]:
from datasets import Dataset #defined by huggingface not pandas or tf
batch_size = 2
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle= False)

# Training

In [23]:
def generate_input_output_pair(batch_prompts, batch_targets):
  """ Pass data batch prompt ie: df["prompt"]. along with the  target value ie: df["Category"]
  Now this is passed through tokenizer and padding processes
  """
  full_response_text = [
      (b_prompt + " " + target + tokenizer.eos_token)
      for b_prompt, target in zip(batch_prompts, batch_targets)
      ]
  input_ids_tokenized = tokenizer(full_response_text, add_special_tokens = False, return_tensors ="pt", padding =True)["input_ids"]
  label_tokenized = tokenizer([" " + target + tokenizer.eos_token for target in batch_targets], add_special_tokens = False,
                              return_tensors ="pt", padding = "max_length", max_length = input_ids_tokenized.shape[1])["input_ids"]

  label_tokenized_fixed = torch.where(label_tokenized != tokenizer.pad_token_id, label_tokenized, -100)
  label_tokenized_fixed[:, -1] = tokenizer.eos_token_id

  input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
  label_tokenized_right_shifted = label_tokenized_fixed[:, 1:]

  attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id

  return {
      "input_ids" : input_ids_tokenized_left_shifted,
      "attention_mask" : attention_mask,
      "labels" : label_tokenized_right_shifted
  }

In [24]:
import torch.nn as nn
from torch.optim import AdamW

def calculate_loss(logits, labels):
  loss_fn = nn.CrossEntropyLoss()
  entropyloss = loss_fn(logits.view(-1, logits.size(-1)), labels.reshape(-1)) # based on the tensor input, tensors are reshape to match broadcasting issues
  return entropyloss

# **LORA ADAPTOR**

In [25]:
import torch
torch.cuda.max_memory_allocated() / 1e9

2.4726784

In [26]:
# LORA ADAPTOR
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 4,
    lora_alpha = 16,
    lora_dropout = 0.05,
    target_modules = ['q_proj', 'v_proj']
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
model.gradient_checkpointing_enable()

trainable params: 425,984 || all params: 1,236,240,384 || trainable%: 0.0345


In [27]:
## DONE : in order to maintain forward tensor and recomputation tensor numbers

# Check if gradient checkpointing is enabled
print(f"Gradient checkpointing enabled: {model.is_gradient_checkpointing}")

# Or check the base model
if hasattr(model.base_model, 'gradient_checkpointing'):
    print(f"Base model checkpointing: {model.base_model.gradient_checkpointing}")

Gradient checkpointing enabled: True
Base model checkpointing: True


# **DEFINE TRAINING LOOP**

In [28]:
epochs = 4
optimizer = AdamW(model.parameters(), lr = 1e-3, weight_decay= 0.01)

for epoch in range(epochs):
  running_loss = 0.0
  count = 0

  for batch in train_dataloader :
    data = generate_input_output_pair(batch_prompts= batch['prompt'], batch_targets= batch['Category'])
    out = model(input_ids = data["input_ids"].to(device))
    loss = calculate_loss(out.logits, data["labels"].to(device))
    count += 1
    print(f"count/epoch: {count}/{epoch}")

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    running_loss += loss.item()

  avg_loss = running_loss / len(train_dataloader)
  print(f"avg_loss: {avg_loss}, running_loss : {running_loss}")

count/epoch: 1/0
count/epoch: 2/0
count/epoch: 3/0
count/epoch: 4/0
count/epoch: 5/0
count/epoch: 6/0
count/epoch: 7/0
count/epoch: 8/0
count/epoch: 9/0
count/epoch: 10/0
count/epoch: 11/0
count/epoch: 12/0
count/epoch: 13/0
count/epoch: 14/0
count/epoch: 15/0
count/epoch: 16/0
count/epoch: 17/0
count/epoch: 18/0
count/epoch: 19/0
count/epoch: 20/0
count/epoch: 21/0
count/epoch: 22/0
count/epoch: 23/0
count/epoch: 24/0
count/epoch: 25/0
count/epoch: 26/0
count/epoch: 27/0
count/epoch: 28/0
count/epoch: 29/0
count/epoch: 30/0
count/epoch: 31/0
count/epoch: 32/0
count/epoch: 33/0
count/epoch: 34/0
count/epoch: 35/0
count/epoch: 36/0
count/epoch: 37/0
count/epoch: 38/0
count/epoch: 39/0
count/epoch: 40/0
count/epoch: 41/0
count/epoch: 42/0
count/epoch: 43/0
count/epoch: 44/0
count/epoch: 45/0
count/epoch: 46/0
avg_loss: 1.2715056046195652, running_loss : 58.4892578125
count/epoch: 1/1
count/epoch: 2/1
count/epoch: 3/1
count/epoch: 4/1
count/epoch: 5/1
count/epoch: 6/1
count/epoch: 7/1
cou

# Testing proportion

In [29]:
def extract_labels_from_output(decode_batch, tokenizer):
  labels = []
  for d in decode_batch:
    # print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    # print(d)
    label = d.split("Labeled Category :")[1].strip()
    # print(label)
    labels.append(label)
  # labels = [d.split("<|start_header_id|>assistant<|end_header_id|>\n\nLabeled Category:")[0].split("<|eot_id|>")[0].strip() for d in decode_batch]
  # print(labels)
  # labels  = [d.split("<|start_header_id|>assistant<|end_header_id|>\n\nLabeled Category:")[1].split("<|eot_id|>")[0].strip() for d in decode_batch]
  return labels

In [30]:
def generate_outputs(prompts, model, tokenizer):
  tokenizer.pad_token = tokenizer.eos_token
  tokenized = tokenizer(prompts, padding= True, return_tensors ="pt", add_special_tokens= False).to(device)

  # tokenized = tokenizer.apply_chat_template(prompts, padding= False, return_tensors ="pt", add_special_tokens= False).to(device)
  output_batch = peft_model.generate(input_ids = tokenized["input_ids"], attention_mask =tokenized["attention_mask"], max_new_tokens = 20, do_sample= False, temperature = 0, top_p =1)
  # output_batch = model.generate(tokenized, max_new_tokens = 20)
  decode_batch = tokenizer.batch_decode(output_batch, skip_special_tokens= True)
  prediction = extract_labels_from_output(decode_batch, tokenizer)
  return prediction

In [38]:
def test_model(dataloader, model, tokenizer):
  comparison_df= {
      "predictions": [],
      "labels": []
  }

  for batch in dataloader:
    predictions = generate_outputs(prompts= batch["prompt"], model = model, tokenizer= tokenizer) # prompts = batch["prompt"]
    comparison_df["labels"].extend(batch["Category"])
    comparison_df["predictions"].extend(predictions)

  comparison_df = pd.DataFrame(comparison_df)
  accuracy = (comparison_df["labels"] == comparison_df["predictions"]).mean()
  num_invalid_pred = (~comparison_df["predictions"].isin(valid_category)).mean()
  print(comparison_df.head(10))
  return {"accuracy ": accuracy,"invalid_predictions": num_invalid_pred}
  # return comparison_df

In [42]:
# # from datasets import Dataset #defined by huggingface not pandas or tf
test_Dataset = Dataset.from_pandas(df_train[10:25]) # testing for trained data
test_dataloader = DataLoader(test_Dataset, shuffle= False)
metrics = test_model(test_dataloader, model, tokenizer)
print("\n".join([f"{k} = {v}" for k, v in metrics.items()]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

       predictions           labels
0            Humor   Sequential Art
1          Mystery         Thriller
2         Classics         Classics
3           Horror           Horror
4  Science Fiction  Science Fiction
5   Sequential Art   Sequential Art
6      Young Adult      Young Adult
7         Business         Business
8            Humor            Humor
9   Food and Drink   Food and Drink
accuracy  = 0.8
invalid_predictions = 0.0
