<a href="https://colab.research.google.com/github/ankritRisal/Finetuning_LLM/blob/main/Train_finetunned_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Play with data



In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read from
# a path under /content/drive/MyDrive later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

from torch.utils.data import Dataset, DataLoader

from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face access token from Colab's secret store under the key
# 'HF_TOKEN', so the token never appears in the notebook itself.
hf_token = userdata.get('HF_TOKEN')

# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed because the meta-llama checkpoint loaded below
# is access-restricted - confirm.
login(token=hf_token)


In [None]:
# Checkpoint to fine-tune. The larger "meta-llama/Llama-3.2-3B-Instruct" was the
# alternative considered here.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pad on the left and reuse the EOS token as the padding token so that prompts of
# different lengths can be stacked into one tensor.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 directly onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map=device,
)

# Visualization of prediction

In [None]:
# WHAT SHOULD EACH BATCH PROMPT INCLUDE? => CHAT_TEMPLATE + " " + CATEGORY_TEMPLATE
# CATEGORY TEMPLATE => TITLE AND DESCRIPTION, WITH A VALID CATEGORY AS THE ANSWER (BUILT PER ROW, IN A LOOP)
#   SYSTEM PROMPT , PORT PROMPT > CATEGORY TEMPLATE

In [None]:
# Read the book catalogue from Drive and discard, in the same expression, the
# columns listed below (index leftovers, price, availability, star rating).
df = (
    pd.read_csv("/content/drive/MyDrive/X3s4c5/FinetunningLLMmodels/book_description.csv")
    .drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Price", "Avilability", "Stars"])
)

In [None]:
# Show the frame as it stands after the column drop.
df

In [None]:
# List the distinct values of the "Category" column.
df["Category"].unique()

In [None]:
# Number of rows per "Category" value.
df["Category"].value_counts()

In [None]:
# Discard the rows whose "Category" is one of the two values "Add a comment" or
# "Default"; every other row is kept.
df = df.loc[
    ~df["Category"].isin(["Add a comment", "Default"])
]

In [None]:
# Sanity check: print any rows still labelled "Add a comment". Once those rows
# have been filtered out of `df`, this prints an empty frame.
filtered = df[df["Category"] == "Add a comment"]
print(filtered)

In [None]:
# Keep only the classes that occur at least 10 times, so that the fixed-size
# per-class sample drawn afterwards always has enough rows to choose from.
label_col = "Category"
counts = df[label_col].value_counts()
valid_classes = counts.loc[counts.ge(10)].index
df_filtered = df.loc[df[label_col].isin(valid_classes)]

In [None]:
# Draw 5 rows from every class, reproducibly (each class is sampled with the same
# seed, and classes are visited in sorted label order).
#
# The groups are sampled one by one and concatenated instead of going through
# `groupby(...).apply(...)`: `apply` operating on the grouping column is
# deprecated in recent pandas releases and the column is excluded in later ones,
# which would drop the label column ("Category") from the sampled frame and
# break every later cell that reads it.
df_sampled = pd.concat(
    [
        class_rows.sample(n=5, random_state=42)
        for _, class_rows in df_filtered.groupby(label_col)
    ]
).reset_index(drop=True)



In [None]:
# From here on `df` refers to the balanced sample. This is an alias, not a copy:
# columns later assigned on `df` also appear on `df_sampled`.
df = df_sampled
df.head(10)

In [None]:
# Recompute the per-class row counts on the sampled frame (overwrites the earlier `counts`).
counts = df[label_col].value_counts()
counts

In [None]:
# find out irregular terms in book_description
filtered = df[df["Book_Description"] == "\n\n\n\n\n\n"]
print(filtered)

In [None]:
# Rows whose description is shorter than 20 characters. Missing descriptions get
# length 0 via fillna, so they are included. The rows are only displayed, not removed.
short_rows = df[df["Book_Description"].str.len().fillna(0) < 20]
short_rows

In [None]:
# Build new df column to store prompt and tokenize using chat_template
def build_prompt(row, valid_category=None):
    """Render one dataset row as a chat-formatted classification prompt.

    The prompt has three messages: a system message listing the allowed
    categories, a user message carrying the title and description, and an
    assistant message containing only the prefix "Labeled Category :". The chat
    template is applied with ``continue_final_message=True`` so the assistant
    turn is left open for the label to be appended or generated.

    Args:
        row: A mapping-like row (e.g. a pandas Series) with "Title" and
            "Book_Description" entries.
        valid_category: Optional iterable of allowed category names. When
            omitted, the unique values of the module-level ``df["Category"]``
            are used, so ``df.apply(build_prompt, axis=1)`` keeps working.

    Returns:
        str: The prompt text produced by the module-level ``tokenizer``'s chat
        template. It is text, not token ids, because ``tokenize=False``.
    """
    if valid_category is None:
        valid_category = list(df["Category"].unique())

    # Built outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    category_options = "\n or ".join(["Labeled Category:" + x for x in list(valid_category)])

    SYSTEM_PROMPT = {
        "role" : "system",
        "content" : f""" You are an AI system that reads an Title and Book Description and classifies category of the book applied, you must
        choose from the following classes:
        {category_options}.
        Ensure Output is from above list only"""
    }

    USER_MESSAGES = {
        "role": "user",
        "content": f"""Title: {row['Title']}, Description: {row['Book_Description']}"""
    }

    # Assistant prefix that the model is asked to continue with the label.
    ASSISTANT_MESSAGES = {
        "role" : "assistant",
        "content" : "Labeled Category :"
    }

    prompt = [
        SYSTEM_PROMPT,
        USER_MESSAGES,
        ASSISTANT_MESSAGES
    ]
    return tokenizer.apply_chat_template(prompt, continue_final_message= True, tokenize= False)

# Data Separation into Train and Test

In [None]:
# Seed for the shuffle and the fraction of rows that go to the training split.
random_seed = 32
train_size = 0.80

# Render the chat prompt for every row before shuffling.
df["prompt"] = df.apply(build_prompt, axis=1)

# Shuffle all rows reproducibly, renumber them from 0, then expose that new row
# number as an explicit "index" column.
df = df.sample(frac=1, random_state = random_seed)
df = df.reset_index(drop=True).reset_index()

# Head/tail split: the first `train_len` shuffled rows train, the rest test.
train_len = int(train_size * len(df))
df_train = df.iloc[:train_len]
df_test = df.iloc[train_len:]

In [None]:
# Inspect the training split.
df_train

In [None]:
# Inspect the held-out split.
df_test

# DataLoader

In [None]:
# `Dataset` here is the Hugging Face `datasets` class. Importing it rebinds the
# name, shadowing the `torch.utils.data.Dataset` imported earlier in the notebook.
from datasets import Dataset

batch_size = 2

# Wrap each pandas split as a Hugging Face dataset.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Training batches are reshuffled on every pass; test batches keep their order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Training

In [None]:
def generate_input_output_pair(batch_prompts, batch_targets):
  """Turn a batch of prompts and target labels into next-token training tensors.

  Each training text is ``prompt + " " + target + eos``. The loss is restricted
  to the target part: labels are -100 (ignored by the loss) everywhere except on
  the tokens of ``" " + target + eos``.

  Args:
    batch_prompts: Iterable of prompt strings (the "prompt" column).
    batch_targets: Iterable of category strings (the "Category" column), in the
      same order as ``batch_prompts``.

  Returns:
    dict with three tensors that share one shape (batch, padded_length - 1):
      "input_ids": the full sequences without their last token,
      "attention_mask": 1 on real tokens and 0 on padding, as reported by the
        tokenizer,
      "labels": the sequences shifted left by one, with non-target positions
        set to -100.

  Raises:
    ValueError: if the module-level tokenizer does not pad on the left. The
      target-only encoding is aligned with the end of the full encoding only
      when padding goes on the left.
  """
  if tokenizer.padding_side != "left":
    raise ValueError("generate_input_output_pair requires tokenizer.padding_side == 'left'")

  eos = tokenizer.eos_token
  full_response_text = [
      b_prompt + " " + target + eos
      for b_prompt, target in zip(batch_prompts, batch_targets)
  ]
  target_text = [" " + target + eos for target in batch_targets]

  full_encoding = tokenizer(full_response_text, add_special_tokens = False,
                            return_tensors ="pt", padding = True)
  input_ids_tokenized = full_encoding["input_ids"]

  # Pad the target-only encoding to the same width so that, with left padding,
  # its tokens sit in the same columns as the tail of the full sequence.
  # NOTE(review): assumes " " + target tokenizes identically on its own and after
  # the prompt - confirm for the tokenizer in use.
  target_encoding = tokenizer(target_text, add_special_tokens = False, return_tensors ="pt",
                              padding = "max_length", max_length = input_ids_tokenized.shape[1])

  # Use the tokenizer's own masks rather than comparing ids with pad_token_id.
  # The pad token is the EOS token, so an id comparison cannot tell padding from
  # a genuine EOS: it would drop the final EOS from the labels and hide any EOS
  # token that is part of the chat-formatted prompt from attention.
  label_tokenized_fixed = target_encoding["input_ids"].masked_fill(
      target_encoding["attention_mask"] == 0, -100)

  # Standard causal-LM shift: position t of the inputs predicts token t + 1.
  return {
      "input_ids" : input_ids_tokenized[:, :-1],
      "attention_mask" : full_encoding["attention_mask"][:, :-1],
      "labels" : label_tokenized_fixed[:, 1:]
  }

In [None]:
import torch.nn as nn
from torch.optim import AdamW

def calculate_loss(logits, labels):
  """Mean cross-entropy between per-position logits and target token ids.

  Args:
    logits: Tensor of shape (..., vocab_size); any floating dtype, contiguous
      or not.
    labels: Integer tensor with the same leading shape as ``logits``. Entries
      equal to -100 are ignored (the CrossEntropyLoss default ignore index).

  Returns:
    A scalar float32 tensor: the loss averaged over the non-ignored positions.
  """
  loss_fn = nn.CrossEntropyLoss()
  # reshape (not view) so non-contiguous logits are accepted; upcast so the
  # softmax and log of half-precision (e.g. bfloat16) logits run in float32.
  flat_logits = logits.reshape(-1, logits.size(-1)).float()
  flat_labels = labels.reshape(-1)
  return loss_fn(flat_logits, flat_labels)

# **LORA ADAPTER**

In [None]:
import torch
# Peak GPU memory allocated by tensors so far in this process, in units of 1e9 bytes.
torch.cuda.max_memory_allocated() / 1e9

In [None]:
# LoRA adapter: rank-4 low-rank updates on the attention query and value
# projections, scaled by lora_alpha, with 5% dropout on the adapter input.
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 4,
    lora_alpha = 16,
    lora_dropout = 0.05,
    target_modules = ['q_proj', 'v_proj']
)

# Enable gradient checkpointing before wrapping the model, and make the
# embedding output require grad. Once the base weights are frozen, the
# checkpointed blocks would otherwise receive inputs that do not require grad,
# and no gradient would flow back to the adapter weights.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# get_peft_model injects the adapter layers into `model` in place and freezes
# the base weights; `peft_model` is the wrapper around that same module.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

In [None]:
## DONE : in order to maintain forward tensor and recomputation tensor numbers

# Report the gradient-checkpointing flag exposed by the top-level model object.
print(f"Gradient checkpointing enabled: {model.is_gradient_checkpointing}")

# The wrapped base model may carry its own `gradient_checkpointing` attribute;
# print it only when the attribute exists.
if hasattr(model.base_model, 'gradient_checkpointing'):
    print(f"Base model checkpointing: {model.base_model.gradient_checkpointing}")

# **DEFINE TRAINING LOOP**

In [None]:
epochs = 4

# Give the optimizer only the parameters that are actually trainable (the LoRA
# weights once the adapter is attached). Frozen parameters never receive a
# gradient, so passing them only adds bookkeeping.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr = 1e-3, weight_decay= 0.01)

# from_pretrained returns the model in eval mode. Switch to train mode so that
# LoRA dropout and gradient checkpointing are active during fine-tuning.
model.train()
# With checkpointing active and the base weights frozen, the embedding output
# must require grad for gradients to reach the adapter weights.
model.enable_input_require_grads()

for epoch in range(epochs):
  running_loss = 0.0
  count = 0

  for batch in train_dataloader :
    data = generate_input_output_pair(batch_prompts= batch['prompt'], batch_targets= batch['Category'])
    # Pass the attention mask so padded positions are not attended to.
    out = model(input_ids = data["input_ids"].to(device),
                attention_mask = data["attention_mask"].to(device))
    loss = calculate_loss(out.logits, data["labels"].to(device))
    count += 1
    print(f"count/epoch: {count}/{epoch}")

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    running_loss += loss.item()

  # Mean loss per batch over this epoch.
  avg_loss = running_loss / len(train_dataloader)
  print(f"avg_loss: {avg_loss}, running_loss : {running_loss}")

# Leave the model in inference mode for the evaluation cells that follow.
model.eval()

# Testing proportion

In [None]:
def extract_labels_from_output(decode_batch, tokenizer):
  """Pull the predicted label out of each decoded generation.

  The label is the text between the first and the second occurrence of the
  marker "Labeled Category :" (or up to the end of the string when the marker
  occurs only once), with surrounding whitespace removed.

  Args:
    decode_batch: Iterable of decoded strings (prompt plus generated text).
    tokenizer: Unused; kept so existing call sites keep working.

  Returns:
    list[str]: One label per input string. A string that does not contain the
    marker yields "" instead of raising, so a single malformed generation is
    scored as a wrong prediction rather than aborting the evaluation.
  """
  marker = "Labeled Category :"
  labels = []
  for decoded in decode_batch:
    pieces = decoded.split(marker)
    labels.append(pieces[1].strip() if len(pieces) > 1 else "")
  return labels

In [None]:
def generate_outputs(prompts, model, tokenizer):
  """Greedily generate a label for each prompt and return the parsed labels.

  Args:
    prompts: List of prompt strings that already carry the chat formatting
      (no special tokens are added here).
    model: The model used for generation. The original body ignored this
      argument and always used the global ``peft_model``.
    tokenizer: Tokenizer matching ``model``. Its pad token is set to its EOS
      token as a side effect.

  Returns:
    list[str]: One extracted label per prompt (see extract_labels_from_output).
  """
  tokenizer.pad_token = tokenizer.eos_token
  tokenized = tokenizer(prompts, padding= True, return_tensors ="pt", add_special_tokens= False).to(device)

  # Inference mode: no dropout, whatever mode the caller left the model in.
  model.eval()
  # do_sample=False is greedy decoding; temperature/top_p only apply when
  # sampling, so they are not passed.
  output_batch = model.generate(
      input_ids = tokenized["input_ids"],
      attention_mask = tokenized["attention_mask"],
      max_new_tokens = 20,
      do_sample = False,
      pad_token_id = tokenizer.pad_token_id,
  )
  # The decoded text contains the prompt too; the label is cut out afterwards.
  decode_batch = tokenizer.batch_decode(output_batch, skip_special_tokens= True)
  return extract_labels_from_output(decode_batch, tokenizer)

In [None]:
def test_model(dataloader, model, tokenizer, valid_category=None):
  """Run generation over a dataloader and score the predictions.

  Args:
    dataloader: Yields batches with a "prompt" list and a "Category" list.
    model: Model passed through to generate_outputs.
    tokenizer: Tokenizer passed through to generate_outputs.
    valid_category: Optional collection of allowed labels used for the
      invalid-prediction rate. Defaults to the unique values of the
      module-level ``df["Category"]``. The original body read an undefined
      global ``valid_category`` and failed with NameError.

  Returns:
    dict with "accuracy " (fraction of exact label matches; the key keeps its
    trailing space for compatibility) and "invalid_predictions" (fraction of
    predictions that are not in ``valid_category``).
  """
  if valid_category is None:
    valid_category = df["Category"].unique()

  records = {
      "predictions": [],
      "labels": []
  }

  for batch in dataloader:
    predictions = generate_outputs(prompts= batch["prompt"], model = model, tokenizer= tokenizer)
    records["labels"].extend(batch["Category"])
    records["predictions"].extend(predictions)

  comparison_df = pd.DataFrame(records)
  accuracy = (comparison_df["labels"] == comparison_df["predictions"]).mean()
  num_invalid_pred = (~comparison_df["predictions"].isin(valid_category)).mean()
  print(comparison_df.head(10))
  return {"accuracy ": accuracy,"invalid_predictions": num_invalid_pred}

In [None]:
# Evaluate on rows 10..24 of the *training* split, i.e. data the model has
# already seen, so this measures fit rather than generalisation.
# NOTE: rebinding `test_dataloader` here replaces the held-out loader built in
# the DataLoader section.
test_Dataset = Dataset.from_pandas(df_train[10:25])
test_dataloader = DataLoader(test_Dataset, shuffle=False)
metrics = test_model(test_dataloader, model, tokenizer)
print(*[f"{name} = {value}" for name, value in metrics.items()], sep="\n")