<a href="https://colab.research.google.com/github/ankesh86/LLMProjects/blob/main/FinetuningLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m122.9/244.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# **Configurable parts**
* Base model to fine-tune
* Dataset to fine-tune with
* Name of the new fine-tuned model

In [4]:
# Hub identifier of the base model to fine-tune.
model_name = "NousResearch/llama-2-7b-chat-hf"

# Hub identifier of the dataset used for fine-tuning.
dataset_name = "mlabonne/guanaco-llama2-1k"

# Name under which the fine-tuned model is saved.
new_model = "llama-2-7b-trial-ank"


In [5]:
# Load only the "train" split of the configured fine-tuning dataset.
dataset = load_dataset(path=dataset_name, split="train")

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

## **4-bit quantization**

In [7]:
# Load the base model with 4-bit NF4 quantization.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Keyword was previously misspelled `bnd_4bit_compute_dtype`, so the
    # float16 compute dtype chosen above never reached the config.
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},  # map the whole model to device index 0
)

# Turn off the config's cache flag for the training run.
model.config.use_cache = False
# NOTE(review): presumably selects the non-tensor-parallel code path in the
# Llama implementation — confirm against the transformers version in use.
model.config.pretraining_tp = 1



config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [8]:
# LoRA adapter settings; handed to SFTTrainer via its `peft_config` argument.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [10]:
# Hyperparameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    # Where checkpoints and logs are written, and which logger receives them.
    output_dir="./results",
    report_to="tensorboard",
    save_steps=25,
    logging_steps=25,
    # Run length: one epoch; max_steps=-1 leaves the step count uncapped
    # (the logged run reaches step 250 = 1000 examples / batch size 4).
    num_train_epochs=1,
    max_steps=-1,
    # Batching.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    # NOTE(review): warmup_ratio may have no effect with a plain "constant"
    # schedule — confirm, or switch scheduler if warmup is intended.
    warmup_ratio=0.03,
    # Mixed-precision training is disabled for both half-precision formats.
    fp16=False,
    bf16=False,
)

In [11]:
# Tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token
# (presumably because the tokenizer ships without one — confirm).
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [12]:
# Build the supervised fine-tuning trainer from the pieces configured above.
trainer = SFTTrainer(
    # Model, tokenizer, and LoRA adapter settings.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # Training data: each example's text is read from the "text" column.
    train_dataset=dataset,
    dataset_text_field="text",
    # No explicit sequence-length limit and no packing of examples.
    max_seq_length=None,
    packing=False,
    # Training hyperparameters.
    args=training_arguments,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

# Save the trained model under the name set in `new_model`.
trainer.model.save_pretrained(new_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.3467
50,1.6166
75,1.2103
100,1.4316
125,1.1769
150,1.3556
175,1.1705
200,1.4525
225,1.1533
250,1.521




In [14]:
# Sanity-check the in-memory model right after training with a general question.
prompt = "What is the tallest mountain in the world?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=800,
)
# Wrap the question in the [INST] ... [/INST] instruction markers.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]["generated_text"])

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


<s>[INST] What is the tallest mountain in the world? [/INST] The tallest mountain in the world is Mount Everest, which is located in the Himalayas between Nepal and Tibet. It stands at an elevation of 8,848 meters (29,029 feet) above sea level. Mount Everest is also known as Chomolungma or Sagarmatha, and it is considered one of the most challenging and dangerous mountains to climb due to its extreme altitude and harsh weather conditions. Despite these challenges, many climbers attempt to reach the summit of Mount Everest every year, with over 4,000 successful ascents to date.


In [16]:
# Second check on the in-memory model: a multiple-choice physiology question.
prompt = "Ques: In fasting state, secretion of glucagon aims to produce glucose mainly by?Options : ['Glycolysis & gluconeogenesis', 'Gluconeogenesis & glycogenolysis', 'Glycogenesis & gluconeogenesis', 'Only gluconeogenesis']"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=800,
)
# Wrap the question in the [INST] ... [/INST] instruction markers.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]["generated_text"])

<s>[INST] Ques: In fasting state, secretion of glucagon aims to produce glucose mainly by?Options : ['Glycolysis & gluconeogenesis', 'Gluconeogenesis & glycogenolysis', 'Glycogenesis & gluconeogenesis', 'Only gluconeogenesis'] [/INST] Gluconeogenesis is the process by which glucose is produced from non-carbohydrate sources, such as amino acids, lactate, and glycerol. In a fasting state, the secretion of glucagon aims to produce glucose mainly by glycogenolysis, which is the breakdown of glycogen to glucose. Therefore, the correct answer is: Gluconeogenesis & glycogenolysis.
