<a href="https://colab.research.google.com/github/aaron-v19/mlprojects/blob/main/Fine_Tuning_LLM_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyarrow==15.0.2



In [2]:
!pip install -q accelerate -U
!pip install -q bitsandbytes -U

!pip install -q peft -U
!pip install -q transformers -U
!pip install -q datasets -U
!pip install -q gradio -U

In [3]:
!pip install numpy==1.25.2
!pip install trl==0.12.0



In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
import torch
from trl import SFTTrainer

In [5]:
dataset=load_dataset("HuggingFaceH4/ultrachat_200k",trust_remote_code=True,split="train_sft")
dataset=dataset.shuffle().select(range(10000))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
dataset

Dataset({
    features: ['prompt', 'prompt_id', 'messages'],
    num_rows: 10000
})

In [7]:
from transformers import AutoTokenizer

template_tokenizer=AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

template_tokenizer

LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [8]:
def format_prompt(example):
    """ Format the prompt using the <|user|> and <|assistant|> format """

    chat=example["messages"]
    prompt=template_tokenizer.apply_chat_template(chat, tokenize=False)

    return {'text':prompt}

dataset=dataset.map(format_prompt)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Testing Base LLAMA Model

In [9]:
from transformers import pipeline

model_id="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

pipe=pipeline("text-generation",model=model_id,device_map="auto")

#prompt
#<|user|>,<|assistant|>

prompt ="""<|user|>
Tell me about Large Language Models. </s>
<|assistant|>
"""

output=pipe(prompt)

print(output[0]["generated_text"])

Device set to use cuda:0


<|user|>
Tell me about Large Language Models. </s>
<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|assistant|>

<|as

 # Model Configuration for training

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments


In [11]:
# do the 4bit quantization in Q-Lora

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True
)

In [12]:
tokenizer=AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)

tokenizer.pad_token="<PAD>"
tokenizer.padding_side="left"

In [13]:
model=AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

In [14]:
model.config.use_cache=False
model.config.pretraining_tp=1

# Prepare LoRA Config for PEFT Fine Tuning

In [15]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), e

In [16]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config=LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)


model=prepare_model_for_kbit_training(model)

model=get_peft_model(model,peft_config)

In [17]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

# Model Fine Tuning

In [19]:
from transformers import TrainingArguments,Trainer
from trl import SFTTrainer

output_dir="./train_dir"

args=TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        num_train_epochs=1,
        logging_steps=10,
        fp16=True,
        gradient_checkpointing=True
    )

tokenizer.chat_template = template_tokenizer.chat_template
trainer=SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=args
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maaronjosephvazhuthanapillil[0m ([33maaronjosephvazhuthanapillil-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.594
20,1.544
30,1.4678
40,1.434
50,1.3618
60,1.4121
70,1.4541
80,1.3559
90,1.4959
100,1.3958


TrainOutput(global_step=1250, training_loss=1.357313150024414, metrics={'train_runtime': 4618.9677, 'train_samples_per_second': 2.165, 'train_steps_per_second': 0.271, 'total_flos': 3.331891275927552e+16, 'train_loss': 1.357313150024414, 'epoch': 1.0})

In [20]:
trainer.save_model("TinyLlama-1.1B-qlora")

# Load Pre_trained PEFT Model for Prediction

In [21]:
from peft import AutoPeftModelForCausalLM

model=AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    trust_remote_code=True,
    device_map="auto"
)

merged_model=model.merge_and_unload()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [22]:
merged_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [23]:
import gradio as gr

In [29]:
from transformers import pipeline,AutoTokenizer

model_id="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"


tokenizer=AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)
tokenizer.pad_token="<PAD>"
tokenizer.padding_side="left"



In [27]:
def predict(prompt):
    prompt = f"{prompt}"
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
    output = pipe(prompt)
    return output[0]["generated_text"]

# Create the Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="TinyLlama Chatbot",
    description="Ask me anything!",
)



In [28]:
# Launch the interface
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d443eae423b3f6665b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [30]:
!zip -r /content/tiny_llama_qlora_adapter.zip /content/TinyLlama-1.1B-qlora

  adding: content/TinyLlama-1.1B-qlora/ (stored 0%)
  adding: content/TinyLlama-1.1B-qlora/training_args.bin (deflated 51%)
  adding: content/TinyLlama-1.1B-qlora/tokenizer.json (deflated 85%)
  adding: content/TinyLlama-1.1B-qlora/README.md (deflated 66%)
  adding: content/TinyLlama-1.1B-qlora/adapter_config.json (deflated 56%)
  adding: content/TinyLlama-1.1B-qlora/tokenizer.model (deflated 55%)
  adding: content/TinyLlama-1.1B-qlora/adapter_model.safetensors (deflated 8%)
  adding: content/TinyLlama-1.1B-qlora/special_tokens_map.json (deflated 72%)
  adding: content/TinyLlama-1.1B-qlora/tokenizer_config.json (deflated 67%)
