<a href="https://colab.research.google.com/github/abhishekpatel10/Llama_Finetune/blob/main/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Uninstall existing conflicting versions
!pip uninstall -y bitsandbytes accelerate transformers peft trl

# 2. Install compatible, updated versions
!pip install -U bitsandbytes
!pip install -U transformers accelerate peft trl datasets huggingface_hub

Found existing installation: bitsandbytes 0.49.1
Uninstalling bitsandbytes-0.49.1:
  Successfully uninstalled bitsandbytes-0.49.1
Found existing installation: accelerate 1.12.0
Uninstalling accelerate-1.12.0:
  Successfully uninstalled accelerate-1.12.0
Found existing installation: transformers 5.0.0
Uninstalling transformers-5.0.0:
  Successfully uninstalled transformers-5.0.0
[0mCollecting bitsandbytes
  Using cached bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Using cached bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1
Collecting transformers
  Using cached transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting accelerate
  Using cached accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Using cached peft-0.18.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Using cached trl-0.27.2-py3-none-any.whl.metadata (11 kB)
Using 

In [2]:
import os
os.environ['CUDA_HOME'] = '/usr/local/cuda'
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

dataset_name = "mlabonne/guanaco-llama2-1k"

new_model = "Llama-2-7b-chat-finetune"

################################
# QLORA Parameters
################################

lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

#bitsandbytes parameters

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

output_dir = "./results"

num_train_epochs = 1
fp16=False
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim="paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

#SFT parameters

max_seq_length = None
packing = False
device_map = {"": 0}

In [4]:
import gc
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

dataset = load_dataset(dataset_name, split='train')

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Clear CUDA cache and perform garbage collection before loading the model
torch.cuda.empty_cache()
gc.collect()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)


training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
25,1.617563
50,1.992517
75,1.379074
100,2.057465
125,1.453828
150,1.867601
175,1.398904
200,1.77173
225,1.352613
250,1.655301


TrainOutput(global_step=1000, training_loss=1.6052734718322754, metrics={'train_runtime': 426.4257, 'train_samples_per_second': 2.345, 'train_steps_per_second': 2.345, 'total_flos': 2644969139232768.0, 'train_loss': 1.6052734718322754})

In [5]:
trainer.model.save_pretrained(new_model)

In [6]:
logging.set_verbosity(logging.CRITICAL)

prompt = "what is a large language model?"
pipe = pipeline(task="text-generation",model=model,tokenizer=tokenizer,max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])


<s>[INST] what is a large language model? [/INST] A large language model (LLM) is an artificial intelligence (AI) model that can produce human-like text, such as natural language processing (NLP) models. It is usually trained on a large corpus of text data, which can be in the form of text files, web pages, or other types of data.

LLMs can be used for a variety of tasks, such as chatbots, language translation, and sentiment analysis. In NLP, they are often used to generate human-like responses to text prompts or question, such as a chatbot. 


In [7]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [10]:
from huggingface_hub import notebook_login

# This will create a prompt for your Hugging Face Token
# You can get a 'Write' token at: https://huggingface.co/settings/tokens
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
hf_username = "abhishekpatel10"
model_id = f"{hf_username}/Llama-2-7b-chat-finetune"

model.push_to_hub(model_id)
tokenizer.push_to_hub(model_id)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...6f3nrvq/model.safetensors:   5%|5         | 41.6MB /  825MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/abhishekpatel10/Llama-2-7b-chat-finetune/commit/414b89aaad887370204197edfe4c6765a6fd6c97', commit_message='Upload tokenizer', commit_description='', oid='414b89aaad887370204197edfe4c6765a6fd6c97', pr_url=None, repo_url=RepoUrl('https://huggingface.co/abhishekpatel10/Llama-2-7b-chat-finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='abhishekpatel10/Llama-2-7b-chat-finetune'), pr_revision=None, pr_num=None)