<a href="https://colab.research.google.com/github/balarampradhan181/CatalystAI/blob/main/TinyLlama_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies

# we use the latest version of transformers, peft, and accelerate
!pip install -q -U peft transformers

# install bitsandbytes for quantization
#!pip install -q -U bitsandbytes

# install trl for the SFT library
!pip install -q -U trl

# we need sentencepiece for the llama2 slow tokenizer
!pip install -U sentencepiece

# we need einops, used by falcon-7b, llama-2 etc
# einops (einsteinops) is used to simplify tensorops by making them readable
!pip install -q -U einops

# we need to install datasets for our training dataset
!pip install -q -U datasets

#!pip install huggingface_hub
# Install accelerate
!pip install -U accelerate

# Install bitsandbytes from PyPI
!pip install -i https://pypi.org/simple/ bitsandbytes

#!pip install accelerate
#!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [None]:
!pip install -U datasets huggingface_hub



In [None]:
#from huggingface_hub import login
#login()  # prompts for the token; never paste a real token into the notebook (the one previously committed here must be revoked)

In [None]:
import os
from getpass import getpass

# Never hardcode the Hugging Face token in the notebook: anything committed here is public.
# NOTE(review): the token that used to be hardcoded in this cell was committed and should be revoked.
# Reuse a token already present in the environment; otherwise ask for one without echoing it.
if not os.environ.get("HF_TOKEN"):
    hf_token = getpass("Hugging Face token (leave blank to skip): ").strip()
    if hf_token:
        os.environ["HF_TOKEN"] = hf_token

In [None]:
import os
import torch
#from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer



In [None]:
# --- Hugging Face Hub identifiers used throughout the notebook ---

# Instruction dataset used for fine-tuning
dataset_name = "aboonaji/wiki_medical_terms_llam2_format"

# Base model to fine-tune
model_name = "aboonaji/llama2finetune-v2"

# Name under which the fine-tuned model is saved
new_model = "balarampradhan181/llama2finetune"


In [None]:
!nvidia-smi -q -d Memory | grep -A4 GPU
!export CUDA_VISIBLE_DEVICES=0
!export CUDA_VISIBLE_DEVICE_MEMORY_LIMIT_MB=4096

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
from datasets import load_dataset, Dataset
# Pull the "train" split of the instruction dataset
# (any extra preprocessing can be added here).
dataset = load_dataset(path=dataset_name, split="train")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['text'],
    num_rows: 6861
})

In [None]:
bnb_4bit_compute_dtype = "float16"


def get_model_and_tokenizer(mode_id):
    """Load the tokenizer and the 4-bit quantized causal LM for ``mode_id``.

    Args:
        mode_id: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads with its EOS token;
        the model has ``use_cache`` disabled and ``pretraining_tp`` set to 1.
    """
    tok = AutoTokenizer.from_pretrained(mode_id)
    tok.pad_token = tok.eos_token  # reuse EOS as the padding token

    # 4-bit NF4 quantization with double quantization; the compute dtype comes
    # from the module-level ``bnb_4bit_compute_dtype`` setting.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        bnb_4bit_use_double_quant=True,
    )

    load_kwargs = {
        "quantization_config": quant_cfg,
        "device_map": "auto",
        "low_cpu_mem_usage": True,
    }
    llm = AutoModelForCausalLM.from_pretrained(mode_id, **load_kwargs)

    llm.config.use_cache = False
    llm.config.pretraining_tp = 1
    return llm, tok

In [None]:
# Look up the configured dtype name as an attribute of the torch module and display the result.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
compute_dtype

torch.float16

Looking in indexes: https://pypi.org/simple/


In [None]:
# Load the quantized model and its tokenizer for the configured model_name.
model, tokenizer = get_model_and_tokenizer(model_name)



ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# LoRA adapter settings for causal language modelling
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)


In [None]:
training_arguments = TrainingArguments(
    # Output / checkpointing / reporting
    output_dir=new_model,
    save_strategy="no",
    save_steps=0,
    report_to="none",
    # push_to_hub=True
    # Batching: 2 samples per device, gradients accumulated over 2 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimiser and schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    max_steps=60,
    #num_train_epochs=1,
    #warmup_ratio=0.03,
    #logging_steps=10,
    # Mixed-precision flags (both off)
    fp16=False,
    bf16=False,
)

In [None]:
# Display the CUDA memory currently allocated, as reported by torch, before the trainer is built.
torch.cuda.memory_allocated()

In [None]:
# Supervised fine-tuning trainer: combines the loaded model, the LoRA config and the
# training arguments, reading samples from the dataset's "text" field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell (training_arguments sets max_steps=60).
trainer.train()

In [None]:
# Save the trained model under the `new_model` path; a later cell loads it from there via PeftModel.
trainer.model.save_pretrained(new_model)

In [None]:
# Silence log output below CRITICAL
logging.set_verbosity(logging.CRITICAL)

# Build a text-generation pipeline around the fine-tuned model and query it
prompt = "What is Paracetamol poisoning and explain in detail?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
    batch_size=1,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
# Display the base model identifier.
model_name

In [None]:
# Display the fine-tuned model name.
new_model

In [None]:
# Free GPU memory held by the training objects before the base model is reloaded.
import gc

# Drop the references without failing when this cell is re-run
# (a plain `del` raises NameError once the names are gone).
for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects, then release torch's cached CUDA blocks;
# without empty_cache() the freed memory stays reserved by torch's allocator.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload the base model in FP16 and merge the LoRA weights saved under `new_model` into it
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},  # map the whole model to device 0
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved.
# trust_remote_code is deliberately not enabled: get_model_and_tokenizer loads this same
# tokenizer without it, and enabling it would allow code from the Hub repo to run locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
