<a href="https://colab.research.google.com/github/balarampradhan181/CatalystAI/blob/main/TinyLlama_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dependencies

# we use the latest version of transformers, peft, and accelerate
#!pip install -q -U peft transformers
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


# we need sentencepiece for the llama2 slow tokenizer
!pip install -U sentencepiece

# we need einops, used by falcon-7b, llama-2 etc
# einops (einsteinops) is used to simplify tensorops by making them readable
!pip install -q -U einops

# we need to install datasets for our training dataset
!pip install -q -U datasets


#!pip install accelerate
#!pip install -i https://pypi.org/simple/ bitsandbytes



In [2]:
!pip install -U datasets huggingface_hub



In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: anything committed or shared
# with the file is readable by everyone who can see it. Any token that has ever
# been committed must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
#
# Take the token from the HF_TOKEN environment variable when it is already set;
# otherwise prompt for it without echoing the input or storing it in the file.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Keep the token available to later cells/libraries that read HF_TOKEN.
os.environ["HF_TOKEN"] = hf_token

login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import os
from getpass import getpass

# Make the Hugging Face token available through the HF_TOKEN environment
# variable without writing the secret into the notebook. An existing value is
# left untouched; otherwise the user is prompted (input is not echoed).
# Any token that was ever stored in a shared notebook should be revoked.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [5]:
import os
import torch
#from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer

In [6]:
# --- Run configuration -------------------------------------------------------

# Hugging Face Hub id of the checkpoint that gets fine-tuned.
model_name = "aboonaji/llama2finetune-v2"

# Hugging Face Hub id of the instruction dataset used for training.
dataset_name = "aboonaji/wiki_medical_terms_llam2_format"

# Name under which the fine-tuned result is stored.
new_model = "balaramFineTuned"


In [7]:
!nvidia-smi -q -d Memory | grep -A4 GPU
!export CUDA_VISIBLE_DEVICES=0
!export CUDA_VISIBLE_DEVICE_MEMORY_LIMIT_MB=4096

Attached GPUs                             : 1
GPU 00000000:00:04.0
    FB Memory Usage
        Total                             : 15360 MiB
        Reserved                          : 257 MiB
        Used                              : 3 MiB


In [8]:
from datasets import load_dataset, Dataset

# Pull the "train" split of the configured dataset from the Hub; any
# preprocessing of the examples can be added right after this call.
dataset = load_dataset(path=dataset_name, split="train")

# Show the dataset summary (features and row count) as the cell output.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['text'],
    num_rows: 6861
})

In [9]:
# Name of the dtype used for computation in the 4-bit quantised layers; kept as
# a string and passed unchanged to BitsAndBytesConfig below.
bnb_4bit_compute_dtype = "float16"


def get_model_and_tokenizer(mode_id):
    """Load a 4-bit quantised causal LM together with its tokenizer.

    Args:
        mode_id: Model identifier passed to both ``AutoTokenizer.from_pretrained``
            and ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        EOS token; the model's config has ``use_cache`` disabled and
        ``pretraining_tp`` set to 1.
    """
    tok = AutoTokenizer.from_pretrained(mode_id)
    # Pad with the end-of-sequence token.
    tok.pad_token = tok.eos_token

    # 4-bit NF4 quantisation with double quantisation; the compute dtype comes
    # from the module-level `bnb_4bit_compute_dtype` setting.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        bnb_4bit_use_double_quant=True,
    )

    llm = AutoModelForCausalLM.from_pretrained(
        mode_id,
        quantization_config=quantization,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # Config tweaks applied before training.
    llm.config.use_cache = False
    llm.config.pretraining_tp = 1

    return llm, tok

In [10]:
# Resolve the dtype name stored in `bnb_4bit_compute_dtype` to the matching
# attribute of the `torch` module, then display it as the cell output.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
compute_dtype

torch.float16

In [11]:
# Load the 4-bit quantised model and its tokenizer for the checkpoint named by
# `model_name`.
model, tokenizer = get_model_and_tokenizer(model_name)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# LoRA adapter settings handed to the trainer for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)


In [13]:
# Hyper-parameters for a short fine-tuning run; outputs go to the directory
# named by `new_model`.
training_arguments = TrainingArguments(
    output_dir=new_model,
    # Run length: stop after a fixed number of optimisation steps.
    max_steps=60,
    # Batching: 2 samples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimiser and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Mixed precision: fp16 enabled, bf16 disabled.
    fp16=True,
    bf16=False,
    # No checkpoints during training and no external experiment tracking.
    save_strategy="no",
    save_steps=0,
    report_to="none",
)

In [14]:
# Display how many bytes of GPU memory PyTorch currently has allocated
# (checked here after the model has been loaded).
torch.cuda.memory_allocated()

3955516416

In [15]:
# Supervised fine-tuning trainer: trains `model` with the LoRA settings in
# `peft_config` on the "text" column of `dataset`, without sequence packing
# and with sequences limited to 256 tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=256,
    packing=False,
)



In [16]:

# Increase the GPU memory available
!nvidia-smi -q -d Memory | grep -A4 GPU

# Set the PYTORCH_CUDA_ALLOC_CONF environment variable
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Train the model
trainer.train()

Attached GPUs                             : 1
GPU 00000000:00:04.0
    FB Memory Usage
        Total                             : 15360 MiB
        Reserved                          : 257 MiB
        Used                              : 5097 MiB


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=60, training_loss=1.243011474609375, metrics={'train_runtime': 224.6238, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.267, 'total_flos': 1170445597655040.0, 'train_loss': 1.243011474609375, 'epoch': 0.03})

In [17]:
# Save the trained model held by the trainer into the local directory named by
# `new_model`; a later cell reloads it from there with PeftModel.from_pretrained.
trainer.model.save_pretrained(new_model)

In [18]:
# Only let CRITICAL messages from the transformers logger through.
logging.set_verbosity(logging.CRITICAL)

# Question used to try out the model that is currently in memory.
prompt = "What is Paracetamol poisoning and explain in detail?"

# Text-generation pipeline around the in-memory model and tokenizer; prompt
# plus generated text is limited to 300 tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
    batch_size=1,
)

# Wrap the question in the [INST] ... [/INST] instruction markers and print
# the generated text.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is Paracetamol poisoning and explain in detail? [/INST]  Paracetamol poisoning, also known as acetaminophen poisoning, occurs when a person ingests too much of the medication paracetamol (acetaminophen). Paracetamol is a common pain reliever and fever reducer found in many over-the-counter medications. everybody has a different tolerance to paracetamol, but taking too much can cause liver damage.

Paracetamol poisoning can occur in two ways:

1. Overdose: Taking more than the recommended dose of paracetamol can cause poisoning. The symptoms of paracetamol overdose usually appear within 4 to 12 hours after ingestion and may include nausea, vomiting, abdominal pain, headache, confusion, and in severe cases, liver damage.
2. Accidental ingestion: Children, especially those under the age of 5, are at risk of accidentally ingesting paracetamol. This can occur when a child accidentally gets into a medication that contains paracetamol.

Symptoms of paracetamol poisoning may inc

In [19]:
# Display the id of the base checkpoint that was fine-tuned.
model_name

'aboonaji/llama2finetune-v2'

In [20]:
# Display the name under which the fine-tuned result was saved.
new_model

'balaramFineTuned'

In [21]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

0

In [22]:

import gc
import torch

# Clear unused resources: collect Python garbage, then ask PyTorch to release
# its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

# Set the allocator option on the kernel process itself. `!export ...` runs in
# a throw-away subshell and therefore never reaches PyTorch.
# NOTE(review): CUDA has already been used in this session, so the allocator
# may not pick this up - confirm, or set it before the first GPU allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# If loading still runs out of memory, restart the kernel and make sure no
# other GPU-intensive processes are running.

# Reload the base model in fp16 so the saved adapter can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,  # FP16 to save memory
    device_map="auto",  # let the library place the weights on available devices
)

# Attach the adapter saved under `new_model`, then fold its weights into the
# base model so a single standalone model remains.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved alongside the merged model.
# `trust_remote_code=True` is intentionally not passed: the same tokenizer was
# loaded without it earlier in this notebook, and the flag would allow code
# from the model repository to be executed locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# Target repository on the Hugging Face Hub.
model1 = "balarampradhan181/balaramFineTuned"

# Upload the merged model first, then the tokenizer, to the same repository.
# NOTE(review): `do_sample` does not look like a documented `push_to_hub`
# argument - confirm whether it has any effect before relying on it.
model.push_to_hub(
    model1,
    use_temp_dir=False,
    do_sample=False,
)

# The tokenizer upload is the last expression, so its commit info is the
# cell output.
tokenizer.push_to_hub(model1, use_temp_dir=False)

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/balarampradhan181/balaramFineTuned/commit/86819b9b73f90ab9ed5c5f5419941c0d3e124065', commit_message='Upload tokenizer', commit_description='', oid='86819b9b73f90ab9ed5c5f5419941c0d3e124065', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
#model1="balarampradhan181/balaramFineTuned"
#model.push_to_hub(model1, use_temp_dir=False)
#tokenizer.push_to_hub(model1, use_temp_dir=False)