In [2]:
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from datasets import load_dataset

import torch
from lora_scratch import (
    LinearLoRA,
    create_lora,
    add_lora_layers,
    freeze_model,
    unfreeze_model,
    create_linear,
    merge_lora_layers,
)
# from peft import LoraConfig, PeftModel
# from trl import SFTTrainer

In [3]:
# llama_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

llama_base_model = LlamaForCausalLM.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
llama_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [03:09<00:00, 94.85s/it] 


In [4]:
import torch 
class ExtendedModel(torch.nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.lora = LinearLoRA(base_model.config.hidden_size, base_model.config.hidden_size, r=8, lora_alpha=16, lora_dropout=0.1)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        lora_output = self.lora(last_hidden_state)
        return lora_output
llama_lora_model = ExtendedModel(llama_base_model)


In [5]:
freeze_model(llama_lora_model)
namess=[]
for name, param in llama_lora_model.named_parameters():
    namess.append(name)

print(namess)
n_params = 0
n_trainable_params = 0
for n, p in llama_lora_model.named_parameters():
    n_params += p.numel()
    if p.requires_grad:
        n_trainable_params += p.numel()

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 6)}%")

['base_model.model.embed_tokens.weight', 'base_model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.layers.0.mlp.up_proj.weight', 'base_model.model.layers.0.mlp.down_proj.weight', 'base_model.model.layers.0.input_layernorm.weight', 'base_model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.layers.1.mlp.gate_proj.weight', 'base_model.model.layers.1.mlp.up_proj.weight', 'base_model.model.layers.1.mlp.down_proj.weight', 'base_model.model.layers.1.input_layernorm.weight', 'base_model.model.layers.1.post_attention_layernorm.weight', 'base_model.m

In [10]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "Llama-2-7b-chat-finetune"

# QLoRA parameters
# lora_r = 64
# lora_alpha = 16
# lora_dropout = 0.1

# bitsandbytes parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# TrainingArguments parameters
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

# SFT parameters
max_seq_length = None
packing = False
device_map = {"": 0}

In [12]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    llama_lora_model,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
# peft_config = LoraConfig(
#     lora_alpha=lora_alpha,
#     lora_dropout=lora_dropout,
#     r=lora_r,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    # peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

Downloading readme: 100%|██████████| 1.02k/1.02k [00:00<00:00, 7.71MB/s]
Downloading data: 100%|██████████| 967k/967k [00:00<00:00, 3.27MB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 74227.59 examples/s]


AssertionError: Torch not compiled with CUDA enabled