In [1]:
from huggingface_hub import login
login(token="Your_HF_Token")

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "unsloth/Llama-3.2-3B-Instruct"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, 
    bnb_4bit_use_double_quant=True,
)

# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto", # Automatically puts model on GPU
)
model.config.use_cache = False

#
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
# tll the model the vocabulary size just increased by 1
model.resize_token_embeddings(len(tokenizer))
# Configure the model to use this new token for padding
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer.padding_side = "right" 
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [3]:
import torch
import os
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

def process_data(examples):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    
    for instruction, output in zip(examples["instruction"], examples["output"]):
        # creat Full Text
        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output}
        ]
        full_text = tokenizer.apply_chat_template(messages, tokenize=False)
        
        # Tokenize (Add EOS token explicitly to be safe)
        full_tokenized = tokenizer(
            full_text, 
            truncation=True, 
            max_length=512, 
            padding="max_length"
        )
        
        input_ids = full_tokenized["input_ids"]
        attention_mask = full_tokenized["attention_mask"]
        
        prompt_messages = [{"role": "user", "content": instruction}]
        prompt_text = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
        prompt_text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
        prompt_tokenized = tokenizer(prompt_text, truncation=True, max_length=512, padding=False)
        prompt_len = len(prompt_tokenized["input_ids"])

        labels = list(input_ids)
        
        for i in range(len(labels)):
            # mask prmpt
            if i < prompt_len:
                labels[i] = -100
            
            # mask Padding, but keep  EOS token
            elif attention_mask[i] == 0:
                labels[i] = -100
                
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

print("Processing dataset...")
dataset = load_dataset("json", data_files="debug_dataset.json", split="train")
tokenized_dataset = dataset.map(process_data, batched=True, remove_columns=dataset.column_names)

model = prepare_model_for_kbit_training(model)

Processing dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [4]:
# Define LoRA Config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    "gate_proj", "up_proj", "down_proj"]
)

# Attach Adapters
model = get_peft_model(model, peft_config)

# Verify trainable parameters (Should be ~1-2% of total params), stop if not!
model.print_trainable_parameters()

# trainer params

training_args = TrainingArguments(
    output_dir="./results_riddles",
    per_device_train_batch_size=1, # Lowest possible to save memory
    gradient_accumulation_steps=4, # Accumulate to simulate batch size of 4
    learning_rate=5e-5,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    logging_steps=10,
    fp16=True, # Use mixed precision
    num_train_epochs=3,
    save_strategy="no", # Don't fill disk with checkpoints
    optim="paged_adamw_32bit", # Saves memory
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainable params: 24,313,856 || all params: 3,237,066,752 || trainable%: 0.7511


In [5]:
def test_question(question, model):
    test_input = question

    # format the prompt
    prompt_str = tokenizer.apply_chat_template(
        [{"role": "user", "content": test_input}],
        tokenize=False,
        add_generation_prompt=True
    )
    
    # tokenize
    inputs = tokenizer(prompt_str, return_tensors="pt").to("cuda")
    
    print("Generatig")
    
    # Define what counts as "Stopping"
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            eos_token_id=terminators,
            max_new_tokens=100,
            #do_sample=True,
            #temperature=0,#0.6,
            do_sample=False,       #Disable randomness completely
            temperature=None,      #temp. is ignored when do_sample=False
            top_p=None,
            pad_token_id=tokenizer.pad_token_id 
        )
    
    # Decode
    input_length = inputs.input_ids.shape[1]
    new_tokens = outputs[0][input_length:]
    
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    print("\n" + "="*30)
    print(f"INPUT: {test_input}")
    print("="*30)
    print(f"MODEL ANSWER:\n{response}")
    print("="*30)


test_question("What is fire?", model)

Generatig

INPUT: What is fire?
MODEL ANSWER:
Fire is a chemical reaction that involves the rapid oxidation of a fuel source, typically in the presence of oxygen, resulting in the release of heat, light, and various byproducts. This process is known as combustion.

The basic components of fire are:

1. Fuel: This can be any combustible material, such as wood, gasoline, propane, or even organic matter like paper or cloth.
2. Oxygen: Oxygen is necessary to sustain the combustion process. In most cases, this comes from the


In [6]:
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
10,5.2883
20,3.1386
30,1.6359
40,0.993
50,0.6979
60,0.5977
70,0.5399
80,0.5151
90,0.4766
100,0.4142


TrainOutput(global_step=375, training_loss=0.4835618240038554, metrics={'train_runtime': 745.1658, 'train_samples_per_second': 2.013, 'train_steps_per_second': 0.503, 'total_flos': 1.3100826230784e+16, 'train_loss': 0.4835618240038554, 'epoch': 3.0})

In [7]:
## comment this cell out and see what happens :)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128257, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
       

In [8]:
model.save_pretrained("my_llm")



In [9]:
test_question("What is fire?", model)

Generatig

INPUT: What is fire?
MODEL ANSWER:
Arey Fire is acha very hot so simple
