In [3]:
# Standard installs
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets # If loading from Hugging Face Hub

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-a5s4shq_/unsloth_a092cada8c7a44328dd39cc59ffe1186
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-a5s4shq_/unsloth_a092cada8c7a44328dd39cc59ffe1186
  Resolved https://github.com/unslothai/unsloth.git to commit c9b9a366e7a6110f9d58d5ed8db6bd27bc97fb71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.3.17 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3


In [5]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
# from peft import LoraConfig # Not strictly needed for basic config
import os

# Hugging Face Login (Required for Llama 3 models)
from huggingface_hub import login
# IMPORTANT: Replace "hf_YOUR_HUGGINGFACE_TOKEN" with your actual HF token
# Get one from https://huggingface.co/settings/tokens
try:
    login("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", add_to_git_credential=False)
    print("Hugging Face login successful.")
except Exception as e:
    print(f"Hugging Face login failed: {e}")
    print("Please ensure you have provided a valid Hugging Face token.")

print("=== Imports and Login Complete ===")

Hugging Face login successful.
=== Imports and Login Complete ===


In [6]:
# Define major parameters
max_seq_length = 2048 # Adjust based on your VRAM and typical code length. Llama 3.1 supports up to 8192.
dtype = None # Auto-detect. Or torch.float16, torch.bfloat16 if specific needed.
load_in_4bit = True # Use 4-bit quantization for memory efficiency.

# Define the model name (Using the specific Unsloth 4-bit version)
model_name = "unsloth/llama-3.1-8b-Instruct-bnb-4bit"

print(f"Configuration:")
print(f"  Model Name: {model_name}")
print(f"  Max Sequence Length: {max_seq_length}")
print(f"  Dtype: {'Auto' if dtype is None else dtype}")
print(f"  Load in 4-bit: {load_in_4bit}")
print("=== Configuration Set ===")

Configuration:
  Model Name: unsloth/llama-3.1-8b-Instruct-bnb-4bit
  Max Sequence Length: 2048
  Dtype: Auto
  Load in 4-bit: True
=== Configuration Set ===


In [7]:
import time
start_time = time.time()
print("Loading model and tokenizer...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # Can pass token here again if login in Cell 2 failed
)

end_time = time.time()
print(f"Model and tokenizer loaded in {end_time - start_time:.2f} seconds.")
print("=== Model and Tokenizer Loaded ===")

Loading model and tokenizer...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model and tokenizer loaded in 13.78 seconds.
=== Model and Tokenizer Loaded ===


In [8]:
print("Configuring LoRA adapters...")

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank (suggested: 8, 16, 32)
    lora_alpha = 32, # LoRA alpha (often 2*r)
    lora_dropout = 0, # Set to 0 for Unsloth's fast patching compatibility (or 0.05 for regularization)
    bias = "none", # Use "none", "all", or "lora_only"
    use_gradient_checkpointing = True, # Recommended for memory savings
    random_state = 3407, # For reproducibility
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",], # Unsloth automatically finds these for most models
    # Optional: Add modules to save to merge later (e.g., for GGUF export)
    # lora_modules_to_save = ["embed_tokens", "lm_head",],
)

print("LoRA configured:")
print(model.print_trainable_parameters()) # Show trainable parameters
print("=== LoRA Configuration Complete ===")

Configuring LoRA adapters...


Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


LoRA configured:
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
None
=== LoRA Configuration Complete ===


In [10]:
# Using the alternative dataset: TokenBender/code_instructions_120k
dataset_name = "HuggingFaceH4/CodeAlpaca_20K"
print(f"Loading dataset: {dataset_name}...")

try:
    dataset = load_dataset(dataset_name, split="train")
    print("Dataset loaded successfully.")
    # *** CRITICAL: INSPECT THE FEATURES TO CONFIRM COLUMN NAMES ***
    print("\nDataset features (column names):")
    print(dataset.features)
    print(f"\nNumber of examples: {len(dataset)}")
    print("\nFirst example:")
    print(dataset[0])
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    print("Please double-check the dataset name and your internet connection.")
    # Stop execution if dataset fails to load
    raise

print("=== Dataset Loading Complete ===")

Loading dataset: HuggingFaceH4/CodeAlpaca_20K...


README.md:   0%|          | 0.00/195 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/3.01M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/336k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18019 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2003 [00:00<?, ? examples/s]

Dataset loaded successfully.

Dataset features (column names):
{'prompt': Value(dtype='string', id=None), 'completion': Value(dtype='string', id=None)}

Number of examples: 18019

First example:
{'prompt': 'Create a Java class which sorts the given array of numbers.\n[9, 2, 4, 3, 6, 1]', 'completion': 'class ArraySort { \n  \n    void sort(int arr[]) { \n        int n = arr.length; \n  \n        // One by one move boundary of unsorted subarray \n        for (int i = 0; i < n-1; i++) { \n            \n            // Find the minimum element in unsorted array \n            int min_index = i; \n            for (int j = i+1; j < n; j++) \n                if (arr[j] < arr[min_index]) \n                    min_index = j; \n  \n            // Swap the found minimum element with the first element \n            int temp = arr[min_index]; \n            arr[min_index] = arr[i]; \n            arr[i] = temp; \n        } \n    } \n  \n    // Prints the array \n    void printArray(int arr[]) { \n    

In [11]:
# IMPORTANT: Based on the output of Cell 6, verify these column names are correct!
# Common names in 'TokenBender/code_instructions_120k' seem to be 'prompt' and 'completion'. Adjust if necessary.
instruction_col = "prompt"      # Adjust if your dataset uses e.g., "instruction"
input_col = None                # Set to "input" if your dataset has a separate input column, otherwise None
output_col = "completion"       # Adjust if your dataset uses e.g., "response", "output"

print(f"Using columns - Instruction: '{instruction_col}', Input: '{input_col}', Output: '{output_col}'")
if instruction_col not in dataset.features or output_col not in dataset.features or (input_col is not None and input_col not in dataset.features):
     print("\n*** WARNING: One or more specified column names are not in the dataset features found in Cell 6! Please correct them before proceeding. ***\n")


# We need the tokenizer loaded (from Cell 4) to use apply_chat_template
def formatting_prompts_func(examples):
    instructions = examples[instruction_col]
    inputs = examples[input_col] if input_col else [None] * len(instructions)
    outputs = examples[output_col]
    texts = []

    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Combine instruction and input for the user prompt
        user_content = instruction
        if input_text is not None and str(input_text).strip(): # Check if input exists and is not empty
            user_content += "\n" + str(input_text) # Append input if it exists

        # Create the message list for the chat template
        messages = [
            {"role": "system", "content": "You are a precise and efficient coding assistant. Provide only the requested code or explanation without unnecessary chatter."},
            {"role": "user", "content": user_content.strip()},
            {"role": "assistant", "content": output} # The ground truth code/response
        ]

        # Apply the chat template. add_generation_prompt=False because we provide the full convo.
        try:
            formatted_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            texts.append(formatted_text)
        except Exception as e:
            print(f"Error formatting example: {e}")
            print(f"Instruction: {instruction}")
            print(f"Input: {input_text}")
            print(f"Output: {output}")
            # Append empty string or handle error as appropriate
            texts.append("") # Or skip the example

    return {"text": texts}

print("Formatting function defined.")
# Optional: Test formatting on one example
# test_example = dataset[5] # Try a different example
# formatted_result = formatting_prompts_func({k: [v] for k,v in test_example.items()})
# if formatted_result['text']:
#    print("\n--- Example Formatted Text ---")
#    print(formatted_result['text'][0])
# else:
#    print("\n--- Formatting test failed for example ---")

print("=== Formatting Function Defined ===")

Using columns - Instruction: 'prompt', Input: 'None', Output: 'completion'
Formatting function defined.
=== Formatting Function Defined ===


In [12]:
print("Applying formatting to the dataset...")
print("This may take a few minutes depending on dataset size...")
start_map_time = time.time()

# Use batched=True for speed. num_proc can be increased if CPU allows.
dataset = dataset.map(
    formatting_prompts_func,
    batched = True,
    # num_proc = 4, # Adjust based on your Colab CPU cores (usually 2 is safe)
    remove_columns = list(dataset.features), # Remove original columns to save memory
)

end_map_time = time.time()
print(f"Dataset formatting complete in {end_map_time - start_map_time:.2f} seconds.")

# Verify the structure and content of the formatted dataset
print("Dataset features after formatting:", dataset.features)
if 'text' in dataset.features and len(dataset) > 0:
    print("\nExample formatted entry (first 500 chars):")
    print(dataset[0]['text'][:500])
else:
    print("\nWarning: 'text' column might be missing or dataset is empty after formatting.")

print("=== Dataset Preparation Complete ===")

Applying formatting to the dataset...
This may take a few minutes depending on dataset size...


Map:   0%|          | 0/18019 [00:00<?, ? examples/s]

Dataset formatting complete in 1.20 seconds.
Dataset features after formatting: {'text': Value(dtype='string', id=None)}

Example formatted entry (first 500 chars):
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a precise and efficient coding assistant. Provide only the requested code or explanation without unnecessary chatter.<|eot_id|><|start_header_id|>user<|end_header_id|>

Create a Java class which sorts the given array of numbers.
[9, 2, 4, 3, 6, 1]<|eot_id|><|start_header_id|>assistant<|end_header_id|>

class ArraySort { 
  
    void sort(int arr[]) { 
        int n 
=== Dataset Preparation Complete ===


In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments

output_directory = "llama3_coding_finetune_adapters_run1" # Give a specific name

print(f"Configuring SFTTrainer. Output directory: {output_directory}")

trainer = SFTTrainer(
    model = model,                     # The LoRA-configured model
    tokenizer = tokenizer,             # The model's tokenizer
    train_dataset = dataset,           # The formatted training dataset
    dataset_text_field = "text",       # The column containing the formatted text
    max_seq_length = max_seq_length,   # Max sequence length for packing/padding
    dataset_num_proc = 2,              # Number of workers for dataset processing
    packing = False,                   # Disable packing for simplicity first. Set True for potential speedup if sequences vary a lot.

    args = TrainingArguments(
        per_device_train_batch_size = 2,  # Reduce this if you encounter CUDA Out-of-Memory errors
        gradient_accumulation_steps = 8,  # Increase this to effectively increase batch size (2 * 8 = 16 effective batch size)
        warmup_steps = 10,                # Number of steps for learning rate warmup
        # max_steps = 100,                # Optional: Uncomment to train for a fixed number of steps (good for quick tests)
        num_train_epochs = 1,             # Number of times to iterate over the dataset (start with 1)
        learning_rate = 2e-4,             # Standard learning rate for LoRA (can be tuned)
        fp16 = not torch.cuda.is_bf16_supported(), # Use fp16 if bf16 is not available
        bf16 = torch.cuda.is_bf16_supported(),     # Use bf16 if available (A100, H100 GPUs) for better speed/stability
        logging_steps = 10,               # Log training loss every 10 steps
        optim = "adamw_8bit",             # Use 8-bit AdamW optimizer to save memory
        weight_decay = 0.01,              # Regularization parameter
        lr_scheduler_type = "linear",     # Learning rate decay strategy
        seed = 3407,                      # Seed for reproducibility
        output_dir = output_directory,    # Where to save checkpoints and logs
        save_strategy = "steps",          # Save checkpoints at specific step intervals
        save_steps = 50,                  # Save a checkpoint every 50 steps (adjust based on dataset size and training length)
        report_to="tensorboard",          # Log to tensorboard (optional)
    ),
)

print("Trainer configured.")
# Check GPU memory before starting (optional)
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
    print(f"Pre-Train GPU: {gpu_stats.name}. Max memory reserved: {start_gpu_memory} GB.")
else:
    print("No GPU detected. Training will be very slow.")

print("=== Trainer Configuration Complete ===")

Configuring SFTTrainer. Output directory: llama3_coding_finetune_adapters_run1


Map (num_proc=2):   0%|          | 0/18019 [00:00<?, ? examples/s]

Trainer configured.
Pre-Train GPU: NVIDIA L4. Max memory reserved: 11.01 GB.
=== Trainer Configuration Complete ===


In [14]:
import gc

print("Starting training... This might take a while.")
start_train_time = time.time()

# Clear some memory before training starts
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache.")

# Start training
trainer_stats = trainer.train()

end_train_time = time.time()
print(f"Training finished in {(end_train_time - start_train_time)/60:.2f} minutes.")

# Analyze memory usage post-training (optional)
if torch.cuda.is_available():
    used_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
    used_memory_for_lora = round(used_gpu_memory - start_gpu_memory, 3) if 'start_gpu_memory' in locals() else used_gpu_memory
    print(f"\nPost-Train Peak reserved memory: {used_gpu_memory} GB.")
    if 'start_gpu_memory' in locals():
         print(f"Approx. memory used for training artifacts: {used_memory_for_lora} GB.")
else:
    print("Training completed (CPU).")

print("\nTraining stats:", trainer_stats)
print("=== Training Complete ===")

Starting training... This might take a while.
Cleared CUDA cache.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 18,019 | Num Epochs = 1 | Total steps = 1,126
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0587
20,0.7099
30,0.5379
40,0.5449
50,0.5269
60,0.5125
70,0.5156
80,0.5255
90,0.4995
100,0.5237


Training finished in 111.76 minutes.

Post-Train Peak reserved memory: 11.01 GB.
Approx. memory used for training artifacts: 0.0 GB.

Training stats: TrainOutput(global_step=1126, training_loss=0.5075154452721984, metrics={'train_runtime': 6703.1298, 'train_samples_per_second': 2.688, 'train_steps_per_second': 0.168, 'total_flos': 1.331646015281234e+17, 'train_loss': 0.5075154452721984, 'epoch': 0.9997780244173141})
=== Training Complete ===


In [15]:
final_adapter_dir = f"{output_directory}/final_adapters"
print(f"\nSaving final LoRA adapters to: {final_adapter_dir}")

# Save the trained LoRA adapters
model.save_pretrained(final_adapter_dir)

# Save the tokenizer as well (good practice)
tokenizer.save_pretrained(final_adapter_dir)

print(f"Adapters and tokenizer saved to {final_adapter_dir}.")
# You can find these files in the Colab file browser under the output directory specified in Cell 9.
print("=== Adapters Saved ===")


Saving final LoRA adapters to: llama3_coding_finetune_adapters_run1/final_adapters
Adapters and tokenizer saved to llama3_coding_finetune_adapters_run1/final_adapters.
=== Adapters Saved ===


In [16]:
import warnings
warnings.filterwarnings("ignore") # Suppress minor generation warnings

print("\nRunning Inference Test...")

# Ensure model is ready for inference
# If you stopped/restarted the notebook, you would first need to reload the base model (Cell 4)
# and then load the adapters using PeftModel:
# from peft import PeftModel
# print("Reloading base model for inference...")
# base_model, tokenizer = FastLanguageModel.from_pretrained(...) # Reload from Cell 4 config
# print("Loading saved adapters...")
# model = PeftModel.from_pretrained(base_model, final_adapter_dir) # final_adapter_dir from Cell 11
# print("Model with adapters reloaded.")

# Prepare the model for faster inference if it's still in memory from training
FastLanguageModel.for_inference(model)
model.eval() # Set model to evaluation mode

# --- Define your test prompt ---
test_instruction = "Write a Python function that takes a list of numbers and returns a new list containing only the even numbers."
test_input = "" # No separate input for this example

# --- Format the prompt using the chat template ---
messages = [
    {"role": "system", "content": "You are a precise and efficient coding assistant. Provide only the requested code or explanation without unnecessary chatter."},
    {"role": "user", "content": f"{test_instruction}\n{test_input}".strip()}
    # NO assistant message here - this is what the model should generate
]

# Tokenize the formatted prompt, adding the prompt structure expected by the model for generation
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # IMPORTANT: True for inference
    return_tensors = "pt"
).to("cuda" if torch.cuda.is_available() else "cpu") # Move inputs to GPU if available

# --- Set generation parameters ---
generation_params = {
    "max_new_tokens": 250,     # Max tokens to generate for the answer
    "use_cache": True,         # Speeds up generation
    "do_sample": True,         # Use sampling for more 'creative' answers
    "temperature": 0.6,        # Controls randomness (lower = more deterministic)
    "top_p": 0.9,              # Nucleus sampling (limits vocab sample pool)
    "eos_token_id": tokenizer.eos_token_id, # End of sequence token ID
    "pad_token_id": tokenizer.eos_token_id, # Use EOS token for padding in open-ended generation
}

# --- Generate the response ---
print("\nGenerating response...")
with torch.no_grad(): # Disable gradient calculation for inference
    outputs = model.generate(inputs, **generation_params)

# Decode the generated tokens, skipping the prompt tokens
# The generated output includes the input prompt, so we slice it off
response_tokens = outputs[0][len(inputs[0]):]
response = tokenizer.decode(response_tokens, skip_special_tokens=True)

# --- Display the results ---
print("\n--- Prompt Sent to Model (Formatted) ---")
prompt_text = tokenizer.decode(inputs[0], skip_special_tokens=False)
# Try to display only up to the assistant tag for clarity
assistant_prompt_start = "<|start_header_id|>assistant<|end_header_id|>\n\n"
print(prompt_text.split(assistant_prompt_start)[0] + assistant_prompt_start)

print("\n--- Generated Response ---")
print(response)

# Clean up memory if running multiple tests
del inputs, outputs
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n=== Inference Test Complete ===")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Running Inference Test...

Generating response...

--- Prompt Sent to Model (Formatted) ---
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a precise and efficient coding assistant. Provide only the requested code or explanation without unnecessary chatter.<|eot_id|><|start_header_id|>user<|end_header_id|>

Write a Python function that takes a list of numbers and returns a new list containing only the even numbers.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



--- Generated Response ---
def even_numbers(numbers):
    return [num for num in numbers if num % 2 == 0]

=== Inference Test Complete ===
