## 1. Setup and Installation

In [1]:
!pip install -q unsloth torch torchvision torchaudio
!pip install -q datasets transformers peft bitsandbytes trl
!pip install -q huggingface-hub

print("Installation complete!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m359.3/359.3 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.0/288.0 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

## 2. Login to Hugging Face

In [2]:
from huggingface_hub import login
import getpass

# You'll be prompted to paste your Hugging Face token
# Get one from: https://huggingface.co/settings/tokens
hf_token = getpass.getpass("Enter your Hugging Face token: ")
login(token=hf_token)

Enter your Hugging Face token: ··········


## 3. Load Base Model and Tokenizer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Model selection - you can try different models
# Options: "meta-llama/Llama-3.1-1B", "meta-llama/Llama-3.1-3B", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "meta-llama/Llama-3.1-1B"
model_name = "unsloth/Llama-3.2-1B-Instruct"

# 4-bit quantization config for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded: {model_name}")
print(f"Model dtype: {model.dtype}")

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Model loaded: unsloth/Llama-3.2-1B-Instruct
Model dtype: torch.float16


## 4. Setup LoRA Configuration

In [6]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("LoRA configuration applied!")

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689
LoRA configuration applied!


## 5. Load and Prepare Dataset

In [12]:
from datasets import load_dataset

# Load FineTome instruction dataset
# dataset = load_dataset("FineTome/datasets", "finetome_general", split="train[:5000]")
dataset = load_dataset("mlabonne/FineTome-100k", split="train[:5000]")

print(f"Dataset size: {len(dataset)}")
print(f"First example: {dataset[0]}")

Dataset size: 5000
First example: {'conversations': [{'from': 'human', 'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programmin

## 6. Prepare Data for Training

In [13]:
def format_instruction(example):
    """Format instruction-following dataset for training."""
    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    # Create prompt
    if input_text:
        prompt = f"Instruction: {instruction}\n\nInput: {input_text}\n\nResponse: {output}"
    else:
        prompt = f"Instruction: {instruction}\n\nResponse: {output}"

    return {"text": prompt}

# Map formatting function to dataset
formatted_dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

# Tokenize
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Split into train and validation
train_val_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Training set size: 4500
Validation set size: 500


## 7. Setup Training Configuration

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    # per_device_train_batch_size=8,
    per_device_train_batch_size=2,
    # per_device_eval_batch_size=8,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    learning_rate=2e-4,
    bf16=True,  # Use bfloat16 if available
    optim="paged_adamw_8bit",
    seed=42,
    report_to="none", # Disable W&B reporting
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=None,  # Use default collator
)

print("Training configuration ready!")

Training configuration ready!


## 8. Train the Model

In [None]:
# Start training

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Re-define training_args with previous adjustments (batch size, no W&B reporting)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    learning_rate=2e-4,
    bf16=True,
    optim="paged_adamw_8bit",
    seed=42,
    report_to="none",
)

# Initialize a data collator for causal language modeling
# This ensures 'labels' are created from 'input_ids' for loss computation
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator, # Use the specialized data collator
)

trainer.train()

print("Training complete!")

Step,Training Loss,Validation Loss
200,1.577,1.577014
400,1.577,1.577006
600,1.577,1.577004
800,1.577,1.577003
1000,1.577,1.577002


## 9. Save Model Checkpoint

In [None]:
# Save LoRA weights
model.save_pretrained("./llama-3-1b-finetuned-lora")
tokenizer.save_pretrained("./llama-3-1b-finetuned-lora")

print("Model checkpoint saved locally!")

## 10. Upload to Hugging Face Hub

In [None]:
# Upload fine-tuned model to Hugging Face Hub
model_id = "llama-3-1b-finetuned-lora"
model.push_to_hub(repo_id=model_id, use_temp_dir=False)
tokenizer.push_to_hub(repo_id=model_id, use_temp_dir=False)

print(f"Model uploaded to: https://huggingface.co/{model_id}")

## 11. Save to Google Drive (Optional Backup)

In [None]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Copy checkpoint to Drive
shutil.copytree(
    './llama-3-1b-finetuned-lora',
    '/content/drive/MyDrive/llama-3-1b-finetuned-lora',
    dirs_exist_ok=True
)

print("Model saved to Google Drive!")