## Installing unsloth

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture keeps the pip output out of the notebook.
import os, re
# Colab is detected by searching for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: plain install, pip resolves the dependencies itself.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the leading run of digits/dots from the torch version string
    # (e.g. "2.8.0" out of "2.8.0+cu126") and use it to choose the xformers pin.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps installs exactly the listed packages without pulling in their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The two pins below sit outside the if/else, so they are applied in both environments.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

## Defining working directory

In [None]:
from datetime import datetime

# Root folder under which this notebook writes its outputs
WORKSPACE = "/workspace"
# Minute-resolution run identifier, e.g. "2025-09-12-0047"
TIMESTAMP = f"{datetime.now():%Y-%m-%d-%H%M}"

## Loading the LLM

Download the LLM from Hugging Face in safetensors format.

In [None]:
from unsloth import FastLanguageModel

# Loading options, kept as notebook-level names (max_seq_length is reused by the trainer cell)
load_in_4bit = True
dtype = None
max_seq_length = 2048

# Fetch the pre-quantized checkpoint and its tokenizer from Hugging Face
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

### Adding LoRA adapters

docs: https://github.com/unslothai/unsloth/wiki#lora-parameters-encyclopedia

In [None]:
# Applying LoRA: wrap the loaded model with trainable low-rank adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj", ],
    lora_alpha=16,  # LoRA scaling factor; higher values make the adapter update weigh more
    lora_dropout=0,  # dropout probability applied in the LoRA layers (0 disables dropout)
    bias="none",  # do not train any bias parameters
    use_gradient_checkpointing="unsloth",
    # saves memory by recomputing activations instead of storing them (useful for long sequences)
    random_state=3407,  # Random seed
    use_rslora=False,  # rank-stabilized LoRA scaling is turned off
    loftq_config=None,  # no LoftQ configuration is passed
)

---

## Running inference before fine-tuning

In [None]:
from transformers import TextStreamer

# Single-turn conversation used as a baseline probe before any fine-tuning
question = "O que é a minima?"
messages = [{"role": "user", "content": question}]

# Switch the Unsloth model into its optimized inference mode
FastLanguageModel.for_inference(model)

# Render the conversation through the tokenizer's chat template into token ids,
# then move them to the GPU
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
)
inputs = inputs.to("cuda")

# Print the answer as it is generated, without echoing the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# The return value is discarded; the streamer prints the text
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    min_p=0.1,
    use_cache=True,
    max_new_tokens=2048,
)

---

# Dataset

## Loading the dataset

In [None]:
from datasets import load_dataset, concatenate_datasets
from unsloth.chat_templates import standardize_sharegpt

# Synthetic datasets generated with instructlab.
# The paths are built from WORKSPACE (defined in the working-directory cell) so that
# relocating the workspace only requires changing that one constant.
DATASETS_DIR = f"{WORKSPACE}/datasets/2025-09-12_004739"
KNOWLEDGE_DATASET = f"{DATASETS_DIR}/knowledge_train_msgs_2025-09-12T00_51_21.jsonl"
SKILLS_DATASET = f"{DATASETS_DIR}/skills_train_msgs_2025-09-12T00_51_21.jsonl"

# Load each JSONL file with the "json" builder, requesting its "train" split
knowledge_ds = load_dataset(
    path="json",
    data_files=KNOWLEDGE_DATASET,
    split="train",
)
skills_ds = load_dataset(
    path="json",
    data_files=SKILLS_DATASET,
    split="train",
)

# Concatenate both datasets, then shuffle with a fixed seed so the order is reproducible
combined_ds = concatenate_datasets([knowledge_ds, skills_ds])
combined_ds = combined_ds.shuffle(seed=3407)

### Applying the chat template

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the one Unsloth returns for the "llama-3.2" chat template
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")

## Standardizing the dataset

Next, we standardize the dataset to the ShareGPT format and rewrite each conversation so that it starts with our own system prompt.

In [None]:
# Formatting chat messages
def formatting_prompts(examples):
    """Render a batch of conversations to chat-template text under a fixed system prompt.

    Args:
        examples: A batch mapping whose "messages" entry is a list of conversations;
            each conversation is a list of dicts with at least a "role" key.

    Returns:
        A dict {"text": [...]} with one rendered string per conversation, in input order.

    Any system turns already present in a conversation are dropped and replaced by a
    single system turn placed first. Rendering uses the notebook-level `tokenizer`.
    """
    system_turn = {
        "role": "system",
        "content": (
            "You are Minima's expert assistant. You have deep knowledge of the Minima Innovation Studio, "
            "its methodology, success cases, and strategic value. Help users understand what we do and why we do it."
        ),
    }

    rendered = []
    for conversation in examples["messages"]:
        # Our system turn goes first, followed by every original non-system turn
        turns = [dict(system_turn)]
        turns.extend(turn for turn in conversation if turn["role"] != "system")

        # Plain text output, without a trailing generation prompt
        rendered.append(
            tokenizer.apply_chat_template(
                turns,
                add_generation_prompt=False,
                tokenize=False,
            )
        )

    return {"text": rendered}

## Updating the loaded dataset

In [None]:
# Run the ShareGPT standardization, then add the rendered "text" column batch by batch
standardize_ds = standardize_sharegpt(combined_ds).map(formatting_prompts, batched=True)

# Peek at one formatted record
standardize_ds[2]

---

# Training

## Creating the fine-tuning trainer

In [None]:
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
from transformers import DataCollatorForSeq2Seq

# Initialize the supervised fine-tuning trainer on the formatted "text" column
# NOTE(review): DataCollatorForSeq2Seq is used without a response-only masking step;
# confirm labels are produced as intended for this text-only dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=standardize_ds,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    packing=False,  # Packing is off; enabling it can speed up training on short sequences

    # Defining the training args
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # batches accumulated per weight update (2 x 4 = 8 samples per device)
        num_train_epochs=1,  # one full pass over the training set
        warmup_steps=100,  # number of learning-rate warmup steps
        # max_steps = 60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # use fp16 only when bf16 is not supported
        bf16=is_bfloat16_supported(),
        logging_steps=100,
        optim="adamw_8bit",
        weight_decay=0.01,  # Allow regularization to prevent overfitting
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir=f"{WORKSPACE}/outputs",
        report_to="none",  # no experiment tracker for now; enable WandB later
    )
)

## Running the training


In [None]:
# The baseline generation cell called FastLanguageModel.for_inference(model), so
# switch the model back to training mode explicitly before fine-tuning.
FastLanguageModel.for_training(model)

# Start the fine-tuning process and keep the value returned by train()
trainer_stats = trainer.train()

## Testing the fine-tuned model

In [None]:
from transformers import TextStreamer

# Same single-turn question as the baseline run, now asked to the fine-tuned model
question = "O que é a minima?"
messages = [{"role": "user", "content": question}]

# Switch the Unsloth model into its optimized inference mode
FastLanguageModel.for_inference(model)

# Render the conversation through the tokenizer's chat template into token ids,
# then move them to the GPU
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
)
inputs = inputs.to("cuda")

# Print the answer as it is generated, without echoing the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# The return value is discarded; the streamer prints the text
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    min_p=0.1,
    use_cache=True,
    max_new_tokens=2048,
)

## Saving to VLLM

Save the merged model in 16-bit for vLLM, and the LoRA adapters separately.

In [None]:
# Destination folders
model_dir = f"{WORKSPACE}/model"
model_lora_dir = f"{WORKSPACE}/model_lora"

# Merged 16-bit model
model.save_pretrained_merged(model_dir, tokenizer, save_method="merged_16bit")

# LoRA adapters
model.save_pretrained_merged(model_lora_dir, tokenizer, save_method="lora")

## Saving to GGUF

Save the model in GGUF format, quantized with `q4_k_m`.

In [None]:
# Output directory for the GGUF export
model_gguf_dir = f"{WORKSPACE}/model_GGUF"

# Saving the model in GGUF format, using the "q4_k_m" quantization method
model.save_pretrained_gguf(model_gguf_dir, tokenizer, quantization_method ="q4_k_m")

## Modelfile

Saving the Modelfile for GGUF to use with Ollama

In [None]:
import os

# Timestamped output folder, so each run keeps its own Modelfile
output_dir = f"{WORKSPACE}/fine-tuning/{TIMESTAMP}"
os.makedirs(output_dir, exist_ok=True)

# Modelfile path
modelfile_path = os.path.join(output_dir, "Modelfile")

# NOTE(review): `_ollama_modelfile` is a private tokenizer attribute, presumably set by
# Unsloth's chat-template helper. It may be an unformatted template (e.g. a placeholder
# for the GGUF file location) — confirm the written file is usable by Ollama as-is.
# The encoding is set explicitly so the result does not depend on the platform's locale.
with open(modelfile_path, "w", encoding="utf-8") as f:
    f.write(tokenizer._ollama_modelfile)

print(f"✅ Modelfile written to {modelfile_path}")