In [1]:
import sys
# Show which Python interpreter this kernel is running.
sys.executable

'/home/zineng/workspace/dev/miniconda3/envs/unsloth_env/bin/python'

In [2]:
import torch
# Sanity check: confirm PyTorch can reach a CUDA device before any model is loaded.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # should be True
if cuda_available:
    print(torch.cuda.get_device_name(0))  # should print your GPU name
else:
    # Querying the device name without a usable CUDA device raises; report the problem instead.
    print("No CUDA device is visible to PyTorch; check the GPU driver and the installed torch build.")

True
NVIDIA GeForce RTX 4070 Ti SUPER


In [3]:
# Cell 1: Imports and base‐model loading
from unsloth import FastLanguageModel
import torch

# Loading options. max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
load_in_4bit = True
dtype = None  # no explicit dtype requested; presumably Unsloth picks one — TODO confirm

# Fetch the pretrained base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.58 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Cell 2: Attach LoRA adapters to the base model through PEFT
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA hyper-parameters
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Projections that receive adapters: attention first, then the MLP
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Training-time behaviour
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)


Unsloth 2025.5.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
# Cell 3: Alpaca-style prompt template.
# Four positional slots, in order: company database, SQL prompt, SQL, explanation.
# The template both starts and ends with a newline.
alpaca_prompt = (
    "\n"
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "Company database: {}\n"
    "\n"
    "### Input:\n"
    "SQL Prompt: {}\n"
    "\n"
    "### Response:\n"
    "SQL: {}\n"
    "Explanation: {}\n"
)


In [6]:
# End-of-sequence marker taken from the loaded tokenizer; it is appended to every training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into prompt strings for training.

    Args:
        examples: Mapping with the columns "sql_context", "sql_prompt",
            "sql" and "sql_explanation", each an equally ordered sequence.

    Returns:
        A dict with a single key "text" holding one rendered string per row.
    """
    columns = (
        examples["sql_context"],
        examples["sql_prompt"],
        examples["sql"],
        examples["sql_explanation"],
    )
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    rendered = [alpaca_prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]
    return {"text": rendered}


In [7]:
from datasets import load_dataset

# Load the train split, then add the rendered "text" column in batched mode.
dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train").map(
    formatting_prompts_func,
    batched=True,
)

In [8]:
# Preview the first rendered training example. Index the row first so only that record is
# read, instead of materialising the entire "text" column just to take its first item.
dataset[0]["text"]

"\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nCompany database: CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');\n\n### Input:\nSQL Prompt: What is the total volume of timber sold by each salesperson, sorted by salesperson?\n\n### Response:\nSQL: SELECT salesperson_id, name, SUM(volume) as total_volume FROM timber_sales JOIN salesperson ON timber_sales.salesperson_id = salesperson.salesperson_id GROUP BY salesperson_id, name ORDER BY total_volume DESC;\nExplanation: Joins

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimisation settings for a short run capped at 10 steps.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Schedule
    max_steps=10,
    # num_train_epochs=1,  # Set this for 1 full training run.
    warmup_steps=5,
    lr_scheduler_type="linear",
    learning_rate=2e-4,
    # Batching: 2 sequences per device with 4 gradient-accumulation steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision: bf16 when supported, otherwise fp16
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
)

# Supervised fine-tuning over the rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)


In [10]:
# Run the fine-tuning loop configured above and keep the returned result for later inspection.
trainer_status = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.592
2,1.5855
3,1.5643
4,1.5162
5,1.3368
6,1.1838
7,1.0649
8,0.8765
9,0.836
10,0.8687


In [11]:
import os
import sys

# The GGUF export shells out to llama.cpp's convert_hf_to_gguf.py. In the recorded run that
# script died with "ModuleNotFoundError: No module named 'transformers'" although this kernel
# imports transformers without trouble, i.e. the script ran under a different interpreter.
# Presumably the converter is started as plain `python` resolved through PATH (TODO confirm
# against the installed Unsloth version), so put this kernel's environment first on PATH.
kernel_bin_dir = os.path.dirname(sys.executable)
search_path = os.environ.get("PATH", "")
if search_path.split(os.pathsep)[0] != kernel_bin_dir:  # keeps re-runs of this cell idempotent
    os.environ["PATH"] = os.pathsep.join(filter(None, [kernel_bin_dir, search_path]))

# Merge the LoRA weights into the base model and export the result, unquantized, as GGUF.
model.save_pretrained_gguf("not_quantized_model", tokenizer, quantization_method="not_quantized")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.09 out of 15.07 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 53%|██████████████████████▊                    | 17/32 [00:00<00:00, 37.94it/s]
We will save to Disk and not RAM now.
100%|███████████████████████████████████████████| 32/32 [00:09<00:00,  3.42it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['bf16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at not_quantized_model into bf16 GGUF format.
The output location will be /home/zineng/workspace/ml_dev/nlp_ml/sql_finetuning/not_quantized_model/unsloth.BF16.gguf
This might take 3 minutes...
Traceback (most recent call last):
  File [35m"/home/zineng/workspace/ml_dev/nlp_ml/sql_finetuning/llama.cpp/convert_hf_to_gguf.py"[0m, line [35m19[0m, in [35m<module>[0m
    from transformers import AutoConfig
[1;35mModuleNotFoundError[0m: [35mNo module named 'transformers'[0m


RuntimeError: Unsloth: Quantization failed for /home/zineng/workspace/ml_dev/nlp_ml/sql_finetuning/not_quantized_model/unsloth.BF16.gguf
You might have to compile llama.cpp yourself, then run this again.
You do not need to close this Python program. Run the following commands in a new terminal:
You must run this in the same folder as you're saving your model.
git clone --recursive https://github.com/ggerganov/llama.cpp
cd llama.cpp && make clean && make all -j
Once that's done, redo the quantization.

In [None]:
# Convert the model saved in not_quantized_model/ to a q8_0 GGUF file (quantized.gguf) by running
# llama.cpp's converter through %run. Requires the llama.cpp checkout in the working directory.
%run llama.cpp/convert_hf_to_gguf.py not_quantized_model --outfile quantized.gguf --outtype q8_0
