In [None]:
# ==============================
# 0) Cài thư viện (Colab)
# ==============================
!pip install -qq --upgrade pip
!pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import drive

# Mount Google Drive; the training parquet is read from under this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os
# Expose only GPU 0 to this process. The variable is set before `import torch`
# below — presumably so CUDA never sees any other device
# (NOTE(review): confirm the ordering is intentional before regrouping imports).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import numpy as np
import evaluate
import json

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, DataCollatorWithPadding
# Aliased so the Hugging Face loader is not confused with this notebook's own load_* helpers.
from datasets import load_dataset as hf_load_dataset
from trl import SFTConfig, SFTTrainer
import warnings
from typing import Any, Dict, List

# Silence every warning for the rest of the session. This keeps the cell
# output short, but it also hides deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

# Load model

In [None]:
def load_model(model_name):
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Both dtype and device placement are left to the library's "auto" mode.
    load_kwargs = {
        "torch_dtype": "auto",
        "device_map": "auto",
    }
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [None]:
# Base checkpoint to fine-tune. Alternative checkpoints (swap in to use):
#   model_name = "meta-llama/Llama-3.2-1B-Instruct"
#   model_name = "Qwen/Qwen3-4B"
model_name = "Qwen/Qwen3-0.6B"

model, tokenizer = load_model(model_name)

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
# Report the padding token; when the tokenizer has none, reuse EOS as padding.
pad_is_configured = tokenizer.pad_token is not None and tokenizer.pad_token_id is not None

if pad_is_configured:
    print(f'Pad token: {tokenizer.pad_token}')
    print(f'Pad token id: {tokenizer.pad_token_id}')
else:
    print("Pad token is not set. Setting it to EOS token.")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Always show the EOS token for comparison with the padding token above.
print(f'EOS token: {tokenizer.eos_token}')
print(f'EOS token id: {tokenizer.eos_token_id}')

Pad token: <|endoftext|>
Pad token id: 151643
EOS token: <|im_end|>
EOS token id: 151645


# Load dataset

In [None]:
# For the BIRD text-to-SQL dataset (parquet export)
def load_app_dataset(tokenizer, path: str, seed: "int | None" = 42):
    """Load a parquet file of text-to-SQL samples and render each row as chat text.

    Every row is turned into a system/user/assistant conversation and rendered
    with the tokenizer's chat template into a single string stored under the
    ``"text"`` column; the original columns are kept.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template``.
        path: Location of the parquet file; it is exposed as the ``"train"`` split.
        seed: Seed for shuffling the rows so repeated runs see the same order.
            Pass ``None`` for an unseeded (non-reproducible) shuffle.

    Returns:
        The shuffled dataset dict (single ``"train"`` split) with the added
        ``"text"`` column.

    Raises:
        ValueError: If the parquet file lacks one of the columns the prompt
            is built from (``schema``, ``full_question``, ``SQL``).
    """
    # Load the parquet file (e.g. from the mounted Google Drive)
    raw_dataset = hf_load_dataset(
        "parquet",
        data_files={"train": path},  # expose the parquet file as the "train" split
    )

    # Fail early with a readable message instead of a KeyError deep inside map().
    required_columns = ("schema", "full_question", "SQL")
    available_columns = raw_dataset["train"].column_names
    missing_columns = [name for name in required_columns if name not in available_columns]
    if missing_columns:
        raise ValueError(
            f"Dataset at {path!r} is missing required column(s): {missing_columns}"
        )

    # Seeded shuffle keeps the sample order stable across kernel restarts.
    raw_dataset = raw_dataset.shuffle(seed=seed)

    # System message for the assistant
    system_message = """Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.

Database Engine:
SQLite"""

    def create_conversation(sample):
        """Render one dataset row as a chat-templated training string."""
        user_prompt = f"""Database Schema:
{sample['schema']}
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.

Question:
{sample['full_question']}

Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- Do NOT hallucinate: only use tables, columns, and values that exist in the provided schema.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
- Keep the SQL minimal: no extra joins, filters, grouping, ordering, or aliases unless required.
- DO NOT give any preamble or extra characters or markdown, just the SQL query in plain text on a single line with no line breaks or indentation. DO NOT use any code fences or the substring ```sql in the output.
- Take a deep breath and think step by step to find the correct SQL query.

SQL Query:"""

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": sample["SQL"].strip()}
        ]
        # The gold SQL is already included as the assistant turn, so no
        # generation prompt is appended. `enable_thinking` is an extra keyword
        # handed to the chat template (presumably consumed by Qwen3-style
        # templates — verify when switching model families).
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=True
        )
        return {"text": text}


    # Render every row into its chat-formatted "text" field
    dataset = raw_dataset.map(create_conversation, batched=False)

    # If you only want to test with a subset
    # dataset["train"] = dataset["train"].select(range(50))

    return dataset

In [None]:
data_path = "/content/drive/MyDrive/Master/Finetune-LLM/text2sql/dataset/bird_train.parquet"
dataset = load_app_dataset(tokenizer, data_path)

# Preview one rendered sample (JSON-escaped, so newlines show up as \n),
# followed by a summary of the train split.
train_split = dataset["train"]
print(json.dumps(train_split[13]["text"], indent=2, ensure_ascii=False))
print(train_split)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5002 [00:00<?, ? examples/s]

"<|im_start|>system\nTask Overview:\nYou are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.\n\nDatabase Engine:\nSQLite<|im_end|>\n<|im_start|>user\nDatabase Schema:\nCREATE TABLE `bool` (\n    name TEXT,\n    PRIMARY KEY (name)\n);\n\nCREATE TABLE person (\n    name TEXT,\n    PRIMARY KEY (name)\n);\n\nCREATE TABLE disabled (\n    name TEXT,\n    PRIMARY KEY (name),\n    FOREIGN KEY (name) REFERENCES person (name)\n);\n\nCREATE TABLE enlist (\n    name TEXT,\n    organ TEXT, -- organization\n    FOREIGN KEY (name) REFERENCES person (name)\n);\n\nCREATE TABLE filed_for_bankrupcy (\n    name TEXT,\n    PRIMARY KEY (name),\n    FOREIGN KEY (name) REFERENCES person (name)\n);\n\nCREATE TABLE longest_absense_from_school (\n    name TEXT,\n    month INTEGER,\n    PRIMARY KEY (name),\n    FOREIGN KEY (name) REFERENCES person (name)\n);\n\nCREATE

# Load trainer

In [None]:
# Build the LoRA config (only needed when fine-tuning with PEFT)
def load_peft_config():
    """Return the LoRA configuration used for parameter-efficient fine-tuning.

    Returns:
        A ``LoraConfig`` for causal language modelling: rank 16, alpha 16,
        dropout 0.05, no bias training, applied to all linear layers, with
        ``lm_head`` and ``embed_tokens`` additionally listed in
        ``modules_to_save``.
    """
    return LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
        # Save lm_head and embed_tokens alongside the adapter, since the
        # special tokens are trained as well.
        modules_to_save=["lm_head", "embed_tokens"],
    )

In [None]:
def load_trainer(model, tokenizer, dataset, use_peft=False):
    """Build the ``SFTTrainer`` used to fine-tune ``model`` on ``dataset["train"]``.

    Args:
        model: Causal language model to fine-tune.
        tokenizer: Tokenizer passed to the trainer as its ``processing_class``.
        dataset: Mapping with a ``"train"`` split whose rows carry the
            rendered sample in a ``"text"`` field.
        use_peft: When True, the LoRA config from ``load_peft_config()`` is
            handed to the trainer; otherwise no PEFT config is used.

    Returns:
        The configured ``SFTTrainer``; training is not started here.
    """
    logging_steps = 50
    args = SFTConfig(
        output_dir="./output",                  # directory to save and repository id
        num_train_epochs=1,                     # number of training epochs
        per_device_train_batch_size=1,          # batch size per device during training
        gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
        gradient_checkpointing=True,            # use gradient checkpointing to save memory
        optim="adamw_torch_fused",              # use fused adamw optimizer
        logging_steps=logging_steps,            # log every N steps
        save_strategy="epoch",                  # save the weights the end of an epoch
        save_only_model=True,                   # only save the model weights and not the optimizer.pt and other large files
        learning_rate=1e-4,                     # learning rate
        max_grad_norm=0.3,                      # max gradient norm
        warmup_ratio=0.03,                      # warmup ratio
        # The plain "constant" schedule has no warmup phase, so warmup_ratio
        # above would be silently ignored; "constant_with_warmup" honours it
        # and then holds the learning rate constant.
        lr_scheduler_type="constant_with_warmup",
        dataset_text_field="text",
    )

    # Create Trainer object
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        peft_config=load_peft_config() if use_peft else None,
        processing_class=tokenizer
    )
    return trainer

In [None]:
# Full fine-tuning by default; set to True to train LoRA adapters instead.
use_peft = False
trainer = load_trainer(model, tokenizer, dataset, use_peft=use_peft)

Adding EOS to train dataset:   0%|          | 0/5002 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5002 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5002 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Run supervised fine-tuning and display the resulting TrainOutput summary.
train_output = trainer.train()
train_output

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mducvpd-19[0m ([33mducvpd-19-ziduck[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.528
100,0.2239
150,0.1572
200,0.1356
250,0.1316
300,0.1285
350,0.1219
400,0.1186
450,0.1276
500,0.1146


TrainOutput(global_step=1251, training_loss=0.13525946152438934, metrics={'train_runtime': 971.201, 'train_samples_per_second': 5.15, 'train_steps_per_second': 1.288, 'total_flos': 1.0470755364765696e+16, 'train_loss': 0.13525946152438934, 'entropy': 0.12073405086994171, 'num_tokens': 3961986.0, 'mean_token_accuracy': 0.9724948406219482, 'epoch': 1.0})

# Push model to hub

In [None]:
from google.colab import userdata
from huggingface_hub import login

# The access token is read from Colab's secret store, so it never appears in the notebook.
huggingface_key = userdata.get('huggingface_key')
login(token=huggingface_key, new_session=False)

In [None]:
# Push the fine-tuned weights and the tokenizer to the Hub
hub_model_name = "ZiDuck/SFT-Qwen3-0.6B-Text2SQL-MiniBIRD"
# Upload trainer.model rather than the bare `model`: without PEFT they are the
# same object, but with use_peft=True the trainer wraps the base model in a
# PEFT model, and that wrapper is what holds the trained adapter.
trainer.model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...wq8dmp3/model.safetensors:   4%|3         | 41.9MB / 1.19GB            

No files have been modified since last commit. Skipping to prevent empty commit.


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpyrzz4tlf/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/ZiDuck/SFT-Qwen3-0.6B-Text2SQL-MiniBIRD/commit/e33181fb2f9d3ec0672da18f1348ce8720b2e2cc', commit_message='Upload tokenizer', commit_description='', oid='e33181fb2f9d3ec0672da18f1348ce8720b2e2cc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ZiDuck/SFT-Qwen3-0.6B-Text2SQL-MiniBIRD', endpoint='https://huggingface.co', repo_type='model', repo_id='ZiDuck/SFT-Qwen3-0.6B-Text2SQL-MiniBIRD'), pr_revision=None, pr_num=None)