In [None]:
# ==============================
# 0) Cài thư viện (Colab)
# ==============================
!pip install -qq --upgrade pip
!pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Restrict the process to GPU 0. This is set here, before torch is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import numpy as np
import evaluate
import json

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset as hf_load_dataset
from trl import SFTConfig, SFTTrainer
import warnings
from typing import Any, Dict, List

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Load model

In [None]:
def load_model(model_name):
    """Load a causal language model and its tokenizer by name.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # "auto" is forwarded for both the dtype selection and the device placement.
    load_options = {"torch_dtype": "auto", "device_map": "auto"}
    lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
    tok = AutoTokenizer.from_pretrained(model_name)
    return lm, tok

In [None]:
# Base checkpoint to fine-tune; `model` and `tokenizer` are used by the cells below.
model_name = "Qwen/Qwen3-0.6B"
model, tokenizer = load_model(model_name)

In [None]:
# Make sure the tokenizer has a padding token, falling back to EOS when it has none.
if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
    # A pad token is already configured: just report it.
    print(f'Pad token: {tokenizer.pad_token}')
    print(f'Pad token id: {tokenizer.pad_token_id}')
else:
    print("Pad token is not set. Setting it to EOS token.")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Always report the EOS token for comparison with the pad token.
print(f'EOS token: {tokenizer.eos_token}')
print(f'EOS token id: {tokenizer.eos_token_id}')

Pad token: <|endoftext|>
Pad token id: 151643
EOS token: <|im_end|>
EOS token id: 151645


# Load dataset

In [None]:
# For BIRD minidev Dataset
def load_app_dataset(tokenizer, path: str, seed: int = 42):
    """Load a text-to-SQL parquet file and render each row as a chat-formatted string.

    Each row must provide the fields ``schema``, ``full_question`` and ``SQL``.
    A system/user/assistant conversation is built from them and rendered with
    the tokenizer's chat template into a new ``text`` field.

    Args:
        tokenizer: Tokenizer whose ``apply_chat_template`` renders the conversation.
        path: Location of the parquet file; it is loaded as the "train" split.
        seed: Seed for the shuffle, so that row order (and therefore any
            index-based inspection or subset selection) is the same on every run.

    Returns:
        The shuffled dataset, where each row of ``dataset["train"]`` carries the
        rendered conversation under ``"text"``.
    """
    # Load the parquet file (e.g. from Google Drive) and expose it as the "train" split.
    raw_dataset = hf_load_dataset(
        "parquet",
        data_files={"train": path},
    )
    # Seeded shuffle: an unseeded one yields a different row order on every run.
    raw_dataset = raw_dataset.shuffle(seed=seed)

    # System message for the assistant
    system_message = """Task Overview:
You are a data science expert. Below, you are provided with a database schema and a natural language question. Your task is to understand the schema and generate a valid SQL query to answer the question.

Database Engine:
SQLite"""

    def create_conversation(sample):
        """Render one dataset row into a single chat-template string."""
        user_prompt = f"""Database Schema:
{sample['schema']}
This schema describes the database's structure, including tables, columns, primary keys, foreign keys, and any relevant relationships or constraints.

Question:
{sample['full_question']}

Instructions:
- Make sure you only output the information that is asked in the question. If the question asks for a specific column, make sure to only include that column in the SELECT clause, nothing more.
- Do NOT hallucinate: only use tables, columns, and values that exist in the provided schema.
- The generated query should return all of the information asked in the question without any missing or extra information.
- Before generating the final SQL query, please think through the steps of how to write the query.
- Keep the SQL minimal: no extra joins, filters, grouping, ordering, or aliases unless required.
- DO NOT give any preamble or extra characters or markdown, just the SQL query in plain text on a single line with no line breaks or indentation. DO NOT use any code fences or the substring ```sql in the output.
- Take a deep breath and think step by step to find the correct SQL query.

SQL Query:"""

        # The gold SQL is the assistant turn, i.e. the completion to be learned.
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": sample["SQL"].strip()}
        ]
        # The conversation is already complete, so no generation prompt is appended.
        # NOTE(review): `enable_thinking` is forwarded to the chat template; its
        # effect on an already-complete conversation depends on the template — confirm.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=True
        )
        return {"text": text}

    # Render every row through the chat template, one row at a time.
    dataset = raw_dataset.map(create_conversation, batched=False)

    # If you only want to test with a subset
    # dataset["train"] = dataset["train"].select(range(50))

    return dataset

In [None]:
# NOTE(review): absolute Google Drive path — requires the Drive mount cell to have run.
data_path = "/content/drive/MyDrive/Master/Finetune-LLM/text2sql/dataset/bird_train.parquet"

dataset = load_app_dataset(tokenizer, data_path)

# Inspect one formatted training example (JSON-encoded, so newlines appear
# escaped) followed by the summary of the train split.
print(json.dumps(dataset["train"][13]["text"], indent=2, ensure_ascii=False))
print(dataset["train"])

# Load trainer

In [None]:
# Load Lora config if needed
def load_peft_config():
    """Build the LoRA configuration used when PEFT fine-tuning is enabled.

    Returns:
        LoraConfig: A causal-LM LoRA setup with rank 16, alpha 16 and dropout
        0.05, applied to all linear layers, with no bias training.
    """
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
        # Besides the adapters, keep lm_head and embed_tokens trainable and save
        # them with the adapter (needed when special tokens are being trained).
        modules_to_save=["lm_head", "embed_tokens"]
    )
    return peft_config

In [None]:
def load_trainer(model, tokenizer, dataset, use_peft=False):
    """Build an ``SFTTrainer`` for supervised fine-tuning on ``dataset["train"]``.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to the trainer as its ``processing_class``.
        dataset: Mapping with a "train" split whose rows carry a "text" field.
        use_peft: When true, the LoRA config from ``load_peft_config()`` is
            passed to the trainer; otherwise no PEFT config is passed.

    Returns:
        The configured ``SFTTrainer``. Training is not started here.
    """
    logging_steps = 50
    # NOTE(review): no maximum sequence length is set, so the SFTConfig default
    # applies; long schema prompts may get truncated — confirm.
    args = SFTConfig(
        output_dir="./output",                  # directory to save and repository id
        num_train_epochs=1,                     # number of training epochs
        per_device_train_batch_size=1,          # batch size per device during training
        gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
        gradient_checkpointing=True,            # use gradient checkpointing to save memory
        optim="adamw_torch_fused",              # use fused adamw optimizer
        logging_steps=logging_steps,            # log every N steps
        save_strategy="epoch",                  # save the weights the end of an epoch
        save_only_model=True,                   # only save the model weights and not the optimizer.pt and other large files
        learning_rate=1e-4,                     # learning rate
        max_grad_norm=0.3,                      # max gradient norm
        warmup_ratio=0.03,                      # fraction of training steps used for warmup
        # The plain "constant" schedule ignores warmup_ratio; the "_with_warmup"
        # variant ramps up first and then holds the learning rate constant.
        lr_scheduler_type="constant_with_warmup",
        dataset_text_field="text",              # column holding the rendered conversation
    )

    # Create Trainer object
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        peft_config=load_peft_config() if use_peft else None,
        processing_class=tokenizer
    )
    return trainer

In [None]:
# No PEFT config is used by default; set to True to pass the LoRA config
# from load_peft_config() to the trainer.
use_peft = False
trainer = load_trainer(model, tokenizer, dataset, use_peft)

In [None]:
# Run supervised fine-tuning with the trainer built in the previous cell.
trainer.train()

# Push model to hub

In [None]:
# Read the Hugging Face token from Colab's user data (entry 'huggingface_key')
# so that it is not hard-coded in the notebook.
from google.colab import userdata
huggingface_key = userdata.get('huggingface_key')

# Log in to the Hugging Face Hub with that token.
from huggingface_hub import login
login(token=huggingface_key, new_session=False)

In [None]:
# Push to the hub
hub_model_name = "ZiDuck/SFT-Qwen3-0.6B-Text2SQL-MiniBIRD"
# Push the model held by the trainer. Without PEFT this is the same fine-tuned
# model; with use_peft=True it is the PEFT-wrapped model, so the adapter is
# uploaded rather than the base model object with LoRA layers injected into it.
trainer.model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)