In [1]:
# ==============================
# 0) Cài thư viện (Colab)
# ==============================
!pip install -qq --upgrade pip
!pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import numpy as np
import evaluate
import json

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset as hf_load_dataset
from trl import SFTConfig, SFTTrainer
import warnings
from typing import Any, Dict, List

warnings.filterwarnings("ignore")

# Load model

In [3]:
def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [4]:
model_name = "Qwen/Qwen3-0.6B"
model, tokenizer = load_model(model_name)

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [5]:
if tokenizer.pad_token is None or tokenizer.pad_token_id is None:
    print("Pad token is not set. Setting it to EOS token.")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
else:
    print(f'Pad token: {tokenizer.pad_token}')
    print(f'Pad token id: {tokenizer.pad_token_id}')

print(f'EOS token: {tokenizer.eos_token}')
print(f'EOS token id: {tokenizer.eos_token_id}')

Pad token: <|endoftext|>
Pad token id: 151643
EOS token: <|im_end|>
EOS token id: 151645


# Load dataset

In [6]:
def load_app_dataset(tokenizer, path: str):
    # Load dataset from google drive
    raw_dataset = hf_load_dataset(
        "parquet",
        data_files={"train": path},
    )
    raw_dataset = raw_dataset.shuffle()

    # System message for the assistant
    system_message = """Write a SQL statement that is equivalent to the natural language user query. You are given the schema in the format of a CREATE TABLE SQL statement. Assume the table is called "df". DO NOT give any preamble or extra characters or markdown just the SQL query in plain text. Make sure the SQL query is on one line."""

    def create_conversation(sample):
        user_prompt = f"Schema:\n{sample['schema']}\n\nUser Query:\n#{sample['query']}\n\nSQL Query:\n"
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": sample["sql"].strip()}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False
        )
        return {"text": text}


    # Convert dataset to OAI messages
    dataset = raw_dataset.map(create_conversation, batched=False)

    # If you only want to test with a subset
    # dataset["train"] = dataset["train"].select(range(50))

    return dataset

In [None]:
data_path = "dataset/parquet_format/single-table_train.parquet"

dataset = load_app_dataset(tokenizer, data_path)

print(json.dumps(dataset["train"][13]["text"], indent=2, ensure_ascii=False))
print(dataset["train"])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

"<|im_start|>system\nWrite a SQL statement that is equivalent to the natural language user query. You are given the schema in the format of a CREATE TABLE SQL statement. Assume the table is called \"df\". DO NOT give any preamble or extra characters or markdown just the SQL query in plain text. Make sure the SQL query is on one line.<|im_end|>\n<|im_start|>user\nSchema:\nCREATE TABLE df (\"Country or region\" text, \"Highest point\" text, \"Maximum elevation\" text, \"Lowest point\" text, \"Minimum elevation\" text, \"Elevation span\" text)\n\nUser Query:\n#What are the highest point in latvia\n\nSQL Query:\n<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nSELECT \"Highest point\" FROM df WHERE \"Country or region\" = 'Latvia'<|im_end|>\n"
Dataset({
    features: ['query', 'schema', 'sql', 'source', 'text'],
    num_rows: 5000
})


# Load trainer

In [None]:
# Load Lora config if needed
def load_peft_config():
    # Define model init arguments
    model_kwargs = dict(
        attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
        device_map="auto", # Let torch decide how to load the model
    )

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
        modules_to_save=["lm_head", "embed_tokens"] # make sure to save the lm_head and embed_tokens as you train the special tokens
    )
    return peft_config

In [17]:
def load_trainer(model, tokenizer, dataset, use_peft=False):
    logging_steps = 50
    args = SFTConfig(
        output_dir="./output",                  # directory to save and repository id
        num_train_epochs=1,                     # number of training epochs
        per_device_train_batch_size=1,          # batch size per device during training
        gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
        gradient_checkpointing=True,            # use gradient checkpointing to save memory
        optim="adamw_torch_fused",              # use fused adamw optimizer
        logging_steps=logging_steps,            # log every N steps
        save_strategy="epoch",                  # save the weights the end of an epoch
        save_only_model=True,                   # only save the model weights and not the optimizer.pt and other large files
        learning_rate=1e-4,                     # learning rate
        max_grad_norm=0.3,                      # max gradient norm
        warmup_ratio=0.03,                      # warmup ratio
        lr_scheduler_type="constant",           # use constant learning rate scheduler
        dataset_text_field="text",
        report_to=None,
    )

    # Create Trainer object
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        peft_config=load_peft_config() if use_peft else None,
        processing_class=tokenizer
    )
    return trainer

In [18]:
use_peft = False
trainer = load_trainer(model, tokenizer, dataset, use_peft)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [19]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mducvpd-19[0m ([33mducvpd-19-ziduck[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.951
100,0.7319
150,0.6678
200,0.6387
250,0.6351
300,0.59
350,0.5934
400,0.6181
450,0.5785
500,0.6255


TrainOutput(global_step=1250, training_loss=0.5911774307250977, metrics={'train_runtime': 1014.5729, 'train_samples_per_second': 4.928, 'train_steps_per_second': 1.232, 'total_flos': 2255586271690752.0, 'train_loss': 0.5911774307250977, 'epoch': 1.0})

# Push model to hub

In [20]:
from google.colab import userdata
huggingface_key = userdata.get('huggingface_key')

from huggingface_hub import login
login(token=huggingface_key, new_session=False)

In [21]:
# Push to the hub
hub_model_name = "ZiDuck/SFT-Qwen3-0.6B-Text2SQL-SingleTable"
model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...l__hw1r/model.safetensors:   0%|          |  608kB / 1.19GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpe9s0qxmn/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

CommitInfo(commit_url='https://huggingface.co/ZiDuck/SFT-Qwen3-0.6B-Text2SQL-SingleTable/commit/e25cb8613a6d862bd259414e2d9fc7d711af256b', commit_message='Upload tokenizer', commit_description='', oid='e25cb8613a6d862bd259414e2d9fc7d711af256b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ZiDuck/SFT-Qwen3-0.6B-Text2SQL-SingleTable', endpoint='https://huggingface.co', repo_type='model', repo_id='ZiDuck/SFT-Qwen3-0.6B-Text2SQL-SingleTable'), pr_revision=None, pr_num=None)