In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above must stay on the
# first line of the cell; it keeps the installer output out of the notebook.
import os
# Colab is detected by searching for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last commands use --no-deps so pip does not pull in (or replace)
    # their dependencies; xformers is pinned to 0.0.29.post3.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [ ]:
from huggingface_hub import login
import os

# Authenticate against the Hugging Face Hub using the HF_TOKEN environment variable.
# Fail fast with an actionable message instead of sending a placeholder string to the
# Hub, which would only surface later as an opaque "invalid token" error.
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face access token (with write "
        "permission if you intend to push the model) before running this notebook."
    )

login(token=hf_token)

In [None]:
from unsloth import FastLanguageModel
import torch

# Load the checkpoint and its tokenizer through Unsloth. The options request 4-bit
# loading (8-bit and full fine-tuning are switched off) and a maximum sequence
# length of 8192.
load_options = dict(
    model_name="thailevann/Qwen3-1.7B_CT_VLSP_track5",
    max_seq_length=8192,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-06 02:23:50.454187: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751768630.479926     223 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751768630.487646     223 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!




==((====))==  Unsloth 2025.6.12: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.6.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Download a file from Google Drive by its file id.
# NOTE(review): presumably one of legal_corpus.json / train.json that a later cell
# opens from the working directory - confirm which id maps to which file.
!gdown 1OnxJ_UeJ_YXRX0E1U7lBIxqI9phaI6wq

In [None]:
# Download a second file from Google Drive by its file id.
# NOTE(review): presumably the other of legal_corpus.json / train.json that a later
# cell opens from the working directory - confirm which id maps to which file.
!gdown 1GatkZT0nepRMC0G2lUxofP_9yKThwVlC

In [None]:
from datasets import Dataset
import json

# Step 1: load legal_corpus.json and build a map aid -> (law_id, article content).
# If the same aid appears more than once, the last occurrence wins.
aid2info = {}

with open('legal_corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

for doc in corpus:
    law_id = doc['law_id']
    for article in doc['content']:
        aid2info[article['aid']] = (law_id, article['content_Article'])

# Step 2: load train.json and turn every item into an instruction/output pair.
instruction_output_list = []
# aids listed in train.json that have no entry in the corpus; collected so the
# placeholder text written into the training targets does not go unnoticed.
missing_aids = []

with open('train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

for item in train_data:
    question = item['question']
    relevant_laws = item['relevant_laws']

    # One numbered section per relevant article, numbered from 1 in list order.
    output_parts = []
    for idx, aid in enumerate(relevant_laws, start=1):
        law_info = aid2info.get(aid)
        if law_info is None:
            missing_aids.append(aid)
            output_parts.append(f"Luật liên quan {idx}: [Không tìm thấy aid {aid}]")
        else:
            law_id, content = law_info
            output_parts.append(f"Luật liên quan {idx}: {law_id}\n{content}")

    instruction_output_list.append({
        "instruction": question,
        "output": "\n\n".join(output_parts)
    })

if missing_aids:
    # The placeholders are kept (same targets as before), but the problem is reported
    # so the data can be fixed before the model learns to emit "not found" text.
    print(
        f"WARNING: {len(missing_aids)} relevant_laws reference(s) were not found in "
        f"legal_corpus.json (first few aids: {missing_aids[:5]})"
    )

# Step 3: build the Hugging Face dataset.
dataset = Dataset.from_list(instruction_output_list)


In [None]:
# Show the dataset summary (column names and row count) as the cell output.
dataset

Dataset({
    features: ['instruction', 'output'],
    num_rows: 2190
})

In [None]:
# No filtering happens here: the whole dataset is used as the non-reasoning split.
# (The earlier note spoke of keeping only samples whose reason_classification is empty
# or None, but this line only binds a second name to the same dataset object.)
dataset_without_reasoning = dataset

In [None]:
def convert_conversations_to_chat_format_non_reasoning(examples):
    """Convert one instruction/output record into a two-turn chat conversation.

    Args:
        examples: Mapping read via ``.get`` for the keys ``"instruction"`` (the
            question) and ``"output"`` (the expected answer). A missing key and a
            key holding ``None`` are both treated as empty text.

    Returns:
        ``{"conversation": [user_message, assistant_message]}`` where each message is
        a ``{"role", "content"}`` dict, or ``{"conversation": []}`` when the question
        or the answer is empty after stripping whitespace.
    """
    # `or ""` covers keys that exist but hold None; `.get(key, "")` alone would
    # return None in that case and `.strip()` would raise AttributeError.
    question = (examples.get("instruction") or "").strip()
    answer = (examples.get("output") or "").strip()

    # Skip records with missing content
    if not question or not answer:
        return {"conversation": []}

    # Clear, natural-sounding prompt.
    # NOTE: the leading spaces inside this literal are part of the prompt text the
    # model is trained on; keep them in sync with the prompt used at inference time.
    user_prompt = f"""Bạn là một trợ lý AI trong lĩnh vực pháp luật. Vui lòng trích dẫn các điều luật liên quan đến câu hỏi.

    ## Câu hỏi:
    {question}
    """

    chat_conversations = [
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": answer}
    ]

    return {"conversation": chat_conversations}


In [None]:
from unsloth.chat_templates import standardize_sharegpt
from datasets import load_dataset, Dataset

# Convert every record to chat format and drop the ones the converter marked as
# incomplete (empty conversation), so they do not become empty training samples.
# NOTE: this cell rebinds dataset_without_reasoning, so re-running it without first
# re-running the cell that defines the name would convert already-converted rows.
converted_data_non_reasoning = [
    converted
    for converted in map(
        convert_conversations_to_chat_format_non_reasoning,
        dataset_without_reasoning,
    )
    if converted["conversation"]
]
dataset_without_reasoning = Dataset.from_list(converted_data_non_reasoning)
# NOTE(review): the column is named "conversation" (singular) - confirm that
# standardize_sharegpt acts on this column name; the messages are already built
# with "role"/"content" keys by the converter.
dataset_without_reasoning = standardize_sharegpt(dataset_without_reasoning)


In [None]:
# Render every conversation to a plain string with the tokenizer's chat template;
# tokenize=False asks for text rather than token ids.
conversation_column = dataset_without_reasoning["conversation"]
non_reasoning_conversations = tokenizer.apply_chat_template(conversation_column, tokenize=False)


In [None]:
# Sanity check: print how many rendered conversations were produced.
print(len(non_reasoning_conversations))

2190


In [None]:
import pandas as pd
# Wrap the rendered conversations in a pandas Series for the dataset-building cell below.
non_reasoning_subset = pd.Series(non_reasoning_conversations)

In [None]:
import pandas as pd
from datasets import Dataset

# Only the non-reasoning conversations are used for training in this notebook.
# (A reasoning split could be concatenated here with pd.concat; none is defined in
# this notebook, so the dead code that referenced it has been removed.)
# rename() returns a new Series, so non_reasoning_subset itself is left untouched.
data = pd.Series(non_reasoning_subset).rename("text")

# preserve_index=False keeps the pandas index out of the dataset, so the result has
# exactly one column, "text", with no "__index_level_0__" column to remove afterwards.
combined_dataset = Dataset.from_pandas(data.to_frame(), preserve_index=False)
combined_dataset = combined_dataset.shuffle(seed = 3407)


In [None]:
from unsloth import is_bfloat16_supported
from trl import SFTConfig, SFTTrainer

# Choose the mixed-precision mode from the hardware instead of hard-coding fp16:
# GPUs without bfloat16 support keep fp16=True (the original setting), while GPUs
# with bfloat16 support train in bf16, matching the dtype the model is loaded in.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning on the "text" column of combined_dataset, with no
# evaluation set. Effective batch size per device = 1 * 8 accumulation steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset,
    eval_dataset=None,
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        warmup_steps=50,
        learning_rate=2e-5,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="./checkpoints",
        save_total_limit=2,  # keep only the two most recent checkpoints
        fp16=not use_bf16,
        bf16=use_bf16,
    ),

)



Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/2190 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning job and keep the object returned by train().
trainer_stats = trainer.train()

# Upload the model and then the tokenizer to the same Hub repository.
hub_repo_id = "thailevann/Qwen3-1.7B_SFT_VLSP_track5"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)