# Load the datasets

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer



In [2]:
dataset = load_dataset("bkai-foundation-models/vi-alpaca")
dataset

Generating train split:   0%|          | 0/50006 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 50006
    })
})

In [3]:
# remove samples without output field
dataset = dataset.filter(lambda x: x["output"] != "")

Filter:   0%|          | 0/50006 [00:00<?, ? examples/s]

In [4]:
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 47504
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2501
})


# Load the model

In [5]:
model_name = "chronopt-research/vietnamese-gpt2-medium"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)
tokenizer.pad_token = "<pad>"
model.config.pad_token_id = tokenizer.convert_tokens_to_ids("<pad>")
tokenizer.model_max_length = model.config.n_positions
model.resize_token_embeddings(len(tokenizer))

Invalid model-index. Not loading eval results into CardData.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 1024)

In [6]:
tokenizer.pad_token_id

1

In [7]:
tokenizer.eos_token_id

50257

In [8]:
len(tokenizer)

50258

In [9]:
# Total parameters and trainable parameters.
print(model)
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)
354,824,192 total parameters.
354,824,192 

# Preprocessing

In [10]:
def preprocess_function(example):
    """
    Formatting function with clear delimiters and handling of empty inputs.
    """
    instruction = example['instruction'].strip()
    # Handle empty or None input gracefully
    input_text = example['input'].strip() if example['input'] else ""
    output = example['output'].strip()
    
    # Format with clear separators
    if input_text:
        text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}\n\n"
    else:
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}\n\n"
    return text

In [11]:
# data collator for causal LM
from trl import DataCollatorForCompletionOnlyLM

response_template = "### Response:\n"
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
)

# Training

In [12]:
from trl import SFTConfig, SFTTrainer

sft_config = SFTConfig(
    # Paths & Datasets
    output_dir="gpt2-vietnamese-medium-instruct-bf16",    
    logging_dir="logs",                  
    
    # Truncation / Packing
    max_length=tokenizer.model_max_length,
    
    per_device_train_batch_size=3,       
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=4,

    # Optimization & LR Scheduling
    learning_rate=2e-5,
    weight_decay=0.03,
    num_train_epochs=10,
    warmup_steps=500,
    lr_scheduler_type="cosine",

    # Evaluation / Checkpoint
    eval_strategy="steps",              
    save_strategy="steps",              
    logging_strategy="steps",           
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
    save_total_limit=2,

    # Best‑model selection
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
trainer = SFTTrainer(
    model=model,                         
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    args=sft_config,                     
    processing_class=tokenizer,
    formatting_func=preprocess_function,
    data_collator=data_collator,
)

Applying formatting function to train dataset:   0%|          | 0/47504 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/47504 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/47504 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/47504 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/47504 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2501 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/2501 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/2501 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2501 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2501 [00:00<?, ? examples/s]

In [None]:
history = trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


In [None]:
torch.cuda.empty_cache()

In [None]:
# push to hub
trainer.push_to_hub("binhphap5/gpt2-vietnamese-medium-instruct-bf16")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/710M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/binhphap5/gpt-medium-instruct/commit/a89d09c0444ed0846d75d4ca6086132923a29b6e', commit_message='binhphap5/gpt2-vietnamese-medium-instruct-bf16', commit_description='', oid='a89d09c0444ed0846d75d4ca6086132923a29b6e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/binhphap5/gpt-medium-instruct', endpoint='https://huggingface.co', repo_type='model', repo_id='binhphap5/gpt-medium-instruct'), pr_revision=None, pr_num=None)

In [None]:
template = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

instructions = 'Trình bày từng bước để học tiếng Anh tốt.'
inputs = ''
response = ''
prompt = template.format(instructions, inputs, response)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

### Instruction:
Trình bày từng bước để học tiếng Anh tốt.

### Input:


### Response:
Để học tiếng Anh tốt, bạn có thể tuân theo các bước sau:

1. Đọc sách và nghe trực tuyến với những từ mới.
2. Nghe tin tức trên mạng xã hội và gặp gỡ bạn bè mới.
3. Tìm hiểu về các ngôn ngữ, văn hóa và hoạt động giao tiếp trong cộng đồng.
4. Chia sẻ kiến thức và kinh nghiệm cá nhân.
5. Đọc sách, tài liệu và tài liệu tham khảo khác.
6. Luyện viết và viết tiếng Anh thông qua việc đọc sách.
7. Thực hành tiếng Anh với các bài tập và bài tập thực tế.
8. Nâng cao kỹ năng viết và phát triển khả năng tư duy logic.
9. Luyện nói bằng cách sử dụng các phương pháp như viết, viết mạch lạc, hoặc sử dụng các công cụ như Python.
10. Nâng cao kiến thức và kỹ năng giao tiếp tiếng Anh.

<|endoftext|>
