In [1]:
from transformers import AutoModel, AutoConfig, AutoTokenizer
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import math as m

In [2]:
config = AutoConfig.from_pretrained('vinai/phobert-base')
phobert_model = AutoModel.from_pretrained('vinai/phobert-base', config=config)
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
data = np.load('./0.VanHoaFull/encoded_texts.npz')
X = data['X']
Y = data['Y']

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
# Chuyển đổi dữ liệu thành Tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long)

In [6]:
# Tạo DataLoader
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32)

In [7]:
class PhoBertForTextGeneration(nn.Module):
    def __init__(self, phobert_model):
        super(PhoBertForTextGeneration, self).__init__()
        self.phobert = phobert_model
        self.config = phobert_model.config  # Lưu cấu hình từ mô hình gốc
        self.linear = nn.Linear(self.config.hidden_size, self.config.vocab_size)
    
    def forward(self, input_ids, attention_mask=None):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        prediction_scores = self.linear(sequence_output)
        return prediction_scores

In [8]:
# Sử dụng GPU nếu có
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = model.to(device)
# model = PhoBertForTextGeneration(phobert_model, config)
# vocab_size = tokenizer.vocab_size
# model = PhoBertForTextGeneration(phobert_model, vocab_size).to(device)
model = PhoBertForTextGeneration(phobert_model).to(device)

In [10]:
# Định nghĩa Optimizer
optimizer = Adam(model.parameters(), lr=5e-5)

In [11]:
# Hàm huấn luyện
def train(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in data_loader:
        # Chuyển dữ liệu sang device phù hợp
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        # Xóa gradient cũ
        optimizer.zero_grad()

        # Tính toán đầu ra của mô hình
        outputs = model(input_ids=input_ids)
        # Shape của outputs là [batch_size, seq_len, vocab_size]

        # Tính toán độ lỗi
        loss = CrossEntropyLoss()(outputs.view(-1, model.config.vocab_size), labels.view(-1))

        # Backpropagation
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Trả về độ lỗi trung bình
    return total_loss / len(data_loader)

In [12]:
num_epochs = 30

# Huấn luyện mô hình
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1} ", end="\t")
    avg_loss = train(model, train_loader, optimizer)
    print(f"Loss: {avg_loss:.2f}")

Epoch 1 	Loss: 9.08
Epoch 2 	Loss: 6.29
Epoch 3 	Loss: 4.92
Epoch 4 	Loss: 3.94
Epoch 5 	Loss: 3.20
Epoch 6 	Loss: 2.64
Epoch 7 	Loss: 2.23
Epoch 8 	Loss: 1.91
Epoch 9 	Loss: 1.66
Epoch 10 	Loss: 1.45
Epoch 11 	Loss: 1.28
Epoch 12 	Loss: 1.14
Epoch 13 	Loss: 1.02
Epoch 14 	Loss: 0.92
Epoch 15 	Loss: 0.83
Epoch 16 	Loss: 0.76
Epoch 17 	Loss: 0.69
Epoch 18 	Loss: 0.63
Epoch 19 	Loss: 0.57
Epoch 20 	Loss: 0.52
Epoch 21 	Loss: 0.48
Epoch 22 	Loss: 0.44
Epoch 23 	Loss: 0.40
Epoch 24 	Loss: 0.37
Epoch 25 	Loss: 0.34
Epoch 26 	Loss: 0.31
Epoch 27 	Loss: 0.28
Epoch 28 	Loss: 0.26
Epoch 29 	Loss: 0.24
Epoch 30 	Loss: 0.22


In [13]:
torch.save(model.state_dict(), 'phobert_text_generation_model.pth')

In [14]:
model = PhoBertForTextGeneration(phobert_model)

# Tải trạng thái mô hình
model.load_state_dict(torch.load('phobert_text_generation_model.pth'))
model = model.to(device)

In [15]:
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(input_ids=input_ids)
            loss = CrossEntropyLoss()(outputs.view(-1, model.config.vocab_size), labels.view(-1))
            total_loss += loss.item()
    return total_loss / len(data_loader)

In [18]:
# Tính độ lỗi trung bình trên tập kiểm thử
avg_test_loss = evaluate(model, test_loader)
perplexity = m.exp(avg_test_loss)
print(f"Perplexity: {perplexity}")

Perplexity: 1.5260931600534433


In [None]:
# Hàm huấn luyện
def train(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids, labels = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids)
        loss = CrossEntropyLoss()(outputs.view(-1, model.config.vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)