In [1]:
import os
import gc
import json
import torch
from torch import nn
import numpy as np
from scipy import interpolate
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot, plot

In [2]:
import transformers

# Record the installed transformers release next to the results.
print(f"Transformers Version: {transformers.__version__}")

Transformers Version: 4.51.3


# Classification

**Chuẩn bị dữ liệu**

In [3]:
!git clone https://github.com/duyvuleo/VNTC.git
!ls VNTC/Data/10Topics/Ver1.1

Cloning into 'VNTC'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39 (from 1)[K
Receiving objects: 100% (39/39), 160.90 MiB | 47.14 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Updating files: 100% (15/15), done.
Filtering content: 100% (2/2), 168.95 MiB | 50.06 MiB/s, done.
Stats.txt  Test_Full.rar  Train_Full.rar


In [4]:
%cd /kaggle/working/VNTC/Data/10Topics/Ver1.1
!apt install unrar

/kaggle/working/VNTC/Data/10Topics/Ver1.1



unrar is already the newest version (1:6.1.5-1ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 87 not upgraded.


In [5]:
# Extract both archives into /kaggle/working; unrar's console output is discarded.
!unrar x Test_Full.rar /kaggle/working > /dev/null
!unrar x Train_Full.rar /kaggle/working > /dev/null
# Continue from the working directory and list what was extracted.
%cd /kaggle/working
!ls 

/kaggle/working
__notebook__.ipynb  Test_Full  Train_Full  VNTC


**Đọc dữ liệu từ file .txt**

In [6]:
import os
from tqdm import tqdm

# Root folders created by extracting the two VNTC archives.
train_path = '/kaggle/working/Train_Full'
test_path = '/kaggle/working/Test_Full'

# Topic folder names in label order: the position of a name is its class id.
_category_names = (
    'Kinh doanh',
    'Phap luat',
    'Suc khoe',
    'Doi song',
    'Chinh tri Xa hoi',
    'The gioi',
    'The thao',
    'Vi tinh',
    'Khoa hoc',
    'Van hoa',
)
category2id = {name: idx for idx, name in enumerate(_category_names)}


In [7]:

def read_file(category, file_path):
    """Load one UTF-16 text file and pair its text with the id of `category`.

    Args:
        category: Topic name; must be a key of the module-level `category2id`.
        file_path: Path of the text file to read.

    Returns:
        A `(text, label_id)` tuple.

    Raises:
        KeyError: If `category` is not a key of `category2id`.
    """
    with open(file_path, mode="r", encoding='utf-16') as handle:
        text = handle.read()
    label_id = category2id[category]
    return text, label_id

def make_data(root_path):
    """Collect `(text, label_id)` pairs for every file found under `root_path`.

    The name of the directory that directly contains a file is used as its
    category. Files that cannot be read or whose directory name is not a known
    category are reported on stdout and skipped.

    Args:
        root_path: Directory to walk recursively.

    Returns:
        List of `(text, label_id)` tuples, ordered by directory and file name.
    """
    data = []

    for root, dirs, files in tqdm(os.walk(root_path)):
        # os.walk yields entries in arbitrary order. Sorting `dirs` in place fixes
        # the traversal order, and sorting `files` fixes the order within a folder,
        # so the resulting list (and any seeded split of it) is reproducible.
        dirs.sort()
        category = os.path.basename(root)

        for file_name in sorted(files):
            # Built outside the try block so the error message can always use it.
            file_path = os.path.join(root, file_name)
            try:
                data.append(read_file(category, file_path))
            except Exception as e:
                # Best effort: one bad file must not abort the whole load.
                print(f"Error processing file {file_path}: {e}")

    return data

# Load both corpus splits as lists of (text, label_id) pairs.
data_train = make_data(root_path=train_path)
data_test = make_data(root_path=test_path)

11it [00:01,  8.52it/s]
11it [00:02,  5.42it/s]


In [8]:
# Sample counts, printed as: test split first, then train split.
print(f"{len(data_test)} {len(data_train)}")

50373 33759


In [9]:
# Split the training set into train and validation subsets.
from sklearn.model_selection import train_test_split

# A fixed seed makes the split repeatable across kernel restarts, and stratifying
# on the label keeps the class proportions equal in both subsets.
train_data, valid_data = train_test_split(
    data_train,
    test_size=0.1,
    random_state=42,
    stratify=[label for _, label in data_train],
)

In [10]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Shared PhoBERT tokenizer picked up by MyDataset below.
# NOTE(review): the PhoBERT model card asks for word-segmented input, while the
# texts here are passed exactly as read from disk — confirm whether a
# segmentation step is needed.
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
           
class MyDataset(Dataset):
    """Dataset that tokenizes `(text, label)` pairs on access.

    Args:
        data: Sequence of `(text, label)` pairs.
        max_length: Length every sample is truncated or padded to.
        custom_tokenizer: Callable used to encode the text. When omitted, the
            module-level `tokenizer` is used, which is what the original
            two-argument constructor did implicitly.
    """

    def __init__(self, data: list, max_length=256, custom_tokenizer=None):
        self.data = data
        # Make the dependency explicit while staying backward compatible with
        # callers that rely on the notebook-level tokenizer.
        self.tokenizer = tokenizer if custom_tokenizer is None else custom_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of `(text, label)` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample `idx`.

        Returns:
            Dict with `input_ids` and `attention_mask` (the tokenizer output with
            its leading batch dimension squeezed away) and `labels` (a scalar
            `torch.long` tensor).
        """
        text, label = self.data[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer is asked for 'pt' tensors with a batch dimension of
            # one; drop it so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].squeeze(0),  # [max_length]
            'attention_mask': encoding['attention_mask'].squeeze(0),  # [max_length]
            'labels': torch.tensor(label, dtype=torch.long)
        }


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [11]:
# Wrap each split in a MyDataset (default max_length).
my_train_data, my_valid_data, my_test_data = (
    MyDataset(split) for split in (train_data, valid_data, data_test)
)

In [12]:
# Batches of 32 for every split; only the training loader shuffles its samples.
train_dataloader = torch.utils.data.DataLoader(
    my_train_data,
    batch_size=32,
    shuffle=True,
)
valid_dataloader = torch.utils.data.DataLoader(
    my_valid_data,
    batch_size=32,
)
test_dataloader = torch.utils.data.DataLoader(
    my_test_data,
    batch_size=32,
)

In [13]:
 # Sanity check: print the first training batch only, then stop iterating.
 for data in train_dataloader:
     print(data)
     break

{'input_ids': tensor([[    0,   125, 10969,  ...,  2284,   244,     2],
        [    0,  6912,   985,  ...,  2115,    49,     2],
        [    0,  8917,  2665,  ...,    13,   996,     2],
        ...,
        [    0,  4449,  5298,  ...,  1187,     4,     2],
        [    0,  3436,  7923,  ...,     1,     1,     1],
        [    0, 34577, 55528,  ...,  2537,  5237,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([4, 3, 2, 7, 3, 7, 3, 5, 4, 1, 4, 7, 6, 0, 4, 6, 2, 1, 1, 9, 6, 3, 5, 7,
        0, 4, 4, 1, 4, 0, 4, 4])}


# Load model PhoBERT

In [14]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [15]:
from transformers import AutoModel

class PhoBert_Classifier(nn.Module):
    """Pretrained transformer encoder followed by dropout and a linear head.

    Args:
        freeze_bert: When True, the encoder parameters are excluded from
            gradient updates so only the linear head is trained.
        num_classes: Number of output logits.
        drop: Dropout probability applied before the linear head.
        pretrained_name: Checkpoint name handed to `AutoModel.from_pretrained`.
            Defaults to the checkpoint that used to be hard-coded.
    """

    def __init__(self, freeze_bert=False, num_classes=10, drop=0.3,
                 pretrained_name='vinai/phobert-base'):
        super(PhoBert_Classifier, self).__init__()

        self.num_classes = num_classes
        # Also used by the training helpers as the output directory / file prefix.
        self.model_name = 'phobert'
        self.bert = AutoModel.from_pretrained(pretrained_name)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        self.drop = nn.Dropout(drop)
        self.fc = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Output of the linear head, with `num_classes` values in the last
            dimension.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (presumably the pooled sentence vector — confirm
        # against the encoder's documentation).
        _, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        return self.fc(self.drop(pooled))
    


# Train

In [16]:
import os
import gc
import json
import torch
from torch import nn
import numpy as np
from scipy import interpolate
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot, plot

def train_step(model, criterion, optimizer, train_dataloader, scheduler=None):
    """Run one training epoch and report accuracy, macro F1 and mean loss.

    Batches are moved to the notebook-level `device` before the forward pass.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        criterion: Loss callable taking `(outputs, labels)`.
        optimizer: Optimizer stepped once per batch.
        train_dataloader: Iterable of dict batches with the keys `input_ids`,
            `attention_mask` and `labels`; must expose a sized `.dataset`.
        scheduler: Learning-rate scheduler stepped once per batch. When omitted,
            the notebook-level `lr_scheduler` is used, so existing
            four-argument calls keep working.

    Returns:
        Tuple `(accuracy, macro_f1, mean_loss)` for the epoch.

    Raises:
        NameError: If `scheduler` is omitted and no global `lr_scheduler` exists.
    """
    # Resolve the scheduler up front so a missing one fails before any update
    # instead of after the first optimizer step.
    if scheduler is None:
        scheduler = lr_scheduler

    model.train()
    losses = []
    correct = 0

    trues = []
    predicts = []

    for data in tqdm(train_dataloader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, labels)
        # Index of the largest logit per sample is the predicted class.
        pred = torch.max(outputs, dim=1)[1]

        correct += torch.sum(pred == labels)
        losses.append(loss.item())
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        trues.extend(labels.cpu().detach().numpy())
        predicts.extend(pred.cpu().detach().numpy())

    accuracy = correct.double().cpu().data.numpy()/len(train_dataloader.dataset)
    loss = np.mean(losses)

    return accuracy, f1_score(trues, predicts, average='macro'), loss


def validation_step(model, criterion, dataloader):
    """Evaluate `model` on `dataloader` without updating any weights.

    Batches are moved to the notebook-level `device` before the forward pass.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        criterion: Loss callable taking `(outputs, labels)`.
        dataloader: Iterable of dict batches with the keys `input_ids`,
            `attention_mask` and `labels`; must expose a sized `.dataset`.

    Returns:
        Tuple `(accuracy, macro_f1, mean_loss)` over the whole loader.
    """
    model.eval()

    batch_losses, trues, predicts = [], [], []
    correct = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            labels = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Second value returned by torch.max is the arg-max index per row.
            _, pred = torch.max(outputs, dim=1)

            batch_losses.append(criterion(outputs, labels).item())
            correct += torch.sum(pred == labels)

            trues.extend(labels.cpu().detach().numpy())
            predicts.extend(pred.cpu().detach().numpy())

    accuracy = correct.double().cpu().data.numpy() / len(dataloader.dataset)
    mean_loss = np.mean(batch_losses)

    return accuracy, f1_score(trues, predicts, average='macro'), mean_loss


def train(model, criterion, optimizer, train_dataloader, val_dataloader, epochs, early_stopping):
    """Train `model` for up to `epochs` epochs and persist models and history.

    Files are written into a directory named after `model.model_name`:
    the best model by validation macro F1, the model after the last executed
    epoch, the metric history as JSON and, when `early_stopping` is given, its
    checkpoint.

    Args:
        model: Module to train; must expose a `model_name` attribute.
        criterion: Loss callable forwarded to the step functions.
        optimizer: Optimizer forwarded to `train_step`.
        train_dataloader: Loader used for the training epoch.
        val_dataloader: Loader used for validation after each epoch.
        epochs: Maximum number of epochs.
        early_stopping: Optional callable `(val_loss, model)` exposing `path`
            and `early_stop`; a falsy value disables early stopping.

    Returns:
        Dict mapping `train_acc`, `train_f1`, `train_loss`, `val_acc`, `val_f1`
        and `val_loss` to per-epoch lists.
    """
    torch.cuda.empty_cache()
    gc.collect()

    Path(model.model_name).mkdir(parents=True, exist_ok=True)

    if early_stopping:
        path_checkpoint = os.path.join(model.model_name, f"{model.model_name}_checkpoint.pth")
        early_stopping.path = path_checkpoint

    # Start below every reachable score so the first epoch always writes a best
    # model, even when its macro F1 is exactly 0; otherwise a later load of the
    # best-model file would fail.
    best_f1 = float('-inf')
    best_model_path = os.path.join(model.model_name, f"{model.model_name}_best_model.pth")
    last_model_path = os.path.join(model.model_name, f"{model.model_name}_last_model.pth")
    # F1 drives the best-model selection, so it is recorded alongside the
    # accuracy and loss curves.
    history = {
        'train_acc': [], 'train_f1': [], 'train_loss': [],
        'val_acc': [], 'val_f1': [], 'val_loss': [],
    }

    for epoch in range(epochs):
        print("Epoch {}/{}".format(epoch+1, epochs))
        print("-"*50)

        train_accuracy, train_f1, train_loss = train_step(model, criterion, optimizer, train_dataloader)
        val_accuracy, val_f1, val_loss = validation_step(model, criterion, val_dataloader)

        history['train_acc'].append(train_accuracy)
        history['train_f1'].append(train_f1)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_accuracy)
        history['val_f1'].append(val_f1)
        history['val_loss'].append(val_loss)

        print("Train Accuracy: {:.4f} - Train F1-score: {:.4f} - Train Loss: {:.4f}".format(train_accuracy, train_f1, train_loss))
        print("Valid Accuracy: {:.4f} - Valid F1-score: {:.4f} - Valid Loss: {:.4f}".format(val_accuracy, val_f1, val_loss))

        if val_f1 > best_f1:
            torch.save(model.state_dict(), best_model_path)
            best_f1 = val_f1

        # On the final epoch the early-stopping check is skipped entirely.
        if epoch + 1 == epochs:
            torch.save(model.state_dict(), last_model_path)
            break

        if early_stopping:
            early_stopping(val_loss, model)
            if early_stopping.early_stop:
                torch.save(model.state_dict(), last_model_path)
                print("Early stopping.")
                break

    history_path = os.path.join(model.model_name, f"{model.model_name}_history.json")
    with open(history_path, 'w') as f:
        json.dump(history, f)

    return history


def test(model, dataloader):
    """Predict a class for every sample in `dataloader`.

    Inputs are moved to the notebook-level `device` before the forward pass.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        dataloader: Iterable of dict batches with the keys `input_ids`,
            `attention_mask` and `labels`.

    Returns:
        Tuple `(true_labels, predicts)` of equally long lists holding the
        reference label and the arg-max prediction of each sample.
    """
    model.eval()
    predicts = []
    true_labels = []

    # Inference only, so autograd stays off for the whole loop. The raw model
    # outputs used to be accumulated in an unused list; that is dropped.
    with torch.no_grad():
        for data in tqdm(dataloader):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.max(outputs, dim=1)[1]

            predicts.extend(pred.cpu().numpy())
            # Labels are only needed on the host, so they skip the device copy.
            true_labels.extend(data['labels'].cpu().numpy())

    return true_labels, predicts


def evaluate(true_labels, predicts):
    """Print the test metrics and return the confusion matrix.

    Accuracy, macro precision, macro recall and macro F1 are printed with four
    decimals, followed by the confusion matrix and the per-class report.

    Args:
        true_labels: Reference class ids.
        predicts: Predicted class ids, aligned with `true_labels`.

    Returns:
        The confusion matrix computed from the two sequences.
    """
    macro = {'average': 'macro'}
    report_lines = [
        ("Accuracy: {:.4f}", accuracy_score(true_labels, predicts)),
        ("Precision: {:.4f}", precision_score(true_labels, predicts, **macro)),
        ("Recall: {:.4f}", recall_score(true_labels, predicts, **macro)),
        ("F1-score: {:.4f}", f1_score(true_labels, predicts, **macro)),
    ]
    test_cm = confusion_matrix(true_labels, predicts)

    for template, value in report_lines:
        print(template.format(value))
    print("Confusion matrix:\n", test_cm)
    print(classification_report(true_labels, predicts, digits=4))

    return test_cm


def _save_metric_plot(epochs, xnew, train_values, val_values, line_colors, metric, file_name):
    """Plot the training and validation curves of one metric and save them as a PNG."""
    train_curve = interpolate.interp1d(epochs, train_values, kind='linear')(xnew)
    val_curve = interpolate.interp1d(epochs, val_values, kind='linear')(xnew)

    fig, ax = plt.subplots()
    ax.plot(xnew, train_curve, color=line_colors[0], linewidth=3, label=f'Training {metric}')
    ax.plot(xnew, val_curve, color=line_colors[1], linewidth=3, label=f'Validation {metric}')
    ax.set_title(f'Training and Validation {metric}', fontsize=12)
    ax.set_xlabel('Epoch', fontsize=10, labelpad=10)
    ax.set_ylabel(metric, fontsize=10, labelpad=10)
    ax.tick_params(axis='both', which='both', length=0)
    ax.grid(axis='y')
    for pos in ['right', 'top', 'left', 'bottom']:
        ax.spines[pos].set_visible(False)
    ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), frameon=False, prop={'size': 10})
    fig.savefig(file_name, dpi=300, bbox_inches='tight')
    # Close explicitly so the figure is neither rendered inline nor kept in memory.
    plt.close(fig)


def save_visualization_history(history, model_name):
    """Save loss and accuracy curves of a training run as PNG files.

    The files `<model_name>_loss_visualization.png` and
    `<model_name>_accuracy_visualization.png` are written into the directory
    `model_name`, which is created when missing.

    Args:
        history: Dict with equally long lists under `train_loss`, `val_loss`,
            `train_acc` and `val_acc`, one entry per epoch.
        model_name: Output directory and file-name prefix.

    Returns:
        False when fewer than two epochs are recorded (no curve can be drawn),
        True once both figures have been written.
    """
    if len(history['train_loss']) <= 1:
        return False

    # The function may be called without a preceding training run in this
    # kernel, so do not rely on the directory already existing.
    os.makedirs(model_name, exist_ok=True)

    colors = plt.get_cmap('tab10').colors
    epochs = np.arange(1, len(history['train_loss']) + 1)
    # Dense x grid; the curves are piecewise-linear between epoch values.
    xnew = np.linspace(epochs[0], epochs[-1], 300)

    _save_metric_plot(
        epochs, xnew, history['train_loss'], history['val_loss'],
        (colors[0], colors[1]), 'Loss',
        os.path.join(model_name, f"{model_name}_loss_visualization.png"),
    )
    _save_metric_plot(
        epochs, xnew, history['train_acc'], history['val_acc'],
        (colors[2], colors[4]), 'Accuracy',
        os.path.join(model_name, f"{model_name}_accuracy_visualization.png"),
    )

    # Explicit success value so the result is a real boolean in both outcomes.
    return True
    
    

In [17]:

class EarlyStopping:
    """Signal a stop when the validation loss has not improved for `patience` calls.

    Args:
        patience: Number of consecutive non-improving calls after which
            `early_stop` is set to True.
        verbose: When True, print counter updates and checkpoint messages.
        delta: Minimum decrease of the validation loss, relative to the best
            one seen so far, that counts as an improvement.
        path: File the model `state_dict` is saved to on every improvement.
    """

    def __init__(self, patience=5, verbose=True, delta=0, path='checkpoint.pth'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation loss; checkpoint on improvement, count otherwise."""
        # Work with the negated loss so that a larger score is better.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1

            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')

            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save `model.state_dict()` to `self.path` and remember `val_loss` as the minimum."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')

        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [18]:
# Tabular view of the training pairs, one row per (text, label) sample.
train_df = pd.DataFrame(
    data=my_train_data.data,
    columns=['text', 'label'],
)

In [19]:
from sklearn.utils import class_weight

# 'balanced' weights computed from the training labels, stored as a float
# tensor so they can be handed to the loss function.
labels = train_df['label']
class_weights = torch.tensor(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels,
    ),
    dtype=torch.float,
)

In [20]:
from transformers import get_linear_schedule_with_warmup

model_name = 'phobert'
model = PhoBert_Classifier().to(device)

# Only parameters that still require gradients are optimized.
# Learning rates noted by the author: 5e-5, 2e-5, 3e-5.
optimizer = torch.optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=2e-5,
)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device), reduction='mean')

epochs = 10
# The scheduler is stepped once per batch, so the schedule spans all batches of all epochs.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs,
)
early_stopping = EarlyStopping(patience=3, delta=0)

2025-07-05 02:30:34.627920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751682634.818261      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751682634.875844      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [21]:
# Run the full training loop; the returned dict holds the per-epoch metrics.
history = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=valid_dataloader,
    epochs=epochs,
    early_stopping=early_stopping,
)

Epoch 1/10
--------------------------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

Train Accuracy: 0.8655 - Train F1-score: 0.8587 - Train Loss: 0.5407
Valid Accuracy: 0.9028 - Valid F1-score: 0.9003 - Valid Loss: 0.3019
Validation loss decreased (inf --> 0.301934). Saving model ...
Epoch 2/10
--------------------------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

Train Accuracy: 0.9312 - Train F1-score: 0.9291 - Train Loss: 0.2277
Valid Accuracy: 0.9200 - Valid F1-score: 0.9195 - Valid Loss: 0.2475
Validation loss decreased (0.301934 --> 0.247489). Saving model ...
Epoch 3/10
--------------------------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

Train Accuracy: 0.9528 - Train F1-score: 0.9516 - Train Loss: 0.1548
Valid Accuracy: 0.9212 - Valid F1-score: 0.9201 - Valid Loss: 0.2845
EarlyStopping counter: 1 out of 3
Epoch 4/10
--------------------------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

Train Accuracy: 0.9706 - Train F1-score: 0.9706 - Train Loss: 0.1014
Valid Accuracy: 0.9212 - Valid F1-score: 0.9214 - Valid Loss: 0.3198
EarlyStopping counter: 2 out of 3
Epoch 5/10
--------------------------------------------------


  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

Train Accuracy: 0.9812 - Train F1-score: 0.9808 - Train Loss: 0.0692
Valid Accuracy: 0.9262 - Valid F1-score: 0.9252 - Valid Loss: 0.3252
EarlyStopping counter: 3 out of 3
Early stopping.


# Test

In [22]:
# Evaluate the weights saved after the last executed epoch.
model_path = os.path.join(model.model_name, f"{model.model_name}_last_model.pth")
model = PhoBert_Classifier().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded in a CPU-only session.
model.load_state_dict(torch.load(model_path, map_location=device))
true_labels, predicts = test(model, test_dataloader)
test_cm = evaluate(true_labels, predicts)

  0%|          | 0/1575 [00:00<?, ?it/s]

Accuracy: 0.9023
Precision: 0.8850
Recall: 0.8635
F1-score: 0.8656
Confusion matrix:
 [[4676   36   17   16  293   49    6  136   32   15]
 [  42 3403    4    5  274    3   26    9    0   22]
 [  11    6 5093   50   63   23    6    6  150    9]
 [  34   25  118  738  370   98   27   41  198  387]
 [ 239  159  141   43 6628   45   32   74   89  117]
 [ 112   16   61   14   75 6164   47   60   88   79]
 [   3   13    3    7   21    6 6577    7    6   24]
 [  46   13   10    7   42   13    6 4352   56   15]
 [  21    1   94   36   67   26    5   42 1772   32]
 [   6   10    5   16   80   36   15    8   27 6047]]
              precision    recall  f1-score   support

           0     0.9010    0.8863    0.8936      5276
           1     0.9242    0.8984    0.9111      3788
           2     0.9183    0.9402    0.9291      5417
           3     0.7918    0.3625    0.4973      2036
           4     0.8376    0.8759    0.8563      7567
           5     0.9537    0.9178    0.9354      6716
    

In [23]:
# Evaluate the weights with the highest validation macro F1.
model_path = os.path.join(model_name, f"{model_name}_best_model.pth")
model = PhoBert_Classifier().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded in a CPU-only session.
model.load_state_dict(torch.load(model_path, map_location=device))
true_labels, predicts = test(model, test_dataloader)
test_cm = evaluate(true_labels, predicts)

  0%|          | 0/1575 [00:00<?, ?it/s]

Accuracy: 0.9023
Precision: 0.8850
Recall: 0.8635
F1-score: 0.8656
Confusion matrix:
 [[4676   36   17   16  293   49    6  136   32   15]
 [  42 3403    4    5  274    3   26    9    0   22]
 [  11    6 5093   50   63   23    6    6  150    9]
 [  34   25  118  738  370   98   27   41  198  387]
 [ 239  159  141   43 6628   45   32   74   89  117]
 [ 112   16   61   14   75 6164   47   60   88   79]
 [   3   13    3    7   21    6 6577    7    6   24]
 [  46   13   10    7   42   13    6 4352   56   15]
 [  21    1   94   36   67   26    5   42 1772   32]
 [   6   10    5   16   80   36   15    8   27 6047]]
              precision    recall  f1-score   support

           0     0.9010    0.8863    0.8936      5276
           1     0.9242    0.8984    0.9111      3788
           2     0.9183    0.9402    0.9291      5417
           3     0.7918    0.3625    0.4973      2036
           4     0.8376    0.8759    0.8563      7567
           5     0.9537    0.9178    0.9354      6716
    