In [1]:
# Install the Python wrappers, pinned to the versions this notebook was run with.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q vncorenlp==1.0.3 phonlp==0.3.4
# Fetch VnCoreNLP: a repository clone plus the v1.2 release archive whose jar is
# loaded later in the notebook. Each step is safe to re-run on an existing workspace.
![ -d VnCoreNLP ] || git clone https://github.com/vncorenlp/VnCoreNLP.git
!wget -nc https://github.com/vncorenlp/VnCoreNLP/archive/refs/tags/v1.2.zip
!unzip -o -q v1.2.zip
# Target directory for the phoNLP checkpoint.
!mkdir -p phonlp

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting phonlp
  Downloading phonlp-0.3.4.tar.gz (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Collecting gdown>=3.12.2 (from phonlp)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Building wheels for collected packages: vncorenlp, phonlp
  Building wheel for vncorenlp (setup.py) ... [?25l- done
[?25h  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645932 sha256=54c1f6f63373c2cd1d0c27f427bb08badf19d51ce059be37e928df928f71f6fc
  Stored in directory: /root/.cache/pip/wheels/5d/d9/b3/41f6c6b1

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from vncorenlp import VnCoreNLP
import numpy as np
import os
import time
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import phonlp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm.auto import tqdm
tqdm.pandas()


## Chuẩn Bị Dữ Liệu

In [3]:
class ShopeeCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for classification.

    Args:
        comments: Indexable collection of comment texts (list, NumPy array or
            pandas Series).
        labels: Indexable collection of integer class ids aligned with
            ``comments``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoded comment is padded / truncated to.

    Raises:
        ValueError: If ``comments`` and ``labels`` differ in length.
    """

    def __init__(self, comments, labels, tokenizer, max_length=256):
        if len(comments) != len(labels):
            raise ValueError(
                f"comments ({len(comments)}) and labels ({len(labels)}) must have the same length"
            )
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _item_at(values, idx):
        """Return the element at *position* ``idx``.

        pandas objects index by label with ``[]``; after a shuffled split the
        labels are no longer 0..n-1, so ``.iloc`` is used whenever it exists.
        """
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        """Return the text, token ids, attention mask and label of item ``idx``."""
        comment = str(self._item_at(self.comments, idx))
        label = self._item_at(self.labels, idx)

        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'comment_text': comment,
            # The tokenizer returns a leading batch axis; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [4]:
# Start VnCoreNLP with the wseg, pos, ner and parse annotators, then download
# and load the phoNLP checkpoint.
annotator = VnCoreNLP(
    "/kaggle/working/VnCoreNLP-1.2/VnCoreNLP-1.2.jar",
    annotators="wseg,pos,ner,parse",
    max_heap_size='-Xmx2g',
)
phonlp.download(save_dir='/kaggle/working/phonlp')
phonlp_model = phonlp.load(save_dir='/kaggle/working/phonlp')

Downloading...
From: https://public.vinai.io/phonlp.pt
To: /kaggle/working/phonlp/phonlp.pt
100%|██████████| 582M/582M [00:02<00:00, 225MB/s]
  checkpoint = torch.load(model_file, lambda storage, loc: storage)


Loading model from: /kaggle/working/phonlp/phonlp.pt


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



In [5]:
def _tag_text(tag, default='O'):
    """Return one tag as a string, unwrapping a tag stored in a list/tuple."""
    if isinstance(tag, (list, tuple)):
        return str(tag[0]) if len(tag) > 0 else default
    if tag is None:
        return default
    return str(tag)


def _collect_phonlp_tags(phonlp_output, n_tokens):
    """Flatten a phoNLP annotation into ``(pos_tags, ner_tags)``.

    Supported layouts:
      * parallel lists ``(words, pos, ner, ...)`` with one inner list per
        sentence -- the layout phoNLP's own ``print_out`` helper indexes
        (NOTE(review): confirm against the installed phoNLP version);
      * a list (or a tuple whose first element is a list) of sentences, each
        either ``{'words': [...]}`` or a list of ``(word, pos, ner)`` items.

    Returns ``None`` when the top-level container is not recognised.
    """
    pos_tags, ner_tags = [], []

    # Parallel layout: element 0 holds only the word strings, so the tags must
    # be read from elements 1 and 2 instead.
    if (
        isinstance(phonlp_output, tuple)
        and len(phonlp_output) >= 3
        and all(isinstance(part, (list, tuple)) for part in phonlp_output[:3])
        and all(
            isinstance(sent, (list, tuple)) and all(isinstance(w, str) for w in sent)
            for sent in phonlp_output[0]
        )
    ):
        for pos_sent, ner_sent in zip(phonlp_output[1], phonlp_output[2]):
            for pos, ner in zip(pos_sent, ner_sent):
                pos_tags.append(_tag_text(pos))
                ner_tags.append(_tag_text(ner))
        return pos_tags, ner_tags

    if isinstance(phonlp_output, tuple) and len(phonlp_output) > 0:
        sentences = phonlp_output[0]
    elif isinstance(phonlp_output, list):
        sentences = phonlp_output
    else:
        return None

    for sent in sentences:
        if isinstance(sent, dict) and 'words' in sent:
            words = sent['words']
        elif isinstance(sent, (list, tuple)):
            words = sent
        else:
            # Unknown sentence shape: pad with neutral tags.
            print("Unexpected sentence structure.")
            pos_tags.extend(['O'] * n_tokens)
            ner_tags.extend(['O'] * n_tokens)
            continue

        for word in words:
            if isinstance(word, dict):
                pos = word.get('pos', 'O')
                ner = word.get('ner', 'O')
            elif isinstance(word, (list, tuple)):
                # Item laid out as (word, pos, ner).
                pos = word[1] if len(word) > 1 else 'O'
                ner = word[2] if len(word) > 2 else 'O'
            else:
                pos = ner = 'O'
            pos_tags.append(_tag_text(pos))
            ner_tags.append(_tag_text(ner))

    return pos_tags, ner_tags


def preprocess_text(text, annotator, phonlp_model):
    """Segment ``text`` and decorate every token with its POS and NER tag.

    Args:
        text: Raw comment text.
        annotator: Object whose ``tokenize(text)`` returns a list of sentences,
            each a list of word tokens (here: the VnCoreNLP client).
        phonlp_model: Object whose ``annotate(text)`` returns the tagging of a
            whitespace-separated token string (here: the phoNLP model).

    Returns:
        A space-joined string of ``token_POS_NER`` features. Returns ``''``
        when segmentation yields no tokens, and ``token_O_O`` features when
        the phoNLP output has an unrecognised structure.
    """
    # Word-segment with VnCoreNLP and flatten the sentences into one token list.
    sentences = annotator.tokenize(text)
    tokens = [word for sentence in sentences for word in sentence]
    if not tokens:
        # Nothing to tag; avoid calling the tagger on an empty string.
        return ''
    text_tokenized = ' '.join(tokens)

    # Tag the pre-segmented text with phoNLP.
    phonlp_output = phonlp_model.annotate(text_tokenized)

    tags = _collect_phonlp_tags(phonlp_output, len(tokens))
    if tags is None:
        print("Unexpected phonlp_output structure.")
        return ' '.join([f"{token}_O_O" for token in tokens])
    pos_tags, ner_tags = tags

    if len(pos_tags) != len(tokens) or len(ner_tags) != len(tokens):
        print(f"Warning: Number of POS tags ({len(pos_tags)}), NER tags ({len(ner_tags)}), and tokens ({len(tokens)}) do not match.")

    # zip() stops at the shortest sequence, which trims any length mismatch.
    return ' '.join(
        f"{token}_{pos}_{ner}" for token, pos, ner in zip(tokens, pos_tags, ner_tags)
    )


In [None]:
# Load the labelled comments and drop rows containing missing values.
# drop=True renumbers the rows without inserting the old row labels as a stray
# `index` column.
data = pd.read_csv('/kaggle/input/data-dl-tk2/df_final_a.csv')
data = data.dropna().reset_index(drop=True)
# Segment and tag every comment (progress bar provided by tqdm.pandas()).
data['cleaned_segmented'] = data['cleaned_old_data'].progress_apply(
    lambda x: preprocess_text(x, annotator, phonlp_model)
)

# Split into training and test sets (80 / 20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_segmented'], data['label'], test_size=0.2, random_state=42
)

# TF-IDF vectors: vocabulary fitted on the training split only.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


## Tạo DataLoaders

In [7]:
# Load the PhoBERT tokenizer and a sequence-classification model with 6 labels,
# then move the model to the GPU when one is available.
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Wrap both splits as datasets; the Series are converted to NumPy arrays so the
# dataset can index them by position.
train_dataset = ShopeeCommentsDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer, max_length=256)
test_dataset = ShopeeCommentsDataset(X_test.to_numpy(), y_test.to_numpy(), tokenizer, max_length=256)

# Batch the datasets; only the training batches are shuffled.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)




pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Thiết Lập Optimizer và Scheduler

In [8]:
# Training length: the schedule spans every optimizer step of every epoch.
epochs = 10
total_steps = epochs * len(train_loader)

# Optimizer over all model parameters.
# NOTE(review): transformers' own AdamW is reported as deprecated upstream in
# favour of torch.optim.AdamW, which has no `correct_bias` switch -- confirm
# before migrating.
optimizer = AdamW(model.parameters(), correct_bias=False, lr=1e-5)

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




## Định Nghĩa Hàm Tính Độ Chính Xác


In [9]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of true class ids (flattened before comparison).
    """
    predicted = np.ravel(np.argmax(preds, axis=1))
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)


## Huấn Luyện và Đánh Giá Trong Mỗi Epoch

In [10]:
import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as ``H:MM:SS``, rounded to whole seconds."""
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
# Early-stopping state: best validation accuracy seen so far, how many epochs
# without improvement are tolerated, and how many have elapsed.
best_accuracy = 0
patience = 2
epochs_no_improve = 0
# Directory that receives the best checkpoint (model + tokenizer).
output_dir = './saved_phobert_model/'

print("Bắt đầu huấn luyện mô hình...")

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(tqdm(train_loader, desc="Iteration")):
        # Periodic textual progress report (every 40 batches, skipping the first).
        if step % 40 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # Evaluate after every epoch.
    # NOTE(review): test_loader is used both for early stopping and for the
    # final report, so the reported scores are optimistic -- consider a
    # separate validation split.
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Totals are accumulated per example rather than per batch so that a
    # shorter final batch does not get the same weight as a full one.
    eval_correct = 0
    eval_loss = 0.0
    nb_eval_examples = 0

    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

        n_examples = labels.size(0)
        # outputs.loss is assumed to be the batch-mean loss; re-weight it by
        # the batch size to obtain a per-example average below.
        eval_loss += outputs.loss.item() * n_examples
        eval_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
        nb_eval_examples += n_examples

    # Guard against an empty evaluation loader.
    avg_val_accuracy = eval_correct / nb_eval_examples if nb_eval_examples else 0.0
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = eval_loss / nb_eval_examples if nb_eval_examples else 0.0
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Early stopping: checkpoint on improvement, otherwise count towards patience.
    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        epochs_no_improve = 0
        # Save the best model so far.
        os.makedirs(output_dir, exist_ok=True)
        print("  -> Cập nhật mô hình tốt nhất và lưu mô hình.")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    else:
        epochs_no_improve += 1
        print(f"  -> Không cải thiện được trong {epochs_no_improve} epoch.")
        if epochs_no_improve >= patience:
            print("Early stopping được kích hoạt.")
            break

print("")
print("Training complete!")


Bắt đầu huấn luyện mô hình...

Training...


Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:02.
  Batch   200  of  1,778.    Elapsed: 0:02:33.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:05.
  Batch   360  of  1,778.    Elapsed: 0:04:36.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:08.
  Batch   520  of  1,778.    Elapsed: 0:06:39.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:11.
  Batch   680  of  1,778.    Elapsed: 0:08:42.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:14.
  Batch   840  of  1,778.    Elapsed: 0:10:45.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:02.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:05.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:43.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:46.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:33.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:42.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:45.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:33.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:36.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:39.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:11.
  Batch   680  of  1,778.    Elapsed: 0:08:42.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:43.
  Batch   800  of  1,778.    Elapsed: 0:10:14.
  Batch   840  of  1,778.    Elapsed: 0:10:45.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:33.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:36.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:08.
  Batch   520  of  1,778.    Elapsed: 0:06:39.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:40.
  Batch   640  of  1,778.    Elapsed: 0:08:11.
  Batch   680  of  1,778.    Elapsed: 0:08:42.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:43.
  Batch   800  of  1,778.    Elapsed: 0:10:14.
  Batch   840  of  1,778.    Elapsed: 0:10:45.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:36.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:08.
  Batch   520  of  1,778.    Elapsed: 0:06:39.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:11.
  Batch   680  of  1,778.    Elapsed: 0:08:42.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:14.
  Batch   840  of  1,778.    Elapsed: 0:10:45.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:08.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:11.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:43.
  Batch   720  of  1,778.    Elapsed: 0:09:14.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:46.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:01.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:04.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:10.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:43.
  Batch   720  of  1,778.    Elapsed: 0:09:13.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:46.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:02.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:05.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:07.
  Batch   440  of  1,778.    Elapsed: 0:05:38.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:11.
  Batch   600  of  1,778.    Elapsed: 0:07:41.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:43.
  Batch   720  of  1,778.    Elapsed: 0:09:14.
  Batch   760  of  1,778.    Elapsed: 0:09:44.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:46.
  Batch   880

Iteration:   0%|          | 0/1778 [00:00<?, ?it/s]

  Batch    40  of  1,778.    Elapsed: 0:00:31.
  Batch    80  of  1,778.    Elapsed: 0:01:02.
  Batch   120  of  1,778.    Elapsed: 0:01:32.
  Batch   160  of  1,778.    Elapsed: 0:02:03.
  Batch   200  of  1,778.    Elapsed: 0:02:34.
  Batch   240  of  1,778.    Elapsed: 0:03:05.
  Batch   280  of  1,778.    Elapsed: 0:03:35.
  Batch   320  of  1,778.    Elapsed: 0:04:06.
  Batch   360  of  1,778.    Elapsed: 0:04:37.
  Batch   400  of  1,778.    Elapsed: 0:05:08.
  Batch   440  of  1,778.    Elapsed: 0:05:39.
  Batch   480  of  1,778.    Elapsed: 0:06:09.
  Batch   520  of  1,778.    Elapsed: 0:06:40.
  Batch   560  of  1,778.    Elapsed: 0:07:11.
  Batch   600  of  1,778.    Elapsed: 0:07:42.
  Batch   640  of  1,778.    Elapsed: 0:08:12.
  Batch   680  of  1,778.    Elapsed: 0:08:43.
  Batch   720  of  1,778.    Elapsed: 0:09:14.
  Batch   760  of  1,778.    Elapsed: 0:09:45.
  Batch   800  of  1,778.    Elapsed: 0:10:15.
  Batch   840  of  1,778.    Elapsed: 0:10:46.
  Batch   880

In [None]:
# Reload the best checkpoint. The training loop saves it to the relative path
# './saved_phobert_model/', so the same path is used here; the previous value
# 'kaggle/working/saved_phobert_model/' (no leading slash) pointed to a
# different, non-existent location relative to the working directory.
output_dir = './saved_phobert_model/'
model = AutoModelForSequenceClassification.from_pretrained(output_dir, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.to(device)

model.eval()

# Predicted and true class ids collected over the whole test set.
y_pred = []
y_true = []

for batch in tqdm(test_loader, desc="Evaluation"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Inference only: no labels are passed and no gradients are tracked.
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    y_pred.extend(predictions.cpu().numpy())
    y_true.extend(labels.cpu().numpy())

print("\nBáo cáo đánh giá:")
print(classification_report(y_true, y_pred, digits=5))


In [None]:
def predict_comment(comment, tokenizer, model, device, max_length=256):
    """Classify a single comment and return the index of the highest-scoring class.

    Args:
        comment: Text to classify.
        tokenizer: Object exposing ``encode_plus`` that returns ``'input_ids'``
            and ``'attention_mask'`` tensors.
        model: Callable model with ``eval()`` whose output has a ``logits`` attribute.
        device: Device the encoded tensors are moved to before the forward pass.
        max_length: Length the comment is padded / truncated to.

    Returns:
        The predicted class id as a Python int.
    """
    model.eval()

    encoded = tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    model_inputs = {
        key: encoded[key].to(device) for key in ('input_ids', 'attention_mask')
    }

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    return torch.argmax(logits, dim=-1).item()

# Example prediction. The classifier was fine-tuned on preprocess_text() output
# (the `cleaned_segmented` column), so the raw sample is passed through the same
# segmentation / tagging step before being classified.
sample_comment = "Sản phẩm rất tốt, giao hàng nhanh và đúng hẹn."
sample_features = preprocess_text(sample_comment, annotator, phonlp_model)
predicted_label = predict_comment(sample_features, tokenizer, model, device)
print(f"Bình luận: '{sample_comment}' được phân loại vào lớp: {predicted_label}")
