In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled questions from CSV. The two columns read below are
# "Question" (the text) and "Label" (the question type).
data = pd.read_csv("NLP DATASET.csv")
question_column, label_column = data["Question"], data["Label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    question_column,
    label_column,
    random_state=42,
    test_size=0.2,
)


In [10]:
from transformers import AutoTokenizer

# Load the (slow, non-"fast") tokenizer that ships with the PhoBERT checkpoint.
phobert_checkpoint = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(phobert_checkpoint, use_fast=False)

# Tokenize dữ liệu
def encode_data(data, tokenizer, max_length=50):
    """Tokenize one text or a collection of texts into a padded batch.

    Args:
        data: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...).
        tokenizer: Callable tokenizer; it is invoked with the list of texts
            plus ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` keyword arguments.
        max_length: Upper bound on the sequence length; longer inputs are
            truncated.

    Returns:
        Whatever ``tokenizer`` returns for the batch (PyTorch tensors are
        requested via ``return_tensors="pt"``).
    """
    # A bare string is itself iterable: list("abc") would yield
    # ['a', 'b', 'c'] and every character would be tokenized as its own
    # sentence. Wrap it so a single text becomes a batch of one.
    texts = [data] if isinstance(data, str) else list(data)

    # NOTE(review): PhoBERT is documented as expecting word-segmented
    # Vietnamese input; no segmentation happens here -- confirm the data is
    # already segmented or that raw text is acceptable for this task.
    return tokenizer(
        texts,
        padding=True,        # pad every row to the longest sequence in this batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Tokenize both splits with the same tokenizer and default max_length
# (train first, then test).
train_encodings, test_encodings = (
    encode_data(split_texts, tokenizer) for split_texts in (X_train, X_test)
)


In [11]:
from sklearn.preprocessing import LabelEncoder

import torch

# Map the string labels to integer class ids.
# The label vocabulary is fitted on the labels of BOTH splits: the split is
# not stratified, so a rare class could land only in the test set, and
# `transform` raises ValueError for labels it has not seen during `fit`.
# When every class already occurs in the training split the resulting
# mapping is the same as fitting on the training labels alone.
label_encoder = LabelEncoder()
label_encoder.fit(list(y_train) + list(y_test))
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert the class ids to tensors. The dtype is made explicit because
# class-index targets for the cross-entropy loss must be int64, rather than
# relying on whichever integer dtype the encoder happens to return.
train_labels = torch.tensor(y_train_encoded, dtype=torch.long)
test_labels = torch.tensor(y_test_encoded, dtype=torch.long)


In [12]:
from torch.utils.data import Dataset

class QuestionDataset(Dataset):
    """Map-style dataset pairing tokenized questions with their labels.

    Args:
        encodings: Mapping that provides indexable ``'input_ids'`` and
            ``'attention_mask'`` entries, one row per example.
        labels: Indexable collection of labels, one per example.

    Raises:
        ValueError: If the number of encoded rows differs from the number
            of labels.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs; otherwise the mismatch only shows
        # up later as an IndexError mid-epoch or as silently ignored rows.
        n_rows = len(encodings['input_ids'])
        if n_rows != len(labels):
            raise ValueError(
                f"encodings hold {n_rows} rows but {len(labels)} labels were given"
            )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and the label for example ``idx``."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Wrap each split's encodings and labels in a Dataset (train first, then test).
train_dataset, test_dataset = (
    QuestionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)


In [13]:
from transformers import AutoModelForSequenceClassification

import torch

# The classification head is not part of the checkpoint and is initialised
# randomly (see the warning printed below this cell). Seed torch's global RNG
# first so that this initialisation -- and, when the cells are run in order,
# the later shuffling/dropout that draw from the same RNG -- is repeatable,
# in line with the seeded train/test split.
torch.manual_seed(42)

# Load PhoBERT with one output unit per label class.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Mini-batch loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer.
# `transformers.AdamW` is deprecated upstream in favour of the PyTorch
# implementation, so `torch.optim.AdamW` is used instead. `eps` and
# `weight_decay` are pinned to the values the transformers class used by
# default (1e-6 and 0.0) so the update rule stays the same as before.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

model.train()  # enable training-mode behaviour (e.g. dropout)
epochs = 3

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean loss per batch for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")




Epoch 1, Loss: 1.3389185721675554
Epoch 2, Loss: 0.2869887741903464
Epoch 3, Loss: 0.12609246242791414


In [15]:
from sklearn.metrics import classification_report

# Evaluate on the held-out split: collect predicted and true class ids.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        # The labels are not fed to the model here, so they are not moved to
        # the device first.
        true_labels.extend(batch['labels'].cpu().numpy())

# Report results.
# `labels` is passed explicitly so that the rows always line up with
# `target_names`, even if some class is absent from both the test labels and
# the predictions (otherwise the number of names and classes would disagree).
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

         How       0.94      0.85      0.89        40
    Quantity       0.96      1.00      0.98        27
        What       0.89      0.89      0.89        27
        When       0.97      0.94      0.96        34
       Where       0.91      1.00      0.95        31
         Who       1.00      1.00      1.00        33
         Why       1.00      1.00      1.00        25
      Yes/No       0.92      0.96      0.94        23

    accuracy                           0.95       240
   macro avg       0.95      0.95      0.95       240
weighted avg       0.95      0.95      0.95       240



In [17]:
def predict_question(question, model, tokenizer, label_encoder):
    """Predict the label of a single question.

    Args:
        question: The question text.
        model: Classifier called with ``input_ids`` and ``attention_mask``;
            its output must expose ``.logits``. It is switched to eval mode.
        tokenizer: Callable that turns the text into a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to label names.

    Returns:
        The first element of ``label_encoder.inverse_transform`` applied to
        the arg-max class id.
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # notebook-global `device`, so the function also works when the model was
    # moved elsewhere or the global is not defined. Falls back to the CPU for
    # a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    encoding = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=50)
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).cpu().numpy()

    return label_encoder.inverse_transform(prediction)[0]

# Example prediction on a previously unseen question.
new_question = "Tôi muốn biết tổng thống Hàn Quốc bây giờ là ai?"
predicted_label = predict_question(
    question=new_question,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)
print(
    f"Câu hỏi: {new_question}",
    f"Loại câu hỏi dự đoán: {predicted_label}",
    sep="\n",
)


Câu hỏi: Tôi muốn biết tổng thống Hàn Quốc bây giờ là ai?
Loại câu hỏi dự đoán: Who
