In [7]:
# ============================
# CÀI THƯ VIỆN
# ============================
!pip install torch transformers underthesea tqdm pandas -q

# ============================
# IMPORT THƯ VIỆN
# ============================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from underthesea import word_tokenize

# ============================
# CẤU HÌNH CHUNG
# ============================
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Target columns read from the training CSV; the model head emits one output
# per entry and the prediction file uses them as column names, in this order.
LABELS = [
    "giai_tri",
    "luu_tru",
    "nha_hang",
    "an_uong",
    "van_chuyen",
    "mua_sam",
]

# Training hyper-parameters.
BATCH_SIZE = 16
EPOCHS = 5  # can be increased to train for longer
LR = 2e-5
MAX_LEN = 256  # default tokenized sequence length used by TextDataset

# File the trained weights are saved to and re-loaded from.
MODEL_PATH = "phobert_sentiment.pt"

# ============================
# TIỀN XỬ LÝ DỮ LIỆU
# ============================
def preprocess_text(text):
    """Normalize a raw review and word-segment it.

    The text is converted to ``str``, lower-cased, stripped of surrounding
    whitespace and then passed through underthesea's ``word_tokenize`` with
    ``format="text"`` (presumably so the input matches the word-segmented
    text PhoBERT expects -- confirm against the model card).

    Missing values (``None`` or a float NaN) and empty / whitespace-only
    strings yield ``""`` without calling the segmenter, so they are never
    turned into the literal tokens ``"none"`` / ``"nan"``.

    Args:
        text: The raw review; any object accepted by ``str()``.

    Returns:
        The normalized, segmented text as a single string.
    """
    # `text != text` is only true for NaN; pandas represents missing cells
    # as float NaN, which str() would otherwise render as "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    if not text:
        # Nothing to segment.
        return ""

    return word_tokenize(text, format="text")

# ============================
# DATASET
# ============================
class TextDataset(Dataset):
    """Dataset of preprocessed reviews that tokenizes each item on access.

    Every text is run through ``preprocess_text`` once, at construction time.
    Tokenization happens lazily in ``__getitem__``.

    Args:
        texts: Iterable of raw review texts.
        labels: Optional indexable collection of per-sample targets, aligned
            with ``texts``. When ``None`` the items carry no ``"labels"`` key.
        tokenizer: Callable tokenizer invoked with ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords.
        max_len: Length every sample is padded / truncated to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels
        # Preprocess eagerly so the cost is paid once, not on every epoch.
        self.texts = list(map(preprocess_text, texts))

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        The dict holds whatever the tokenizer returns (each tensor with its
        leading dimension squeezed away), plus a float ``"labels"`` tensor
        when labels were supplied.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        sample = {}
        for key, tensor in encoded.items():
            # Drop dimension 0 (presumably the size-1 batch axis produced by
            # return_tensors="pt") so the DataLoader can re-batch the samples.
            sample[key] = tensor.squeeze(0)

        if self.labels is None:
            return sample

        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# ============================
# MÔ HÌNH PhoBERT + NN
# ============================
class PhoBertNN(nn.Module):
    """Pretrained encoder followed by a small feed-forward head.

    The head maps the encoder's first-position hidden state to
    ``len(LABELS)`` outputs.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        hidden_size: Width of the encoder's hidden state, i.e. the input
            size of the head's first linear layer.
    """

    def __init__(self, model_name="vinai/phobert-base", hidden_size=768):
        super().__init__()
        self.phobert = AutoModel.from_pretrained(model_name)

        # The order of these layers fixes the state_dict keys
        # (nn_head.1, nn_head.4, nn_head.6), so it must not change.
        head_layers = [
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, len(LABELS)),
        ]
        self.nn_head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw outputs for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Attention-mask tensor forwarded to the encoder.

        Returns:
            The head's output for the hidden state at sequence position 0.
        """
        encoded = self.phobert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Position 0 of the last hidden state serves as the sequence summary.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.nn_head(first_token_state)

# ============================
# HÀM TRAIN / PREDICT
# ============================
def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        The mean of the per-batch loss values over the epoch.
    """
    model.train()
    running_loss = 0

    progress = tqdm(loader, desc="Training")
    for batch in progress:
        optimizer.zero_grad()

        # Move the batch onto the same device as the model.
        ids = batch["input_ids"].to(DEVICE)
        mask = batch["attention_mask"].to(DEVICE)
        targets = batch["labels"].to(DEVICE)

        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)

def predict(model, loader):
    """Run ``model`` over ``loader`` and return integer predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``"input_ids"`` and
            ``"attention_mask"`` entries.

    Returns:
        A NumPy integer array of the model outputs, concatenated along
        dimension 0, clamped to the range [0, 5] and rounded.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            scores = model(
                batch["input_ids"].to(DEVICE),
                batch["attention_mask"].to(DEVICE),
            )
            # Collect on the CPU so the results can be converted to NumPy.
            batch_outputs.append(scores.cpu())

    stacked = torch.cat(batch_outputs, dim=0)
    bounded = stacked.clamp(0, 5)
    return bounded.round().int().numpy()

# ============================
# CHƯƠNG TRÌNH CHÍNH
# ============================
def main():
    """Run the pipeline: load data, obtain a trained model, write predictions.

    If ``MODEL_PATH`` exists, its weights are loaded and training is skipped.
    In that case the training CSV is not read and the training set is not
    preprocessed, since neither is needed for inference. Otherwise the model
    is trained for ``EPOCHS`` epochs on ``train_problem.csv`` and the weights
    are saved to ``MODEL_PATH``.

    Predictions for ``gt_reviews.csv`` are written to ``predictions.csv``
    with a 1-based ``stt`` column followed by one column per entry of
    ``LABELS``.
    """
    # Decide up front whether training is required so that training-only
    # work (CSV parsing, word segmentation, optimizer setup) can be skipped.
    needs_training = not os.path.exists(MODEL_PATH)

    print("=== NẠP DỮ LIỆU ===")
    if needs_training:
        train_df = pd.read_csv("train_problem.csv")
        train_texts = train_df["Review"].fillna("").tolist()
        y_train = train_df[LABELS].astype(float).values
    test_df = pd.read_csv("gt_reviews.csv")
    test_texts = test_df["review"].fillna("").tolist()

    print("=== TẢI TOKENIZER PhoBERT ===")
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

    if needs_training:
        train_ds = TextDataset(train_texts, y_train, tokenizer)
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    test_ds = TextDataset(test_texts, None, tokenizer)
    # Keep test order fixed so row i of the output matches review i.
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    print("=== KHỞI TẠO MÔ HÌNH PhoBERT + NN ===")
    model = PhoBertNN().to(DEVICE)

    # ============================
    # LOAD / TRAIN MODEL
    # ============================
    if not needs_training:
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
        print("✅ Đã load trọng số từ phobert_sentiment.pt — bỏ qua bước train.")
    else:
        print("⚠️ Không tìm thấy model cũ — bắt đầu train mới")
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
        criterion = nn.MSELoss()
        for epoch in range(EPOCHS):
            loss = train_epoch(model, train_loader, optimizer, criterion)
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss:.4f}")
        torch.save(model.state_dict(), MODEL_PATH)
        print("✅ Đã lưu model vào phobert_sentiment.pt")

    # ============================
    # PREDICT ON THE TEST SET
    # ============================
    print("=== DỰ ĐOÁN TRÊN TEST ===")
    preds = predict(model, test_loader)

    result = pd.DataFrame(preds, columns=LABELS)
    result.insert(0, "stt", range(1, len(result) + 1))
    result.to_csv("predictions.csv", index=False)

    print("\n✅ Đã tạo file predictions.csv:")
    print(result.head())

# ============================
# CHẠY
# ============================
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


=== NẠP DỮ LIỆU ===
=== TẢI TOKENIZER PhoBERT ===
=== KHỞI TẠO MÔ HÌNH PhoBERT + NN ===
✅ Đã load trọng số từ phobert_sentiment.pt — bỏ qua bước train.
=== DỰ ĐOÁN TRÊN TEST ===


Predicting: 100%|██████████| 63/63 [00:12<00:00,  5.01it/s]


✅ Đã tạo file predictions.csv:
   stt  giai_tri  luu_tru  nha_hang  an_uong  van_chuyen  mua_sam
0    1         4        0         0        0           0        0
1    2         0        1         0        0           0        0
2    3         0        0         1        5           0        0
3    4         0        5         0        1           0        0
4    5         0        1         0        1           0        0



