In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Read the data from the CSV file
def read_data(file_path):
    """Load reviews and binary sentiment labels from a CSV file.

    The CSV must contain a 'Review' column (the text) and a 'Sentiment'
    column. A row is labelled 1 when its sentiment equals 'Tích cực' and 0
    otherwise.

    Rows with a missing review or a missing sentiment are dropped: a missing
    review would be read as float NaN instead of text, and a missing
    sentiment would otherwise be counted as a negative example.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the CSV file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: review strings
        and integer labels (1 or 0).

    Raises:
        KeyError: If the 'Review' or 'Sentiment' column is absent.
    """
    import unicodedata  # local import: only needed for label normalisation

    reviews = pd.read_csv(file_path).dropna(subset=['Review', 'Sentiment'])
    texts = reviews['Review'].astype(str).tolist()

    # Compare labels in NFC form with surrounding whitespace removed, so that
    # ' Tích cực ' or a decomposed-diacritic spelling still counts as positive.
    positive = unicodedata.normalize('NFC', 'Tích cực')
    sentiments = reviews['Sentiment'].astype(str).str.strip().str.normalize('NFC')
    labels = sentiments.eq(positive).astype(int).tolist()
    return texts, labels

# 2. Build the Dataset
class SentimentDataset(Dataset):
    """Dataset that tokenizes one review per item for sequence classification.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of integer class ids, aligned index-by-index with
            ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early: a length mismatch would otherwise surface as an
        # IndexError mid-epoch or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx``.

        Returns:
            A dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            scalar ``torch.long`` 'labels' tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly; `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Load the review CSV into a DataFrame for a quick visual inspection.
df_tiengviet = pd.read_csv(
    '/content/drive/MyDrive/Gemini_Thi_AI/Demo_TiengViet/tonghop_tiengviet_review.csv'
)
# A bare expression on the last line of a cell is rendered as the cell output.
df_tiengviet

In [None]:
# 3. Train the model

def train_model(model, train_dataloader, val_dataloader, device, epochs=3):
    """Fine-tune ``model`` and evaluate it on ``val_dataloader`` after each epoch.

    Each batch must be a dict with 'input_ids', 'attention_mask' and 'labels'
    tensors. The model is called as
    ``model(input_ids, attention_mask=..., labels=...)`` and must return an
    object exposing ``.loss`` (when labels are given) and ``.logits``.

    Args:
        model: The classifier to train; it is updated in place.
        train_dataloader: Iterable of training batches (must support ``len``).
        val_dataloader: Iterable of validation batches.
        device: Device the batch tensors are moved to. The model is expected
            to already be on this device.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        A tuple ``(train_accuracies, val_accuracies, val_labels, val_preds)``.
        The first two hold one accuracy per epoch. The last two hold the
        validation labels and predictions of the final epoch, and are empty
        lists when ``epochs`` is 0.
    """
    # Use the PyTorch optimizer: the AdamW class re-exported by `transformers`
    # is deprecated. weight_decay=0.0 mirrors that deprecated class's default
    # (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

    train_accuracies = []
    val_accuracies = []
    # Bound before the loop so the return below is valid even if epochs == 0.
    val_preds, val_labels = [], []

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_preds, train_labels_list = [], []
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            train_preds.extend(preds.cpu().numpy())
            train_labels_list.extend(labels.cpu().numpy())

            loss.backward()
            optimizer.step()

        train_accuracy = accuracy_score(train_labels_list, train_preds)
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_dataloader)}, Train Accuracy: {train_accuracy}")

        # Evaluate on the validation set
        model.eval()
        # Reset per epoch so only the latest epoch's results are returned.
        val_preds, val_labels = [], []
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)

                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_preds)
        val_accuracies.append(val_accuracy)
        print(f"Validation Accuracy: {val_accuracy}")

    return train_accuracies, val_accuracies, val_labels, val_preds

def plot_accuracy(train_accuracies, val_accuracies):
    """Save a line chart of per-epoch accuracy to 'accuracy_plot.png'.

    Args:
        train_accuracies: Training accuracy values, one per epoch.
        val_accuracies: Validation accuracy values, one per epoch.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    curves = (
        ('Train Accuracy', train_accuracies),
        ('Validation Accuracy', val_accuracies),
    )
    for legend_name, history in curves:
        # Epochs are numbered from 1 on the x-axis.
        epoch_numbers = range(1, len(history) + 1)
        ax.plot(epoch_numbers, history, label=legend_name)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Accuracy')
    ax.set_title('Train vs Validation Accuracy')
    ax.legend()
    fig.savefig('accuracy_plot.png')
    # Close the figure so it is released and not rendered inline as well.
    plt.close(fig)

def plot_confusion_matrix(true_labels, pred_labels):
    """Save an annotated confusion-matrix heatmap to 'confusion_matrix.png'.

    Args:
        true_labels: Ground-truth class ids.
        pred_labels: Predicted class ids, aligned with ``true_labels``.
    """
    counts = confusion_matrix(true_labels, pred_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.savefig('confusion_matrix.png')
    plt.close(fig)

def plot_roc_curve(true_labels, pred_probs):
    """Save the ROC curve, with its AUC in the legend, to 'roc_curve.png'.

    Args:
        true_labels: Ground-truth binary labels.
        pred_probs: Scores for the positive class, aligned with
            ``true_labels``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(true_labels, pred_probs)
    area = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {area:.2f})',
    )
    # Dashed diagonal for visual reference.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    fig.savefig('roc_curve.png')
    plt.close(fig)

def main():
    """Run the full pipeline: load data, fine-tune PhoBERT, plot, and save.

    Reads the review CSV, splits it 80/20, fine-tunes
    'vinai/phobert-base' for 3 epochs, writes the accuracy,
    confusion-matrix and ROC plots to the working directory, and saves the
    fine-tuned model together with its tokenizer.
    """
    # Path to the CSV file and the output directory
    file_path = '/content/drive/MyDrive/Gemini_Thi_AI/Demo_TiengViet/tonghop_tiengviet_review.csv'
    model_dir = '/content/drive/MyDrive/Gemini_Thi_AI/Demo_TiengViet/model_tiengviet'

    # Seed torch as well as the split (random_state=42 below), so that
    # torch-side randomness such as batch shuffling is repeatable across runs.
    torch.manual_seed(42)

    # Read the data
    texts, labels = read_data(file_path)

    # Split the data into train and test sets
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Prepare the model and tokenizer
    # NOTE(review): the PhoBERT model card asks for word-segmented input; the
    # reviews are tokenized here as-is. Confirm the CSV is pre-segmented.
    model_name = 'vinai/phobert-base'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Prepare the datasets and dataloaders
    max_length = 128
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length)
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer, max_length)

    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    # Not shuffled: the ROC step below relies on a stable batch order.
    test_dataloader = DataLoader(test_dataset, batch_size=16)

    # Train the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # The held-out 20% split doubles as the per-epoch validation set.
    train_accuracies, val_accuracies, val_labels, val_preds = train_model(
        model, train_dataloader, test_dataloader, device, epochs=3
    )

    # Draw the plots
    plot_accuracy(train_accuracies, val_accuracies)
    plot_confusion_matrix(val_labels, val_preds)

    # Compute predicted probabilities for the ROC curve
    model.eval()
    val_probs = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)[:, 1]  # probability of the positive class
            val_probs.extend(probs.cpu().numpy())

    # val_labels come from the same unshuffled dataloader, so they line up
    # element-by-element with val_probs.
    plot_roc_curve(val_labels, val_probs)

    # Save the model, and the tokenizer next to it, so the directory can be
    # reloaded on its own for inference.
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()