# Vietnamese Hate Speech Detection - Training Notebook

This notebook demonstrates how to train a PhoBERT model for hate speech detection on Vietnamese datasets.

**Supported Datasets:**
- ViHSD (Vietnamese Hate Speech Detection)
- ViCTSD (Vietnamese Constructive and Toxic Speech Detection)
- ViHOS (Vietnamese Hate and Offensive Spans)

## 1. Setup and Configuration

In [None]:
# Install required packages (only needed on a fresh environment).
# Uncomment the line below to install; `%pip` installs into the environment
# of the running kernel, which a plain `!pip` does not guarantee.
# %pip install -r ../requirements.txt

In [None]:
import sys
sys.path.append('../src')

import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables (such as HF_TOKEN) before reading them below.
load_dotenv()

# Log in to Hugging Face only when a token is actually configured.
# Falling back to a placeholder string would hand `login` an invalid token
# and abort the notebook, so an unset token simply skips authentication.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login.")

## 2. Load Dataset

Choose one of the three datasets (ViHSD, ViCTSD, or ViHOS) by setting `DATASET_NAME` in the cell below.

In [None]:
from data_loader import load_dataset_by_name

# Dataset selector; the supported values are "ViHSD", "ViCTSD", and "ViHOS".
DATASET_NAME = "ViHSD"

print(f"Loading {DATASET_NAME} dataset...")
train_df, val_df, test_df, metadata = load_dataset_by_name(DATASET_NAME)

# Summarise the metadata returned by the loader, one field per line.
print(
    f"\nDataset: {metadata['name']}",
    f"Text column: {metadata['text_col']}",
    f"Label column: {metadata['label_col']}",
    f"Number of labels: {metadata['num_labels']}",
    sep="\n",
)

# Report the number of rows in each split.
print(
    "\nSplit sizes:",
    f"  Train: {len(train_df)}",
    f"  Val: {len(val_df)}",
    f"  Test: {len(test_df)}",
    sep="\n",
)

## 3. Exploratory Data Analysis

In [None]:
import pandas as pd
from collections import Counter

# Display sample data
print("Sample data:")
display(train_df.head())

# The statistics below are computed over all three splits combined.
eda_splits = (train_df, val_df, test_df)
label_col = metadata['label_col']
text_col = metadata['text_col']

# Label distribution
all_labels = [label for split in eda_splits for label in split[label_col]]

label_counts = Counter(all_labels)
print(f"\nLabel distribution:")
print(dict(sorted(label_counts.items())))

# Text length statistics, measured in characters of str(text) (not tokens).
all_lengths = [len(str(text)) for split in eda_splits for text in split[text_col]]

# Sort once and reuse the result for every percentile lookup.
sorted_lengths = sorted(all_lengths)

print(f"\nText length statistics:")
if sorted_lengths:
    # Nearest-rank lookup: int(n * fraction) is always < n for fraction < 1,
    # so the index is in range whenever the list is non-empty.
    for percent, fraction in ((30, 0.3), (60, 0.6), (95, 0.95)):
        rank = int(len(sorted_lengths) * fraction)
        print(f"  {percent}th percentile: {sorted_lengths[rank]}")
else:
    # Indexing an empty list would raise IndexError; report it instead.
    print("  No texts available to compute percentiles.")

## 4. Configure Training

In [None]:
from config import TrainConfig

# Hyperparameters for this run, collected in one mapping so they are easy to
# scan and tweak before the config object is built.
train_settings = dict(
    dataset_name=DATASET_NAME,
    model_name="vinai/phobert-base",
    max_length=256,
    batch_size=16,
    epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    patience=3,
    seed=42,
)
config = TrainConfig(**train_settings)

# Echo the resolved configuration exactly as the config object reports it.
print("Training Configuration:")
for setting_name, setting_value in config.to_dict().items():
    print(f"  {setting_name}: {setting_value}")

## 5. Build Model and Datasets

In [None]:
from model import build_model
from data_loader import build_torch_dataset
from torch.utils.data import DataLoader
from transformers import set_seed

# Apply the configured seed before the model is built and before the training
# DataLoader starts shuffling, so repeated runs draw the same random numbers.
# NOTE(review): assumes TrainConfig exposes the `seed` it was constructed with.
set_seed(config.seed)

# Build model
model, tokenizer = build_model(
    config.model_name,
    metadata['num_labels'],
    config.device
)

print(f"Model loaded on {config.device}")


def _make_dataset(frame):
    """Wrap one split's DataFrame as a torch dataset using the shared tokenizer settings."""
    return build_torch_dataset(
        frame, metadata['text_col'], metadata['label_col'],
        tokenizer, config.max_length
    )


# Build datasets
train_dataset = _make_dataset(train_df)
val_dataset = _make_dataset(val_df)
test_dataset = _make_dataset(test_df)

# Build dataloaders; only the training loader shuffles, evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

print(f"Datasets ready: train={len(train_dataset)}, val={len(val_dataset)}, test={len(test_dataset)}")

## 6. Setup Optimizer and Scheduler

In [None]:
import torch
from transformers import get_cosine_schedule_with_warmup

# Size the schedule from the number of training batches across all epochs.
# NOTE(review): assumes train_epoch advances the scheduler once per batch - confirm in utils.
steps_per_epoch = len(train_loader)
num_training_steps = steps_per_epoch * config.epochs
num_warmup_steps = int(num_training_steps * config.warmup_ratio)

# AdamW over every model parameter, with the configured learning rate and weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=config.weight_decay,
)

# Cosine learning-rate schedule preceded by the warmup phase computed above.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

print(f"Total training steps: {num_training_steps}")
print(f"Warmup steps: {num_warmup_steps}")

## 7. Training Loop

In [None]:
import time
from sklearn.metrics import accuracy_score, f1_score, classification_report
from utils import train_epoch, evaluate

# Start below any attainable F1 so the first epoch always writes a checkpoint,
# even when its macro F1 is exactly 0.0. The evaluation cell reloads the model
# from config.output_dir and would fail if nothing had been saved.
best_val_f1 = float("-inf")
patience_counter = 0

# Per-epoch metrics; each list gains one entry per completed epoch.
history = {
    "train_loss": [],
    "val_loss": [],
    "val_acc": [],
    "val_f1": [],
    "epoch_seconds": [],
}

print(f"Starting training on {config.device}...\n")
training_start = time.time()

for epoch in range(1, config.epochs + 1):
    epoch_start = time.time()

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, config.device)

    # Validate
    val_preds, val_labels, val_loss = evaluate(model, val_loader, config.device)
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average="macro")

    epoch_time = time.time() - epoch_start

    # Update history
    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)
    history["val_f1"].append(val_f1)
    history["epoch_seconds"].append(epoch_time)

    # Print summary
    print(f"Epoch {epoch}/{config.epochs} | Time: {epoch_time:.2f}s")
    print(f"  Train loss: {train_loss:.4f}")
    print(f"  Val loss: {val_loss:.4f} | Val acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

    # Early stopping: checkpoint on every strict improvement of macro F1,
    # otherwise count the epoch against the patience budget.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        print(f"  ✓ New best F1: {best_val_f1:.4f}. Saving model...")
        model.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)
    else:
        patience_counter += 1
        print(f"  No improvement ({patience_counter}/{config.patience})")
        if patience_counter >= config.patience:
            print("  Early stopping triggered.")
            break
    print()

training_time = time.time() - training_start
print(f"Training finished in {training_time/60:.2f} minutes.")
print(f"Best validation F1: {best_val_f1:.4f}")

## 8. Test Set Evaluation

In [None]:
from model import load_trained_model

# Reload the checkpoint saved under config.output_dir; the second value
# returned by the loader is not needed in this cell.
best_model, _ = load_trained_model(str(config.output_dir), config.device)

# Score the reloaded model on the held-out test split.
test_preds, test_labels, test_loss = evaluate(best_model, test_loader, config.device)
test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average="macro")

# Headline metrics first, then the per-class breakdown.
print(
    "Test Set Results:",
    f"  Loss: {test_loss:.4f}",
    f"  Accuracy: {test_acc:.4f}",
    f"  Macro F1: {test_f1:.4f}",
    "\nClassification Report:",
    classification_report(test_labels, test_preds, digits=4),
    sep="\n",
)

## 9. Visualize Training History

In [None]:
import matplotlib.pyplot as plt

# The history lists are 0-indexed, but epochs are numbered from 1 in the
# training log and in the saved epoch CSV. Plot against 1-based epoch numbers
# so the x-axis labelled "Epoch" matches those.
epochs_run = range(1, len(history['train_loss']) + 1)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
loss_ax, metric_ax = axes

# Loss plot
loss_ax.plot(epochs_run, history['train_loss'], label='Train Loss')
loss_ax.plot(epochs_run, history['val_loss'], label='Val Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()
loss_ax.grid(True)

# Metrics plot
metric_ax.plot(epochs_run, history['val_acc'], label='Val Accuracy')
metric_ax.plot(epochs_run, history['val_f1'], label='Val F1')
metric_ax.set_xlabel('Epoch')
metric_ax.set_ylabel('Score')
metric_ax.set_title('Validation Metrics')
metric_ax.legend()
metric_ax.grid(True)

plt.tight_layout()
plt.show()

## 10. Save Results

In [None]:
import pandas as pd
from datetime import datetime, timezone
from pathlib import Path

# Make sure the destination exists before writing; Path() is a no-op when
# config.output_dir is already a Path.
output_dir = Path(config.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Save epoch metrics: one row per completed epoch, numbered from 1.
epoch_df = pd.DataFrame(history)
epoch_df['epoch'] = range(1, len(epoch_df) + 1)
epoch_csv = output_dir / "epoch_metrics.csv"
epoch_df.to_csv(epoch_csv, index=False)
print(f"Saved epoch metrics to {epoch_csv}")

# Save summary
summary = {
    'dataset': DATASET_NAME,
    'model': config.model_name,
    # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated and yields
    # a naive value whose ISO string carries no offset.
    'timestamp': datetime.now(timezone.utc).isoformat(),
    'best_val_f1': best_val_f1,
    'test_loss': test_loss,
    'test_acc': test_acc,
    'test_f1': test_f1,
    'training_minutes': training_time / 60,
}

summary_df = pd.DataFrame([summary])
summary_csv = output_dir / "run_summary.csv"
summary_df.to_csv(summary_csv, index=False)
print(f"Saved run summary to {summary_csv}")

display(summary_df)