In [3]:
from PhoBERT import PhoBERTTextPreprocessor, VietnameseNewsDataset, PhoBERTClassifier, PhoBERTTrainer
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import DataCollatorWithPadding, get_scheduler

In [None]:
# Check GPU: abort immediately when CUDA is unavailable, because the
# following cells query and use CUDA device 0 directly.
cuda_ok = torch.cuda.is_available()
if not cuda_ok:
    raise RuntimeError("Notebook n√†y y√™u c·∫ßu GPU ƒë·ªÉ ch·∫°y. Vui l√≤ng ki·ªÉm tra l·∫°i c·∫•u h√¨nh CUDA.")

# Environment report: framework version, device name, and total device
# memory converted from bytes by dividing by 1024**3.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
total_mem_bytes = torch.cuda.get_device_properties(0).total_memory
print(f"GPU Memory: {total_mem_bytes / 1024**3:.2f} GB")

/bin/bash: line 1: nvidia-smi: command not found


PyTorch version: 2.7.1+cu126
CUDA available: False


In [None]:
# A single preprocessor instance is shared by the train and validation datasets.
preprocessor_train = PhoBERTTextPreprocessor(phobert_model='vinai/phobert-base', max_length=32)

# Keyword arguments that are identical for both dataset splits.
shared_dataset_args = {
    'preprocessor': preprocessor_train,
    'max_header_length': 20,
}

# Training split (no label_encoder is passed in for this one).
train_dataset = VietnameseNewsDataset(
    csv_file='data/preprocess/UIT-ViON_train_preprocessed.csv',
    **shared_dataset_args,
)

# Validation split: receives the label_encoder taken from the training dataset.
val_dataset = VietnameseNewsDataset(
    csv_file='data/preprocess/UIT-ViON_dev_preprocessed.csv',
    label_encoder=train_dataset.label_encoder,
    **shared_dataset_args,
)

# Quick size / label summary of what was loaded.
class_labels = train_dataset.label_encoder.classes_
print(f"Training samples: {len(train_dataset):,}")
print(f"Validation samples: {len(val_dataset):,}")
print(f"Number of classes: {len(class_labels)}")
print(f"Classes: {class_labels}")

>> Loading PhoBERT tokenizer from local: models/phobert-base


Training samples: 206,411
Validation samples: 25,967
Number of classes: 13
Classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12]


In [None]:
# Builds the training configuration constants and the train/validation
# DataLoaders, then prints a summary of the resulting setup.
device = 'cuda'

# GPU Configuration
gpu_name = torch.cuda.get_device_name(0)
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)

print(f"‚úÖ Training on GPU: {gpu_name}")
print(f"   GPU Memory: {gpu_memory_gb:.1f} GB\n")

NUM_WORKERS = 2
# Tunable parameters (bracketed values are the candidates listed by the author)
BATCH_SIZE = 16  # [16, 24, 32]
DROPOUT_RATE = 0.4  # [0.1, 0.4, 0.5]
# ========================================
# HYPERPARAMETERS
# ========================================
LEARNING_RATE = 2e-5  # [1e-5, 2e-5, 3e-5, 5e-5]
WEIGHT_DECAY = 0.01  # (0.0 -> 0.1)
NUM_EPOCHS = 4

ACCUMULATION_STEPS = 1

USE_PIN_MEMORY = True
PERSISTENT_WORKERS = NUM_WORKERS > 0
PREFETCH_FACTOR = 2

# Create data collator (pads each batch using the preprocessor's tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=preprocessor_train.tokenizer)

# DataLoader configuration shared by the train and validation loaders
dataloader_kwargs = {
    'collate_fn': data_collator,
    'pin_memory': USE_PIN_MEMORY,
    'num_workers': NUM_WORKERS,
}
# `prefetch_factor` and `persistent_workers` only apply to worker processes.
# DataLoader raises ValueError if `prefetch_factor` is given with
# num_workers == 0, so these options are added only when workers are used.
if NUM_WORKERS > 0:
    dataloader_kwargs['persistent_workers'] = PERSISTENT_WORKERS
    dataloader_kwargs['prefetch_factor'] = PREFETCH_FACTOR

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    **dataloader_kwargs
)

# Validation uses a batch twice as large and keeps the dataset order.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE * 2,
    shuffle=False,
    **dataloader_kwargs
)

# Print configuration summary
print('=' * 70)
print("DATALOADER CONFIGURATION")
print('=' * 70)
print(f"  Device:              {device.upper()}")
print(f"  Batch size:          {BATCH_SIZE}")
print(f"  Accumulation steps:  {ACCUMULATION_STEPS}")
print(f"  Effective batch:     {BATCH_SIZE * ACCUMULATION_STEPS}")
print(f"  Num workers:         {NUM_WORKERS}")
print(f"  Training batches:    {len(train_loader):,}")
print(f"  Validation batches:  {len(val_loader):,}")
print('=' * 70)
print("\nüöÄ GPU Training Optimizations Enabled:")
print("  ‚Ä¢ Mixed precision training (FP16)")
print("  ‚Ä¢ Pin memory for faster data transfer")
print("  ‚Ä¢ Persistent workers to reduce overhead")


Configuration Summary:
  Device:              CPU
  Batch size:          16
  Effective batch:     32
  Num workers:         2
  Pin memory:          False
  Training batches:    12,901
  Validation batches:  812

CPU Training Tips:
  - Training will be 10-20x slower than GPU
  - Estimated: ~15-20 minutes/epoch with 206K samples
  - Consider reducing NUM_EPOCHS to 2


In [None]:
print("\n=== Initialize Model & Trainer ===")

# ========================================
# MODEL CONFIGURATION
# ========================================
# Passed to the classifier as `freeze_phobert`; also selects which summary
# is printed below.
FREEZE_PHOBERT = False

# Initialize model: one output class per entry of the training label encoder.
num_classes = len(train_dataset.label_encoder.classes_)
model_train = PhoBERTClassifier(
    num_classes=num_classes,
    phobert_model='vinai/phobert-base',
    dropout_rate=DROPOUT_RATE,
    hidden_size=768,
    freeze_phobert=FREEZE_PHOBERT
)

# Parameter counts: those with requires_grad=True versus all parameters.
trainable_params = sum(p.numel() for p in model_train.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model_train.parameters())

if FREEZE_PHOBERT:
    print("Mode: Freeze PhoBERT (train FC layer only)")
    print(f"Trainable: {trainable_params:,} / {total_params:,} parameters")
else:
    print("Mode: Full fine-tuning")
    print(f"Trainable: {trainable_params:,} parameters")

# Initialize trainer on the device chosen in the configuration cell, so the
# device is set in one place instead of being repeated as a literal here.
trainer = PhoBERTTrainer(
    model=model_train,
    device=device
)


=== Initialize Model & Trainer ===
>> Loading PhoBERT model from local: models/phobert-base
Mode: Full fine-tuning
Trainable: 135,008,269 parameters

Using device: cpu


In [None]:
print("\n=== Setup Optimizer & Learning Rate ===")

# Optimizer
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Split the model parameters into the two groups in a single pass
# (iteration order of named_parameters() is kept inside each group).
decay_params, no_decay_params = [], []
for param_name, param in model_train.named_parameters():
    is_exempt = any(fragment in param_name for fragment in no_decay)
    target_group = no_decay_params if is_exempt else decay_params
    target_group.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': WEIGHT_DECAY},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=1e-8)

# Learning rate scheduler: cosine schedule whose warmup is 10% of all steps.
batches_over_all_epochs = NUM_EPOCHS * len(train_loader)
total_steps = batches_over_all_epochs // ACCUMULATION_STEPS
warmup_steps = int(0.1 * total_steps)

scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Summary of the optimisation setup.
for summary_line in (
    f"Learning rate: {LEARNING_RATE}",
    f"Weight decay: {WEIGHT_DECAY}",
    f"Number of epochs: {NUM_EPOCHS}",
    f"Total training steps: {total_steps:,}",
    f"Warmup steps: {warmup_steps:,}",
    f"Gradient accumulation: {ACCUMULATION_STEPS}",
    "Scheduler: Cosine Annealing",
):
    print(summary_line)


=== Setup Optimizer & Learning Rate ===
Learning rate: 2e-05
Weight decay: 0.05
Number of epochs: 3
Total training steps: 19,351
Warmup steps: 1,935
Gradient accumulation: 2
Estimated training time: 45-60 minutes


## 🚀 Advanced Improvements Applied

Để cải thiện hiệu suất mô hình và giảm thiểu overfitting tốt hơn nữa, tôi đã áp dụng các kỹ thuật nâng cao sau:

1.  **Label Smoothing (0.1):**
    *   Thay vì ép mô hình dự đoán xác suất 1.0 cho đúng class (hard target), chúng ta sử dụng soft target (ví dụ: 0.9 cho đúng class, chia đều 0.1 cho các class còn lại).
    *   **Tác dụng:** Giúp mô hình bớt "tự tin thái quá", học được các đặc trưng tổng quát hơn và giảm overfitting.

2.  **Correct Weight Decay Strategy:**
    *   Chỉ áp dụng Weight Decay cho các trọng số (weights) của Linear layers và Embeddings.
    *   **KHÔNG** áp dụng cho Bias và LayerNorm weights.
    *   **Tác dụng:** Đây là chuẩn mực khi fine-tune BERT, giúp training ổn định hơn.

3.  **Cosine Learning Rate Scheduler:**
    *   Thay vì giảm tuyến tính (Linear), LR sẽ giảm theo hình cosin.
    *   **Tác dụng:** Giữ LR cao lâu hơn ở giai đoạn đầu để học nhanh, và giảm rất chậm/mượt về cuối để hội tụ chính xác vào điểm cực trị.

4.  **Hyperparameters Adjustment:**
    *   **Epochs:** Tăng nhẹ lên 4 để Cosine Scheduler có đủ chu kỳ hoạt động hiệu quả.
    *   **Weight Decay:** Điều chỉnh về 0.01 (chuẩn cho AdamW khi đã group parameters đúng cách).

In [None]:
import time

print("\n" + "="*80)
print("START TRAINING ON CUDA")
print("="*80)

# ========================================
# TRAINING LOOP WITH EARLY STOPPING
# ========================================
# Each epoch: train, evaluate on the validation loader, record metrics in
# `history`, and checkpoint whenever the validation loss improves.
# Training stops once the validation loss has failed to improve for
# `patience` consecutive epochs.

best_val_loss = float('inf')
best_val_acc = 0.0  # validation accuracy of the epoch with the best val loss
patience = 2
patience_counter = 0
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

total_training_start = time.time()

for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()
    print(f"\n{'='*80}")
    print(f"EPOCH {epoch + 1}/{NUM_EPOCHS}")
    print("="*80)

    # TRAINING
    train_loss, train_acc = trainer.train_epoch(
        train_loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        accumulation_steps=ACCUMULATION_STEPS,
        use_mixed_precision=True,  # Always True for CUDA
        max_grad_norm=1.0,
        show_progress=True
    )

    # VALIDATION
    print("\nEvaluating on validation set...")
    val_loss, val_acc = trainer.evaluate(
        val_loader=val_loader,
        show_progress=True,
        use_mixed_precision=True
    )

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # Print results
    epoch_time = time.time() - epoch_start_time
    print(f"\nEPOCH {epoch + 1} RESULTS:")
    print(f"{'-'*80}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} ({train_acc*100:.2f}%)")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f} ({val_acc*100:.2f}%)")
    print(f"  Gap:        Loss delta={abs(train_loss-val_loss):.4f} | Acc delta={abs(train_acc-val_acc)*100:.2f}%")
    print(f"  Time:       {epoch_time/60:.2f} minutes ({epoch_time:.0f} seconds)")
    print(f"{'-'*80}")

    # EARLY STOPPING (model selection is driven by validation loss)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_acc = val_acc
        patience_counter = 0

        # Save best model.
        # NOTE: the checkpoint embeds the label_encoder object, so it has to
        # be loaded with torch.load(..., weights_only=False).
        checkpoint = {
            'epoch': epoch,  # 0-based epoch index
            'model_state_dict': model_train.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_loss': val_loss,
            'val_acc': val_acc,
            'label_encoder': train_dataset.label_encoder,
            'history': history,
            'config': {
                'num_classes': num_classes,
                'batch_size': BATCH_SIZE,
                'learning_rate': LEARNING_RATE,
                'num_epochs': NUM_EPOCHS,
                # Record the values actually used to build the model/trainer
                # rather than hard-coded literals that can drift from them.
                'dropout_rate': DROPOUT_RATE,
                'weight_decay': WEIGHT_DECAY,
                'device': device
            }
        }
        torch.save(checkpoint, 'best_phobert_model.pth')
        print(f"  >> Saved best model (Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f})")
    else:
        patience_counter += 1
        # Negative when the validation loss got worse; printed with the sign
        # flipped as a percentage increase over the best loss.
        improvement = ((best_val_loss - val_loss) / best_val_loss) * 100
        print(f"  >> Val loss did NOT improve (increased {-improvement:.2f}%) - Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"\n{'='*80}")
            print("EARLY STOPPING ACTIVATED")
            print(f"Val loss did not improve after {patience} epochs")
            # The best epoch lies `patience_counter` epochs back; +1 makes it 1-based.
            print(f"Best Val Loss: {best_val_loss:.4f} (Epoch {epoch - patience_counter + 1})")
            print("Stopping training to prevent overfitting")
            print(f"{'='*80}\n")
            break

    # Clear cache
    torch.cuda.empty_cache()

total_training_time = time.time() - total_training_start
epochs_completed = len(history['train_loss'])

print("\n" + "="*80)
print("TRAINING COMPLETED")
print("="*80)
print("Device: CUDA")
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")
print(f"Total training time: {total_training_time/60:.1f} minutes ({total_training_time:.0f} seconds)")
# max(..., 1) avoids a ZeroDivisionError when no epoch completed (NUM_EPOCHS == 0).
print(f"Average time per epoch: {total_training_time/max(epochs_completed, 1)/60:.1f} minutes")
print(f"Total epochs completed: {epochs_completed}/{NUM_EPOCHS}")
print("Model saved at: best_phobert_model.pth")
print("="*80)


START TRAINING ON CPU

EPOCH 1/3


Training:   0%|         | 16/12901 [01:47<24:04:08,  6.72s/it, loss=2.8804, acc=0.0664, lr=8.27e-08]


KeyboardInterrupt: 

## Overfitting Prevention Measures

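**4 Implemented Actions:**

> **Note:** items 2–4 describe an earlier configuration. The current configuration cell uses `DROPOUT_RATE = 0.4`, `WEIGHT_DECAY = 0.01` and `NUM_EPOCHS = 4`.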

1. **Early Stopping (patience=2)**
   - Stop training when val_loss doesn't improve after 2 epochs
   - Save model based on val_loss (not val_acc)
   - Display detailed information when stopping

2. **Increased Dropout: 0.3 → 0.5**
   - Stronger regularization for the FC layer
   - Reduces overfitting by randomly dropping neurons

3. **Increased Weight Decay: 0.01 → 0.05**
   - Stronger L2 regularization
   - Penalizes large weights to avoid an overly complex model

4. **Reduced NUM_EPOCHS: 5 → 3**
   - Reduces training time
   - Combined with early stopping to stop at the right time

**Expected Results:**
- Val loss decreases or stabilizes (does NOT increase as it did before)
- Gap between train/val < 5%
- Better generalization on test set

In [None]:
# Plot the per-epoch loss/accuracy curves stored in `history` and print a
# short summary. Handles the case where no epoch completed, e.g. when the
# training cell was interrupted during its first epoch.
epochs_run = len(history['val_acc'])

if epochs_run == 0:
    # Without this guard max() below would raise ValueError on an empty list.
    print("\nNo completed epochs in history - nothing to plot.")
else:
    # 1-based epoch numbers so the x-axis matches the "EPOCH n" training logs.
    epoch_axis = list(range(1, epochs_run + 1))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Plot Loss
    ax1.plot(epoch_axis, history['train_loss'], label='Train Loss', marker='o')
    ax1.plot(epoch_axis, history['val_loss'], label='Val Loss', marker='s')
    ax1.set_xticks(epoch_axis)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title('Training & Validation Loss')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot Accuracy (stored as fractions, shown as percentages)
    ax2.plot(epoch_axis, [acc*100 for acc in history['train_acc']], label='Train Acc', marker='o')
    ax2.plot(epoch_axis, [acc*100 for acc in history['val_acc']], label='Val Acc', marker='s')
    ax2.set_xticks(epoch_axis)
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title('Training & Validation Accuracy')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('training_history.png', dpi=300, bbox_inches='tight')
    plt.show()

    # NOTE: "best" here means highest validation accuracy. The checkpoint is
    # selected by lowest validation loss, so the two epochs can differ.
    best_acc = max(history['val_acc'])
    print("\nTraining Summary:")
    print(f"  Best epoch: {history['val_acc'].index(best_acc) + 1}")
    print(f"  Best val accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print("\n" + "="*80)
print("DETAILED EVALUATION")
print("="*80)

# Load best model
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects. It is
# needed because the checkpoint stores the label_encoder object; only load
# checkpoint files you created yourself.
print("Loading best model...")
checkpoint = torch.load('best_phobert_model.pth', weights_only=False)
model_train.load_state_dict(checkpoint['model_state_dict'])
print(f"Loaded model from epoch {checkpoint['epoch']+1} with val_acc={checkpoint['val_acc']:.4f}")

# Predict on validation set
print("\nPredicting on validation set...")
y_pred, y_true = trainer.predict_from_loader(val_loader)

# classification_report() measures len() of every target name, so the
# (possibly integer) encoder classes must be turned into strings first.
class_names = [str(c) for c in train_dataset.label_encoder.classes_]
# Pin the label set so the report rows and the confusion-matrix shape always
# cover every class, even one absent from the validation data.
# Assumes y_true / y_pred hold encoded class indices 0..n_classes-1
# - TODO confirm against PhoBERTTrainer.predict_from_loader.
class_indices = np.arange(len(class_names))

# Classification Report
print("\nClassification Report:")
print("-" * 80)
print(classification_report(
    y_true,
    y_pred,
    labels=class_indices,
    target_names=class_names,
    digits=4
))

# Confusion Matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print("-" * 80)
cm = confusion_matrix(y_true, y_pred, labels=class_indices)

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
print("\n" + "=" * 80)
print("TEST PREDICTION")
print("=" * 80)

# Restore the best checkpoint written by the training loop into the model.
print("Loading best model...")
checkpoint = torch.load('best_phobert_model.pth', weights_only=False)
best_state = checkpoint['model_state_dict']
model_train.load_state_dict(best_state)
print(f"Loaded model from epoch {checkpoint['epoch']+1} with val_acc={checkpoint['val_acc']:.4f}")

# Sample input texts used for a quick manual check.
test_texts = [
    "√¥_nhi·ªÖm ti·∫øng ·ªìn √¢m_th·∫ßm ti√™u_di·ªát sinh_v·∫≠t bi·ªÉn",
    "to√†n_c·∫ßu nu√¥i 8 t·ª∑ mi·ªáng_ƒÉn trong ƒë·∫°i_d·ªãch nh∆∞_th·∫ø_n√†o",
    "th√°ch_th·ª©c lao_ƒë·ªông vi·ªát ƒë√≥n s√≥ng chuy·ªÉn_d·ªãch nh·∫≠t_b·∫£n",
    "vi·ªát_trinh l√†m vedette",
    "m·ªπ b·∫≠t_ƒë√®n_xanh b√°n m√°y_bay chi·∫øn_ƒë·∫•u hi·ªán_ƒë·∫°i f 35 singapore",
    "ch·∫∑t ch√©m ti·ªÅn g·ª≠i xe m√πa ph√°o_hoa g·ªçi ai",
    "15 ti√™u_ch√≠ ƒë√°nh_gi√° an_to√†n ph√≤ng covid 19 ƒë·ªëi_v·ªõi tr∆∞·ªùng_h·ªçc"
]

print("\nTest predictions:")
print("-" * 80)

predictions = trainer.predict(
    texts=test_texts,
    preprocessor=preprocessor_train,
    batch_size=8,
)

# Map each prediction back to its label with the encoder stored in the
# checkpoint, and print it next to the input text.
label_encoder = checkpoint['label_encoder']
for position, (sample_text, predicted_index) in enumerate(zip(test_texts, predictions), start=1):
    decoded = label_encoder.inverse_transform([predicted_index])
    predicted_label = decoded[0]
    print(f"{position}. Text: {sample_text}")
    print(f"   Predicted: {predicted_label}")
    print()

print("=" * 80)
print("COMPLETED")
print("=" * 80)

# Optional: instructions for downloading the checkpoint (printed only).
for hint_line in (
    "\nDownload model:",
    "  Uncomment the lines below to download:",
    "  # from google.colab import files",
    "  # files.download('best_phobert_model.pth')",
):
    print(hint_line)