# IMDB Movie Review Sentiment Analysis - Model Training and Evaluation

This notebook demonstrates how to train and evaluate a BERT-based sentiment analysis model on the IMDB Movie Review dataset.

## Setup

First, let's import the necessary libraries and set up the environment.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import yaml
import json
from pathlib import Path
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Make the repository root (the parent of the current working directory)
# importable so that the `src.*` project modules can be resolved.
_parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(_parent_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.data.processor import get_data_processor
from src.models.bert_classifier import get_model, save_model, load_model
from src.utils.config import load_config
from src.utils.metrics import compute_metrics, log_metrics, get_classification_report
from src.utils.visualization import (
    set_plotting_style,
    save_figure,
    plot_training_history,
    plot_confusion_matrix,
    plot_roc_curve,
)

# Apply the plotting style provided by src.utils.visualization
set_plotting_style()

# Output locations: everything lives under `<project_root>/models`, with
# evaluation results and figures in dedicated subdirectories of it
models_dir = os.path.join(project_root, 'models')
results_dir = os.path.join(models_dir, 'results')
visualizations_dir = os.path.join(models_dir, 'visualizations')

# Create whichever directories are missing (existing ones are left alone)
for _output_dir in (models_dir, results_dir, visualizations_dir):
    os.makedirs(_output_dir, exist_ok=True)

## Configuration

Let's load the configuration files for the project.

In [None]:
# Both configuration files live in `<project_root>/configs`
_configs_dir = os.path.join(project_root, 'configs')
train_config_path = os.path.join(_configs_dir, 'train.yaml')
data_config_path = os.path.join(_configs_dir, 'data_config.yaml')

# Parse them with the project's config loader
train_config = load_config(train_config_path)
data_config = load_config(data_config_path)

# Echo each configuration as block-style YAML for a quick visual check
for _heading, _config in (
    ("Training Configuration:", train_config),
    ("\nData Configuration:", data_config),
):
    print(_heading)
    print(yaml.dump(_config, default_flow_style=False))

## Data Preparation

Let's prepare the IMDB dataset for training and evaluation.

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

In [None]:
# The model name configured for training also selects the tokenizer
_model_section = train_config['model']
model_name = _model_section['name']
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Initialized tokenizer: {model_name}")

In [None]:
# Initialize data processor
# Build the project's data processor from the data configuration, the
# tokenizer and the model name. The following cells use it to load the
# dataset splits (`prepare_dataset`) and to build the data loaders
# (`create_dataloaders`).
data_processor = get_data_processor(data_config, tokenizer, model_name)
print("Initialized data processor")

In [None]:
# Load and prepare dataset
print("Loading dataset...")
try:
    # Preferred path: let the data processor load the already-processed splits
    train_dataset, val_dataset, test_dataset = data_processor.prepare_dataset(use_processed=True)
    print("Loaded processed dataset")
except Exception as e:
    # Fallback path: any failure above triggers a fresh download and an
    # in-notebook tokenization of the raw IMDB data
    print(f"Error loading processed dataset: {e}")
    print("Loading dataset from Hugging Face...")

    imdb_dataset = load_dataset('imdb')

    # Work with pandas DataFrames so the split helper below can be used
    train_df = pd.DataFrame(imdb_dataset['train'])
    test_df = pd.DataFrame(imdb_dataset['test'])

    # Carve a validation split out of the training data
    from sklearn.model_selection import train_test_split

    _split_config = data_config['splitting']
    train_df, val_df = train_test_split(
        train_df,
        test_size=_split_config['val_size'],
        random_state=_split_config['random_state'],
        stratify=train_df['label'] if _split_config['stratify'] else None,
    )

    def tokenize_dataset(df):
        """Tokenize the `text` column of `df` and pair it with its `label` column.

        Padding, truncation and maximum length come from the
        `preprocessing` section of the data configuration. Returns a
        `TensorDataset` of (input_ids, attention_mask, labels).
        """
        review_texts = df['text'].tolist()
        review_labels = df['label'].tolist()

        preprocessing = data_config['preprocessing']
        encoded = tokenizer(
            review_texts,
            padding=preprocessing['padding'],
            truncation=preprocessing['truncation'],
            max_length=preprocessing['max_seq_length'],
            return_tensors='pt',
        )

        return torch.utils.data.TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(review_labels),
        )

    train_dataset = tokenize_dataset(train_df)
    val_dataset = tokenize_dataset(val_df)
    test_dataset = tokenize_dataset(test_df)

    print("Dataset loaded and tokenized")

In [None]:
# Wrap the three dataset splits in data loaders via the data processor
train_dataloader, val_dataloader, test_dataloader = data_processor.create_dataloaders(
    train_dataset, val_dataset, test_dataset
)

# Report how many batches each split yields per pass
for _split_name, _loader in (
    ("Train", train_dataloader),
    ("Validation", val_dataloader),
    ("Test", test_dataloader),
):
    print(f"{_split_name} batches: {len(_loader)}")

## Model Initialization

Let's initialize the BERT-based sentiment classifier model.

In [None]:
# Collect the classifier hyperparameters from the `model` section of the
# training configuration and hand them to the project's model factory
_model_section = train_config['model']
_model_kwargs = {
    option: _model_section[option]
    for option in ('num_labels', 'dropout_rate', 'gradient_checkpointing')
}
model = get_model(model_name=model_name, device=device, **_model_kwargs)

print(f"Initialized model: {model_name}")

In [None]:
# Define optimizer and scheduler
from transformers import get_linear_schedule_with_warmup

# Get optimizer parameters.
# Values are coerced explicitly because PyYAML (YAML 1.1 rules) loads
# exponent literals written without a decimal point, such as `2e-5` or
# `1e-8`, as *strings*; passing those to AdamW raises a TypeError.
optimizer_config = train_config['optimizer']
learning_rate = float(optimizer_config.get('learning_rate', 2e-5))
weight_decay = float(optimizer_config.get('weight_decay', 0.01))
adam_epsilon = float(optimizer_config.get('adam_epsilon', 1e-8))
# YAML sequences arrive as lists; normalise to a tuple of floats
adam_betas = tuple(
    float(_beta) for _beta in optimizer_config.get('adam_betas', (0.9, 0.999))
)

# Prepare optimizer: parameters whose name contains one of the `no_decay`
# markers are excluded from weight decay, all others receive it.
no_decay = ['bias', 'LayerNorm.weight']
_decay_params = []
_no_decay_params = []
for _param_name, _param in model.named_parameters():
    if any(nd in _param_name for nd in no_decay):
        _no_decay_params.append(_param)
    else:
        _decay_params.append(_param)

optimizer_grouped_parameters = [
    {'params': _decay_params, 'weight_decay': weight_decay},
    {'params': _no_decay_params, 'weight_decay': 0.0},
]

# Create optimizer
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    eps=adam_epsilon,
    betas=adam_betas
)

# Calculate total number of training steps (one scheduler step per batch)
num_epochs = train_config['training']['num_epochs']
total_steps = len(train_dataloader) * num_epochs

# Calculate number of warmup steps as a fraction of all training steps
warmup_ratio = float(optimizer_config.get('warmup_ratio', 0.1))
warmup_steps = int(total_steps * warmup_ratio)

# Create scheduler: linear warmup followed by linear decay
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print(f"Optimizer: AdamW, Learning rate: {learning_rate}, Weight decay: {weight_decay}")
print(f"Scheduler: Linear, Warmup steps: {warmup_steps}/{total_steps}")

## Training Loop

Let's train the model on the IMDB dataset.

In [None]:
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device, fp16=False, max_grad_norm=1.0):
    """Train `model` for one full pass over `dataloader`.

    Args:
        model: Callable accepting `input_ids`, `attention_mask`,
            `token_type_ids` and `labels` keyword arguments; its output is
            indexed with `'loss'` and `'logits'`.
        dataloader: Yields either dict batches (every value is moved to
            `device`) or `(input_ids, attention_mask, labels)` tuples.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, advanced once per optimizer
            step that was actually applied.
        device: Device the batch tensors are moved to.
        fp16: When true, the forward pass runs under
            `torch.cuda.amp.autocast` and the loss is scaled with a
            `GradScaler`.
        max_grad_norm: Gradient-norm clipping threshold. Defaults to 1.0,
            the value that used to be hard-coded.

    Returns:
        dict with `'loss'` (mean of the per-batch losses) and `'accuracy'`
        (fraction of correctly classified samples over the whole epoch).

    Raises:
        ZeroDivisionError: If `dataloader` yields no batches.
    """
    # Local import keeps this cell self-contained
    from contextlib import nullcontext

    model.train()

    # Running totals for the epoch
    epoch_loss = 0.0
    epoch_steps = 0
    correct_predictions = 0
    seen_predictions = 0

    # Loss scaler for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if fp16 else None

    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        # Move batch to device, normalising tuple batches to the dict layout
        if isinstance(batch, dict):
            batch = {k: v.to(device) for k, v in batch.items()}
        else:
            input_ids, attention_mask, labels = batch
            batch = {
                'input_ids': input_ids.to(device),
                'attention_mask': attention_mask.to(device),
                'labels': labels.to(device)
            }

        # Single forward pass; autocast is only entered when fp16 is enabled
        forward_context = torch.cuda.amp.autocast() if fp16 else nullcontext()
        with forward_context:
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                token_type_ids=batch.get('token_type_ids'),
                labels=batch['labels']
            )
            loss = outputs['loss']

        # Backward pass, gradient clipping and optimizer step
        if fp16:
            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping by norm
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # GradScaler silently skips optimizer.step() when it finds
            # inf/NaN gradients and then lowers the scale; a lower scale
            # after update() therefore means "no step was taken".
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer_stepped = scaler.get_scale() >= scale_before
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer_stepped = True

        # Only advance the schedule for steps that really happened, so the
        # warmup/decay stays aligned with the number of parameter updates
        if optimizer_stepped:
            scheduler.step()

        # Zero gradients
        optimizer.zero_grad()

        # Update metrics
        epoch_loss += loss.item()
        epoch_steps += 1

        # Count correct samples rather than averaging per-batch accuracies,
        # so a smaller final batch does not get disproportionate weight
        predictions = torch.argmax(outputs['logits'], dim=1)
        matches = predictions == batch['labels']
        correct_predictions += matches.sum().item()
        seen_predictions += matches.numel()

        # Update progress bar with the running averages
        progress_bar.set_postfix({
            'loss': epoch_loss / epoch_steps,
            'accuracy': correct_predictions / seen_predictions
        })

    # Compute epoch metrics
    epoch_metrics = {
        'loss': epoch_loss / epoch_steps,
        'accuracy': correct_predictions / seen_predictions
    }

    return epoch_metrics

In [None]:
# Evaluation function
def evaluate(model, dataloader, device, fp16=False):
    """Run `model` over `dataloader` without gradient tracking.

    Batches may be dicts (every value is moved to `device`) or
    `(input_ids, attention_mask, labels)` tuples. With `fp16` enabled the
    forward pass runs under `torch.cuda.amp.autocast`.

    Returns a 4-tuple `(metrics, predictions, labels, logits)`: `metrics`
    is the result of `compute_metrics(predictions, labels)` extended with a
    `'loss'` entry (mean of the per-batch losses); the other three are
    NumPy arrays concatenated over all batches.
    """
    model.eval()

    def run_model(inputs):
        # One place for the model call shared by both precision modes
        return model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs.get('token_type_ids'),
            labels=inputs['labels']
        )

    total_loss = 0.0
    num_batches = 0

    # Per-batch arrays, joined once after the loop
    prediction_chunks = []
    label_chunks = []
    logit_chunks = []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating")

        for raw_batch in progress_bar:
            # Normalise the batch to a dict of tensors on the target device
            if not isinstance(raw_batch, dict):
                ids, mask, targets = raw_batch
                batch = {
                    'input_ids': ids.to(device),
                    'attention_mask': mask.to(device),
                    'labels': targets.to(device)
                }
            else:
                batch = {key: tensor.to(device) for key, tensor in raw_batch.items()}

            # Forward pass, under autocast only when fp16 is requested
            if fp16:
                with torch.cuda.amp.autocast():
                    outputs = run_model(batch)
            else:
                outputs = run_model(batch)

            total_loss += outputs['loss'].item()

            # Predicted class = index of the largest logit
            batch_logits = outputs['logits']
            batch_predictions = torch.argmax(batch_logits, dim=1)

            prediction_chunks.append(batch_predictions.cpu().numpy())
            label_chunks.append(batch['labels'].cpu().numpy())
            logit_chunks.append(batch_logits.cpu().numpy())

            num_batches += 1
            progress_bar.set_postfix({'loss': total_loss / num_batches})

    # Join the per-batch arrays into one array per quantity
    all_predictions = np.concatenate(prediction_chunks)
    all_labels = np.concatenate(label_chunks)
    all_logits = np.concatenate(logit_chunks)

    # Project metrics plus the average loss
    metrics = compute_metrics(all_predictions, all_labels)
    metrics['loss'] = total_loss / num_batches

    return metrics, all_predictions, all_labels, all_logits

In [None]:
# Training loop
print("Starting training...")

# The validation metric that drives checkpointing and early stopping, and
# whether larger ("max") or smaller ("min") values count as better
monitor_name = train_config['logging']['monitor']
monitor_mode = train_config['logging']['mode']
if monitor_mode not in ("max", "min"):
    # Fail before training: with any other value no epoch could ever be
    # recognised as an improvement and no checkpoint would be written
    raise ValueError(
        f"Unsupported logging mode {monitor_mode!r}; expected 'max' or 'min'"
    )

# Initialize variables for early stopping.
# The starting "best" value must be the worst possible one for the chosen
# mode; starting at -inf while minimising (e.g. monitoring the loss) would
# mean no epoch ever improves and the best model is never saved.
best_val_metric = float("-inf") if monitor_mode == "max" else float("inf")
best_epoch = 0
patience = train_config['training']['early_stopping_patience']
patience_counter = 0
use_fp16 = train_config['training']['fp16']

# Initialize training history (one entry per completed epoch)
history = {
    'loss': [],
    'accuracy': [],
    'val_loss': [],
    'val_accuracy': [],
    'val_f1': []
}

# Training loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Train for one epoch
    train_metrics = train_epoch(
        model,
        train_dataloader,
        optimizer,
        scheduler,
        device,
        fp16=use_fp16
    )

    # Log training metrics
    print("Training metrics:")
    for name, value in train_metrics.items():
        print(f"  {name}: {value:.4f}")

    # Update history
    history['loss'].append(train_metrics['loss'])
    history['accuracy'].append(train_metrics['accuracy'])

    # Evaluate on validation set
    print("\nEvaluating on validation set...")
    val_metrics, val_predictions, val_labels, val_logits = evaluate(
        model,
        val_dataloader,
        device,
        fp16=use_fp16
    )

    # Log validation metrics
    print("Validation metrics:")
    for name, value in val_metrics.items():
        print(f"  {name}: {value:.4f}")

    # Update history
    history['val_loss'].append(val_metrics['loss'])
    history['val_accuracy'].append(val_metrics['accuracy'])
    history['val_f1'].append(val_metrics['f1'])

    # Check if this is the best model so far
    monitor_metric = val_metrics[monitor_name]
    if monitor_mode == "max":
        improved = monitor_metric > best_val_metric
    else:
        improved = monitor_metric < best_val_metric

    if improved:
        print(
            f"New best model! {monitor_name}: "
            f"{monitor_metric:.4f} (previous: {best_val_metric:.4f})"
        )

        # Update best metric and epoch
        best_val_metric = monitor_metric
        best_epoch = epoch

        # Reset patience counter
        patience_counter = 0

        # Save model
        print(f"Saving model to {models_dir}")
        save_model(model, tokenizer, models_dir, "best_model")

        # Save training history as of the best epoch
        with open(os.path.join(models_dir, "history.json"), "w") as f:
            json.dump(history, f)
    else:
        # Increment patience counter
        patience_counter += 1

        print(
            f"No improvement over best model. "
            f"Patience: {patience_counter}/{patience}"
        )

        # Stop once `patience` consecutive epochs brought no improvement
        if patience_counter >= patience:
            print(
                f"Early stopping triggered after {epoch + 1} epochs. "
                f"Best {monitor_name}: {best_val_metric:.4f} "
                f"at epoch {best_epoch + 1}"
            )
            break

print("\nTraining complete!")

## Visualize Training History

Let's visualize the training history to see how the model performed during training.

In [None]:
# Save the training-history figure through the project helper
plot_training_history(history, visualizations_dir, filename="training_history")

# Draw the same curves inline: one panel for loss, one for accuracy
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

_panels = (
    (axes[0], 'loss', 'val_loss', 'Loss'),
    (axes[1], 'accuracy', 'val_accuracy', 'Accuracy'),
)
for _axis, _train_key, _val_key, _quantity in _panels:
    # Training curve first, validation curve second, as in the legend
    _axis.plot(history[_train_key], label=f'Training {_quantity}')
    _axis.plot(history[_val_key], label=f'Validation {_quantity}')
    _axis.set_xlabel('Epoch')
    _axis.set_ylabel(_quantity)
    _axis.set_title(f'Training and Validation {_quantity}')
    _axis.legend()

plt.tight_layout()
plt.show()

## Evaluate on Test Set

Let's evaluate the best model on the test set.

In [None]:
# Load best model
# Restore the checkpoint that the training loop stored with
# `save_model(model, tokenizer, models_dir, "best_model")`.
# NOTE(review): assumes save_model writes to `<models_dir>/best_model` — confirm.
# Rebinding `model` and `tokenizer` replaces the in-memory objects left over
# from the last training epoch with the loaded ones.
best_model_path = os.path.join(models_dir, "best_model")
model, tokenizer = load_model(best_model_path, device)

print(f"Loaded best model from {best_model_path}")

In [None]:
# Score the loaded model on the held-out test split
print("Evaluating on test set...")
test_metrics, test_predictions, test_labels, test_logits = evaluate(
    model, test_dataloader, device, fp16=train_config['training']['fp16']
)

# Print every test metric with four decimals
print("Test metrics:")
for metric_name, metric_value in test_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Persist the metrics as indented JSON in the results directory
test_metrics_path = os.path.join(results_dir, "test_metrics.json")
with open(test_metrics_path, "w") as metrics_file:
    metrics_file.write(json.dumps(test_metrics, indent=4))

# Per-class report; index 0 is labelled 'Negative', index 1 'Positive'
class_names = ['Negative', 'Positive']
report = get_classification_report(test_predictions, test_labels, class_names)
print(f"\nClassification report:\n{report}")

# Persist the report as plain text next to the metrics
report_path = os.path.join(results_dir, "test_classification_report.txt")
with open(report_path, "w") as report_file:
    report_file.write(report)

## Visualize Test Set Results

Let's visualize the results on the test set.

In [None]:
# Save the confusion-matrix figure through the project helper
plot_confusion_matrix(
    test_labels,
    test_predictions,
    class_names,
    visualizations_dir,
    filename="test_confusion_matrix",
)

# Draw the same matrix inline as an annotated heatmap
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_labels, test_predictions)
_heatmap_options = dict(
    annot=True,  # write the count into every cell
    fmt='d',  # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
sns.heatmap(cm, **_heatmap_options)
cm_ax.set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix - Test Set',
)
plt.show()

In [None]:
# Plot ROC curve (only defined here for the two-class case)
if len(class_names) == 2 and test_logits.shape[1] == 2:
    # With fp16 evaluation the logits are float16. Cast to float32 before
    # the softmax: half-precision softmax on CPU is not implemented in older
    # PyTorch releases, and float16 probabilities would coarsen the ROC
    # thresholds. For float32 logits the result is unchanged.
    logits_fp32 = torch.tensor(test_logits, dtype=torch.float32)
    # Probability assigned to class index 1 ('Positive')
    positive_probs = torch.softmax(logits_fp32, dim=1)[:, 1].numpy()

    # Save the ROC figure through the project helper
    plot_roc_curve(
        test_labels,
        positive_probs,
        visualizations_dir,
        filename="test_roc_curve"
    )

    # Display the plot inline
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, _ = roc_curve(test_labels, positive_probs)
    roc_auc = auc(fpr, tpr)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # Diagonal reference line: the expected curve of a random classifier
    roc_ax.plot([0, 1], [0, 1], 'k--', lw=2)
    roc_ax.set_xlim([0.0, 1.0])
    # Slight headroom above 1.0 so the curve's top edge stays visible
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve - Test Set')
    roc_ax.legend(loc="lower right")
    plt.show()

## Conclusion

In this notebook, we've trained and evaluated a BERT-based sentiment analysis model on the IMDB dataset. The model's performance can be further improved by hyperparameter tuning, using a larger BERT model, or exploring other techniques like data augmentation.

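The trained model is saved under `models/` (checkpoint name `best_model`, together with `history.json`), the test metrics and classification report under `models/results/`, and the figures under `models/visualizations/`.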