In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from typing import Any, Dict
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import optuna
from optuna.integration import PyTorchLightningPruningCallback

# Set random seeds for reproducibility
pl.seed_everything(42, workers=True)



Seed set to 42


42

In [10]:
class BoolQDataset(Dataset):
    def __init__(self, data: Dict[str, Any], tokenizer: AutoTokenizer, max_length: int = 512):
        """
        Args:
            data: Dictionary containing the text and labels.
            tokenizer: Pretrained tokenizer for text processing.
            max_length: Maximum token sequence length.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.data["question"])

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        # Get question and passage
        question = self.data["question"][idx]
        passage = self.data["passage"][idx]
        label = self.data["answer"][idx]

        # Tokenize
        encoded = self.tokenizer(
            question,
            passage,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Correctness tests for tokenization
        assert encoded["input_ids"].shape[-1] <= self.max_length, "Token length exceeds max_length!"
        assert encoded["input_ids"].shape == encoded["attention_mask"].shape, "Mismatch in token shapes!"

        return {
            "input_ids": encoded["input_ids"].squeeze(0),  # Remove batch dimension
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float),  # Float for binary classification
        }

In [11]:
class BoolQDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer_name: str, batch_size: int = 16, max_length: int = 512):
        """
        Args:
            tokenizer_name: Name of the pretrained tokenizer.
            batch_size: Number of samples per batch.
            max_length: Maximum sequence length.
        """
        super().__init__()
        self.tokenizer_name = tokenizer_name
        self.batch_size = batch_size
        self.max_length = max_length

    def prepare_data(self) -> None:
        # Load dataset
        # Loading the dataset based on the given splits
        self.train_data = load_dataset("google/boolq", split="train[:-1000]")
        self.validation_data = load_dataset("google/boolq", split="train[-1000:]")
        self.test_data = load_dataset("google/boolq", split="validation")

    def setup(self, stage: str = None) -> None:
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # Create datasets
        self.train_dataset = BoolQDataset(self.train_data, self.tokenizer, self.max_length)
        self.val_dataset = BoolQDataset(self.validation_data, self.tokenizer, self.max_length)
        self.test_dataset = BoolQDataset(self.test_data, self.tokenizer, self.max_length)

        # Correctness tests for dataset splits
        assert len(self.train_dataset) > 0, "Train dataset is empty!"
        assert len(self.val_dataset) > 0, "Validation dataset is empty!"
        assert len(self.test_dataset) > 0, "Test dataset is empty!"

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Initialize DataModule
data_module = BoolQDataModule(tokenizer_name="bert-large-cased", batch_size=16)

# Prepare and test data loading
data_module.prepare_data()
data_module.setup()

# Correctness test for DataLoader
for batch in data_module.train_dataloader():
    assert batch["input_ids"].shape[0] == 16, "Batch size mismatch!"
    print(f"Batch loaded successfully with shape: {batch['input_ids'].shape}")
    break



Batch loaded successfully with shape: torch.Size([16, 512])


In [13]:
class BoolQClassifier(pl.LightningModule):
    def __init__(self, model_name: str, learning_rate: float = 1e-5, hidden_dim: int = 256, dropout_rate: float = 0.3):
        super(BoolQClassifier, self).__init__()
        self.save_hyperparameters()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
        self.loss_fn = nn.BCELoss()

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        logits = self.classifier(cls_output)
        return logits.squeeze(-1)

    def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        logits = self(batch['input_ids'], batch['attention_mask'])
        loss = self.loss_fn(logits, batch['label'])
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> Dict[str, Any]:
        logits = self(batch['input_ids'], batch['attention_mask'])
        loss = self.loss_fn(logits, batch['label'])
        preds = (logits > 0.5).float()
        acc = (preds == batch['label']).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': acc}

    def configure_optimizers(self) -> torch.optim.Optimizer:
        """
        Configure optimizers with different learning rates for the Transformer and classifier layers.
        """
        # Separate parameter groups
        transformer_params = list(self.bert.parameters())
        classifier_params = list(self.classifier.parameters())
    
        # Define learning rates
        transformer_lr = self.hparams.learning_rate  # Base learning rate
        classifier_lr = self.hparams.learning_rate * 10  # Higher learning rate for classifier
    
        # Create parameter groups
        optimizer = torch.optim.AdamW([
            {'params': transformer_params, 'lr': transformer_lr},
            {'params': classifier_params, 'lr': classifier_lr}
        ])
    
        return optimizer




In [16]:
# Initialize DataModule
data_module = BoolQDataModule(tokenizer_name="bert-large-cased", batch_size=32)

# Initialize Model with manual hyperparameters
model = BoolQClassifier(
    model_name="bert-large-cased",
    learning_rate=2e-5,
    hidden_dim=256,
    dropout_rate=0.3
)

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')
checkpoint = ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=1, filename='best_model')

# Trainer
trainer = Trainer(
    max_epochs=100,
    callbacks=[early_stopping, checkpoint],
    accelerator='gpu',
    devices=1  # Use GPU if available
)

# Train
trainer.fit(model, datamodule=data_module)

def objective(trial: optuna.Trial) -> float:
    # Suggest hyperparameters
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
    hidden_dim = trial.suggest_int('hidden_dim', 128, 512, step=64)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.5)

    # Initialize Model
    model = BoolQClassifier(
        model_name="bert-large-cased",
        learning_rate=learning_rate,
        hidden_dim=hidden_dim,
        dropout_rate=dropout_rate
    )

    # Callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')
    checkpoint = ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=1, filename='best_model')
    pruning_callback = PyTorchLightningPruningCallback(trial, monitor='val_loss')

    # Trainer
    trainer = Trainer(
        max_epochs=100,
        callbacks=[early_stopping, checkpoint, pruning_callback],
        accelerator='gpu',
        devices=1,
        logger=False  # Disable logging for Optuna trials
    )

    # Train
    trainer.fit(model, datamodule=data_module)

    # Retrieve best score
    return trainer.callback_metrics['val_acc'].item()

# Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/blackbook/anaconda3/envs/nlp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name       | Type       | Params | Mode 
--------------------------------------------------
0 | bert       | BertModel  | 333 M  | eval 
1 | classifier | Sequential | 262 K  | train
2 | loss_fn    | BCELoss    | 0      | train
--------------------------------------------------
333 M     Trainable params
0         Non-trainable pa

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/blackbook/anaconda3/envs/nlp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/blackbook/anaconda3/envs/nlp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined