In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models import *

In [2]:
# Probe the CUDA runtime once; every CUDA-only query below is guarded by this flag.
available = torch.cuda.is_available()
device = torch.device("cuda:0" if available else "cpu")
device_count = torch.cuda.device_count()
# current_device() / get_device_name() need an initialised CUDA context and raise
# on CPU-only machines, so report None there instead of crashing the cell.
curr_device = torch.cuda.current_device() if available else None
device_name = torch.cuda.get_device_name(0) if available else None

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [3]:
# Toy 2 x 2 x 4 input with values drawn uniformly from [0, 2), reused by the
# smoke-test cells below.
# A private fixed-seed generator makes the displayed tensor reproducible on every
# run without altering the global RNG state.
a = torch.rand(2, 2, 4, generator=torch.Generator().manual_seed(0)) * 2
a

tensor([[[1.5975, 0.4855, 1.4737, 0.7699],
         [1.7126, 1.7917, 0.1524, 0.2950]],

        [[1.2589, 1.3450, 0.1758, 1.2254],
         [0.8413, 1.1939, 1.7003, 1.3048]]])

In [4]:
# Smoke test: construct a new SelfAttention instance and apply it to the toy tensor `a`;
# the displayed result has the same 2 x 2 x 4 shape as `a`.
# NOTE(review): SelfAttention presumably comes from `from src.models import *`, and the
# positional args (4, 2) presumably mean embedding size 4 with 2 heads — confirm there.
SelfAttention(4, 2)(a)

tensor([[[-0.3609, -0.1882, -0.2390, -0.0220],
         [-0.3520, -0.1843, -0.2294, -0.0346]],

        [[-0.2381, -0.2981, -0.2113, -0.2449],
         [-0.2330, -0.2940, -0.2039, -0.2541]]], grad_fn=<ViewBackward0>)

In [5]:
# Smoke test: construct a new TransformerBlock instance and apply it to the toy tensor `a`;
# the displayed result has the same 2 x 2 x 4 shape as `a`.
# NOTE(review): TransformerBlock presumably comes from `from src.models import *`, and the
# positional args (4, 2, 8) presumably mean embedding size, head count and feed-forward
# width — confirm against its definition.
TransformerBlock(4, 2, 8)(a)

tensor([[[ 1.0400, -1.0413, -0.9570,  0.9583],
         [ 0.4257,  0.9135, -1.6912,  0.3521]],

        [[-0.0557,  0.6907, -1.6055,  0.9706],
         [-1.1947,  0.9214, -0.7816,  1.0549]]],
       grad_fn=<NativeLayerNormBackward0>)

In [6]:
from pytorch_lightning import LightningModule, Trainer, seed_everything
from src.dataloaders import GLUEDataModule

# Set the global seed before any data is prepared.
seed_everything(42)

# GLUE data module configured for the CoLA task; train and eval batches are both 32.
dm = GLUEDataModule(
    model_name_or_path="bert-large-uncased",
    task_name="cola",
    train_batch_size=32,
    eval_batch_size=32,
)

# Prepare the datasets for the 'fit' stage.
dm.setup("fit")

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 379.54it/s]
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1afe93c2c61c7931.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1f28bd522d35d185.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a8f0d4e0f4309ddf.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim


class TestModel(LightningModule):
    """Lightning wrapper that trains a ``VanillaBert`` with a BCE-with-logits loss.

    Args:
        adamw_params: Optional ``torch.optim.AdamW`` keyword arguments; any entry
            given here overrides the defaults set in ``configure_optimizers``.
        **kwargs: Forwarded unchanged to the ``VanillaBert`` constructor.

    NOTE(review): ``training_epoch_end`` / ``validation_epoch_end`` are Lightning 1.x
    hooks and were removed in Lightning 2.0; port them to ``on_*_epoch_end`` before
    upgrading.
    """

    def __init__(self, adamw_params=None, **kwargs):
        super().__init__()

        self.model = VanillaBert(**kwargs)
        self.adamw_params = adamw_params

    def forward(self, **kwargs):
        """Run the wrapped model on ``kwargs['input_ids']``; other keys are ignored."""
        return self.model(kwargs['input_ids'])

    def _bce_loss(self, batch):
        """Shared train/val loss: BCE-with-logits of the model output vs ``batch['labels']``."""
        logits = self(**batch)
        return F.binary_cross_entropy_with_logits(logits, batch['labels'].float())

    def training_step(self, batch, batch_idx):
        """Return the loss for one training batch."""
        return self._bce_loss(batch)

    def training_epoch_end(self, outputs):
        """Log the mean of the per-step training losses as ``train_loss``."""
        # Each entry of `outputs` exposes the step loss under the 'loss' key.
        loss = torch.stack([step['loss'] for step in outputs]).mean()
        self.log('train_loss', loss, prog_bar=True)

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the loss for one validation batch as ``{'loss': tensor}``."""
        return {'loss': self._bce_loss(batch)}

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-step validation losses as ``val_loss``."""
        loss = torch.stack([step['loss'] for step in outputs]).mean()
        self.log('val_loss', loss, prog_bar=True)

    def configure_optimizers(self):
        """Build the AdamW optimizer; no learning-rate scheduler is configured."""
        adamw_params = {
            'lr': 0.0001,
            'betas': (0.9, 0.98),
            # NOTE(review): 10e-12 == 1e-11; confirm that 1e-12 was not intended.
            'eps': 10e-12,
            'weight_decay': 0.01,
        }
        # Caller-supplied entries win; None or an empty dict keeps the defaults as-is.
        if self.adamw_params:
            adamw_params.update(self.adamw_params)
        return optim.AdamW(self.model.parameters(), **adamw_params)

In [19]:
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint

# Re-seed so model initialisation and training start from the same global seed.
seed_everything(42)
adamw_params = {
    'lr': 0.0001,
    'betas': (0.9, 0.999),
    # NOTE(review): 10e-8 == 1e-7; confirm that 1e-8 was not intended.
    'eps': 10e-8,
    'weight_decay': 0.01,
}
model = TestModel(vocab_size=dm.tokenizer.vocab_size,
                  n_blocks=1,
                  adamw_params=adamw_params,)

# BK: Using val_loss to pick best model for simplicity here
# Without a callback that monitors 'val_loss' nothing selects a best model, so
# keep the checkpoint with the lowest validation loss explicitly.
best_val_checkpoint = ModelCheckpoint(monitor='val_loss', mode='min')
trainer = Trainer(
    max_epochs=5,
    accelerator='auto',
    devices=1 if torch.cuda.is_available() else None,
    callbacks=[best_val_checkpoint],
)
trainer.fit(model, datamodule=dm)

Global seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1344.47it/s]
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1199.17it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     

Epoch 4: 100%|██████████| 301/301 [00:04<00:00, 61.49it/s, loss=0.532, v_num=37, val_loss=0.659, train_loss=0.528]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 301/301 [00:05<00:00, 57.01it/s, loss=0.532, v_num=37, val_loss=0.659, train_loss=0.528]
