In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models import *

In [2]:
# Summarise the compute environment: whether CUDA is usable, which device
# will be used, and how many CUDA devices are visible.
available = torch.cuda.is_available()
device = torch.device("cuda:0" if available else "cpu")
device_count = torch.cuda.device_count()

# current_device() and get_device_name() raise when no CUDA device is usable,
# so only query them when CUDA is available; otherwise report None so the
# cell still runs on a CPU-only machine.
curr_device = torch.cuda.current_device() if available else None
device_name = torch.cuda.get_device_name(0) if available else None

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [3]:
# Random demo input of shape (2, 2, 4): uniform samples from [0, 1) scaled by 2.
a = torch.rand(2, 2, 4).mul(2)
a

tensor([[[1.2043, 0.2169, 0.5543, 1.4761],
         [0.8260, 1.1556, 1.0174, 1.5874]],

        [[1.8310, 1.7862, 0.7170, 0.3091],
         [1.5134, 1.2704, 0.1828, 0.5689]]])

In [4]:
# Smoke-test: build a freshly initialised SelfAttention layer and apply it to `a`.
# NOTE(review): the meaning of the (4, 2) arguments is defined in src.models —
# presumably embedding size and head count; confirm against that module.
self_attention = SelfAttention(4, 2)
self_attention(a)

tensor([[[-0.3218, -0.3240,  0.0983,  0.0544],
         [-0.3228, -0.3258,  0.0981,  0.0494]],

        [[-0.3574, -0.2369, -0.2491, -0.1151],
         [-0.3571, -0.2377, -0.2487, -0.1162]]], grad_fn=<ViewBackward0>)

In [5]:
# Smoke-test: build a freshly initialised TransformerBlock and apply it to `a`.
# NOTE(review): the meaning of the (4, 2, 8) arguments is defined in src.models —
# presumably embedding size, head count and feed-forward width; confirm.
transformer_block = TransformerBlock(4, 2, 8)
transformer_block(a)

tensor([[[ 1.0629, -1.5632, -0.1381,  0.6384],
         [-0.4127, -1.2894,  0.2517,  1.4504]],

        [[ 1.0825,  0.2708,  0.2823, -1.6356],
         [ 1.7107, -0.6735, -0.7212, -0.3160]]],
       grad_fn=<NativeLayerNormBackward0>)

In [6]:
from pytorch_lightning import LightningModule, Trainer, seed_everything
from src.dataloaders import GLUEDataModule

# Seed via Lightning's seed_everything before building the data module.
seed_everything(42)

# Collect the GLUE data-module settings in one place, then build the module
# and prepare it for the 'fit' stage.
glue_config = dict(
    model_name_or_path='bert-large-uncased',
    task_name='cola',
    train_batch_size=32,
    eval_batch_size=32,
)
dm = GLUEDataModule(**glue_config)
dm.setup('fit')

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1226.64it/s]
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1afe93c2c61c7931.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1f28bd522d35d185.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a8f0d4e0f4309ddf.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f7

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim


class TestModel(LightningModule):
    """Lightning wrapper that trains a ``VanillaBert`` model with a binary
    cross-entropy-with-logits loss.

    All keyword arguments given to the constructor are forwarded unchanged to
    ``VanillaBert``. Batches are expected to be mappings providing at least
    ``'input_ids'`` and ``'labels'``.

    NOTE(review): the loss call assumes the model output has the same shape as
    ``batch['labels']`` — confirm against ``VanillaBert``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.model = VanillaBert(**kwargs)
        # Detached per-step training losses for the current epoch; emptied at
        # the end of every training epoch.
        self.training_step_outputs = []

    def forward(self, **kwargs):
        """Run the wrapped model on ``kwargs['input_ids']``.

        Any other entries of the batch (for example the labels) are ignored.
        """
        return self.model(kwargs['input_ids'])

    def training_step(self, batch, batch_idx):
        """Compute and return the training loss for one batch."""
        outputs = self(**batch)
        loss = F.binary_cross_entropy_with_logits(outputs, batch['labels'].float())
        # Store a detached copy: keeping the graph-attached loss would hold
        # every step's autograd graph in memory for the whole run.
        self.training_step_outputs.append(loss.detach())
        return loss

    def training_epoch_end(self, outputs):
        """Log the mean of the per-step training losses as ``train_loss``."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('train_loss', loss, prog_bar=True)
        # Free the per-step losses so the list does not grow across epochs.
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Compute the validation loss for one batch, returned under ``'loss'``."""
        outputs = self(**batch)
        val_loss = F.binary_cross_entropy_with_logits(outputs, batch['labels'].float())
        return {'loss': val_loss}

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-step validation losses as ``val_loss``."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('val_loss', loss, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters.

        No learning-rate scheduler is configured.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=0.0001, betas=(0.9, 0.999),)
        return optimizer

In [9]:
from pytorch_lightning import LightningModule, Trainer, seed_everything

# Re-seed so model initialisation and training start from the same seed.
seed_everything(42)

model = TestModel(vocab_size=dm.tokenizer.vocab_size, n_blocks=1)

# NOTE(review): the original note here said val_loss is used to pick the best
# model, but no callback monitoring val_loss is passed to this Trainer — confirm.
# NOTE(review): the recorded error below mentions 'n_layers' although this call
# passes n_blocks, so the saved output may be stale — re-run to confirm.
use_cuda = torch.cuda.is_available()
trainer = Trainer(max_epochs=5, accelerator='auto', devices=1 if use_cuda else None)
trainer.fit(model, datamodule=dm)

Global seed set to 42


TypeError: VanillaBert() got an unexpected keyword argument 'n_layers'