In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models import SelfAttention, TransformerBlock

In [2]:
# Collect basic CUDA diagnostics and pick the device used by the rest of the notebook.
available = torch.cuda.is_available()
device = torch.device("cuda:0" if available else "cpu")
device_count = torch.cuda.device_count()

# current_device() and get_device_name() raise on hosts without a usable CUDA
# runtime, so only query them when CUDA is available; otherwise report None.
curr_device = torch.cuda.current_device() if available else None
device_name = torch.cuda.get_device_name(0) if available else None

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [3]:
# Toy input for the smoke tests below: a (2, 2, 4) tensor of uniform samples
# from [0, 1) scaled by 2, i.e. values in [0, 2).
a = 2 * torch.rand(2, 2, 4)
a

tensor([[[0.9147, 1.4812, 1.2380, 1.6248],
         [0.5050, 1.8656, 0.1772, 0.9409]],

        [[1.2372, 1.0566, 0.9481, 1.8722],
         [1.7367, 0.7827, 1.7367, 1.1785]]])

In [4]:
# Smoke test: run a freshly initialised SelfAttention layer on the toy tensor `a`.
# The displayed result has the same (2, 2, 4) shape as the input.
# NOTE(review): the positional args (4, 2) are presumably (model dim, number of heads) — confirm in src.models.
SelfAttention(4, 2)(a)

tensor([[[-0.3952,  0.6203, -0.0451,  0.1298],
         [-0.3948,  0.6122, -0.0486,  0.1234]],

        [[-0.4051,  0.5366, -0.1203,  0.0498],
         [-0.4057,  0.5405, -0.1191,  0.0528]]], grad_fn=<ViewBackward0>)

In [5]:
# Smoke test: run a freshly initialised TransformerBlock on the toy tensor `a`.
# TransformerModule later in this notebook passes (d_model, n_heads, d_ff) in this
# positional order, so this is d_model=4, n_heads=2, d_ff=8.
# The displayed result keeps the (2, 2, 4) shape and comes out of a LayerNorm (see its grad_fn).
TransformerBlock(4, 2, 8)(a)

tensor([[[ 1.6598, -0.8952, -0.6573, -0.1074],
         [ 0.2433,  1.5264, -0.7353, -1.0344]],

        [[ 1.4367,  0.0652, -1.3849, -0.1169],
         [ 1.6778, -0.1542, -0.7494, -0.7743]]],
       grad_fn=<NativeLayerNormBackward0>)

In [6]:
from pytorch_lightning import LightningModule, Trainer, seed_everything
from src.dataloaders import GLUEDataModule

# Fix the global seed before any data loading happens.
seed_everything(42)

# GLUE data module for the CoLA task, batch size 32 for both train and eval.
# NOTE(review): `model_name_or_path` presumably only selects the tokenizer
# (`dm.tokenizer` is read later for its vocab size) — confirm in src.dataloaders.
dm = GLUEDataModule(
    model_name_or_path='bert-large-uncased',
    task_name='cola',
    train_batch_size=32,
    eval_batch_size=32,
)
dm.setup('fit')

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 358.93it/s]
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1afe93c2c61c7931.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1f28bd522d35d185.arrow
Loading cached processed dataset at /home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a8f0d4e0f4309ddf.arrow
                                                    

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim

class TransformerModule(nn.Module):
    """Token-id encoder that emits one logit per sequence.

    Pipeline: embedding -> LayerNorm -> dropout -> ``n_blocks`` TransformerBlocks
    -> mean pooling over the sequence axis -> linear head with a single output.

    NOTE(review): no positional information is added in this module; unless
    TransformerBlock injects it, the pooled output cannot depend on token
    order — confirm against src.models.

    Args:
        vocab_size: number of rows in the embedding table.
        n_blocks: how many TransformerBlocks are stacked.
        d_model: embedding / hidden width.
        n_heads: forwarded to each TransformerBlock.
        d_ff: forwarded to each TransformerBlock.
        dropout: probability for the embedding dropout; also forwarded to
            each TransformerBlock.
    """

    def __init__(self, vocab_size, n_blocks, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(d_model, n_heads, d_ff, dropout=dropout) for _ in range(n_blocks)]
        )
        self.fc = nn.Linear(d_model, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, attention_mask=None):
        """Compute one logit per sequence.

        Args:
            text: integer token ids; axis 1 is the sequence axis that gets pooled.
            attention_mask: optional tensor with the same leading two dimensions
                as ``text``; non-zero marks real tokens, zero marks padding.
                When given, padded positions are excluded from the mean. When
                omitted, every position is averaged.

        Returns:
            Tensor of logits with the trailing size-1 dimension squeezed away.
        """
        hidden = self.layer_norm(self.embeddings(text))
        # The dropout layer was previously constructed but never applied.
        # It is the identity in eval mode or when the probability is 0.
        hidden = self.dropout(hidden)
        hidden = self.transformer_blocks(hidden)

        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Masked mean: zero out padded positions and divide by the number
            # of real tokens. The clamp avoids 0/0 for an all-padding row.
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

        logits = self.fc(pooled)
        return logits.squeeze(-1)


class TestModel(LightningModule):
    """LightningModule wrapping TransformerModule for binary classification.

    Training and validation both use binary cross-entropy on the raw logits
    against ``batch['labels']``. Epoch-level means are logged as
    ``train_loss`` and ``val_loss``.

    NOTE(review): ``training_epoch_end`` / ``validation_epoch_end`` are
    Lightning 1.x hooks (removed in 2.0); they are kept because the run
    recorded in this notebook uses them successfully.
    """

    def __init__(self, vocab_size, n_blocks, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.model = TransformerModule(vocab_size, n_blocks, d_model, n_heads, d_ff, dropout=dropout)
        # Detached per-step training losses for the current epoch; emptied in
        # training_epoch_end.
        self.training_step_outputs = []

    def forward(self, **kwargs):
        """Return logits for ``kwargs['input_ids']``; other batch entries are ignored."""
        return self.model(kwargs['input_ids'])

    def _bce_loss(self, batch):
        """Binary cross-entropy between the model logits and the float-cast labels."""
        logits = self(**batch)
        return F.binary_cross_entropy_with_logits(logits, batch['labels'].float())

    def training_step(self, batch, batch_idx):
        """Compute and return the training loss for one batch."""
        loss = self._bce_loss(batch)
        # Store a detached copy: keeping the graph-attached tensor in a list
        # that outlives the step would hold on to autograd state needlessly.
        self.training_step_outputs.append(loss.detach())
        return loss

    def training_epoch_end(self, outputs):
        """Log the mean training loss of the epoch and reset the step buffer."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('train_loss', loss, prog_bar=True)
        # Without this the list would grow for the whole training run.
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Compute the validation loss for one batch."""
        val_loss = self._bce_loss(batch)
        return {'loss': val_loss}

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss over all validation batches."""
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('val_loss', loss, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer with a fixed learning rate of 1e-4.

        No learning-rate schedule (warmup or decay) is configured.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=0.0001, betas=(0.9, 0.999),)
        return optimizer

In [47]:
from pytorch_lightning import LightningModule, Trainer, seed_everything

# Re-seed so that weight initialisation and batch order are repeatable for this run.
seed_everything(42)

# 12 blocks, width 768, 12 heads, feed-forward width 4 * 768, dropout disabled.
# The vocabulary size is taken from the data module's tokenizer.
model = TestModel(
    vocab_size=dm.tokenizer.vocab_size,
    n_blocks=12,
    d_model=768,
    n_heads=12,
    d_ff=4 * 768,
    dropout=0.0,
)

# NOTE(review): an earlier comment here said val_loss is used to pick the best
# model, but no checkpoint callback monitoring val_loss is passed to the
# Trainer — confirm whether one was intended.
trainer = Trainer(
    accelerator='auto',
    devices=(1 if torch.cuda.is_available() else None),
    max_epochs=5,
)
trainer.fit(model, datamodule=dm)

Global seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1065.54it/s]
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Found cached dataset glue (/home/bjlkeng/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1142.76it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     

Epoch 4: 100%|██████████| 301/301 [00:14<00:00, 20.36it/s, loss=0.554, v_num=25, val_loss=0.716, train_loss=0.523]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 301/301 [00:15<00:00, 19.53it/s, loss=0.554, v_num=25, val_loss=0.716, train_loss=0.523]
