In [1]:
import numpy as np
import torch
from src.dataloaders import GLUEDataModule
from sentence_transformers import SentenceTransformer

from torch import optim, nn, utils, Tensor
from torchmetrics.classification import BinaryAccuracy
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification
from pytorch_lightning import LightningDataModule

from datetime import datetime
from typing import Optional

import datasets
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the CUDA environment this notebook is running in, falling back to
# CPU when no GPU is usable.
available = torch.cuda.is_available()
device = torch.device("cuda:0" if available else "cpu")
device_count = torch.cuda.device_count()

# current_device() and get_device_name() initialise CUDA and raise when no
# usable GPU is present, so only query them when CUDA is available.
curr_device = torch.cuda.current_device() if available else None
device_name = torch.cuda.get_device_name(0) if available else None

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [3]:
from src.dataloaders import BertDataModule

# Path patterns for the raw text sources. Only the PG-19 training split is
# enabled; the other corpora are kept here, commented out, for reference.
files = [
    '/home/bjlkeng/devel/cramming/data/data/pg19_train/*',
    # '/home/bjlkeng/devel/cramming/data/data/wikipedia/flat/*',
    # '/home/bjlkeng/devel/cramming/data/data/books3/books3/the-eye.eu/public/Books/Bibliotik/**',
]

# Build the data module over the selected sources and run its 'fit' setup.
dm = BertDataModule(
    source_files=files,
    tokenizer_name='bert-base-uncased',
    max_seq_length=128,
)
dm.setup('fit')

Resolving data files: 100%|██████████| 28602/28602 [00:03<00:00, 8734.19it/s]  


In [4]:
# Get the training dataloader from the data module and pull a single batch
# from it for inspection in the following cells. The temporary iterator is
# not bound to a name, so it is released as soon as the batch is taken.
dataloader = dm.train_dataloader()
d = next(iter(dataloader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [5]:
# Attention mask of the sampled batch: overall shape, then the first five rows.
print(d['attention_mask'].size())
d['attention_mask'][:5]

torch.Size([32, 128])


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [6]:
# Token ids of the sampled batch: overall shape, then the first five rows.
print(d['input_ids'].size())
d['input_ids'][:5]

torch.Size([32, 128])


tensor([[  101,   103,  2214,  9025,  1997,  1996,  2332,  2508,  2544,  1997,
          1996,  6331,  1996,  2034, 27513,  1997,  9952,  1024,  2170, 11046,
          1015,  1024,   103,   103,  1996,  2927,  2643,  2580,  1996,  6014,
          1998,  1996,  3011,   103,  1015,  1024,  1016,  1998,  1996,  3011,
          2001,  2302,  2433,  1010,  1998, 11675,  1025,  1998,   103,  2001,
          2588,  1996,  2227,  1997,  1996,  2784,  1012,  1998,  1996,  4382,
          1997,  2643,  2333,  2588,  1996,  2227,  1997,  1996,   103,  1012,
          1015,  1024,  1017,  1998,  2643,  2056,  1010,  2292,   103,  2022,
           103,  1024,  1998,   103,  2001,  2422,  1012,  1015,  1024,  1018,
          1998,  2643,  2387,  1996,  2422,  1010,  2008,  2009,  2001,  2204,
          1024,  1998,  2643,  4055,  1996,   103,   103,  1996,   103,  1012,
          1015,  1024,  1019,  1998,  2643, 10464,  1996,  2422,  2154,  1010,
          1998,  1996,  4768,   103,  2170,  2305,  

In [7]:
# Labels of the sampled batch: overall shape, then the first five rows.
# NOTE(review): -100 presumably marks positions excluded from the loss — confirm.
print(d['labels'].size())
d['labels'][:5]

torch.Size([32, 128])


tensor([[ -100,  1996,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  2338,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  1015,  1999,  -100,  -100,  -100,  -100,  1996,  -100,
          -100,  -100,  -100,  1012,  -100,  -100,  1016,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  4768,  -100,
          2588,  1996,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  5380,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  2045,  -100,
          2422,  -100,  -100,  2045,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  2422,  2013,  -100,  4768,  -100,
          -100,  -100,  -100,  -100,  -100,  2170,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  2002,  -100,  -100,  

In [8]:
# Benchmark raw dataloader throughput over (at most) N batches.
import time
from itertools import islice

N = 10000 - 1  # maximum number of batches to pull from the dataloader

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump.
start = time.perf_counter()
# islice stops after exactly N batches (the previous `if i >= N: break` loop
# fetched one extra batch before breaking), and counting the consumed items
# keeps the report honest if the dataloader runs out before N.
# The batches are discarded, so the sample batch `d` inspected in the earlier
# cells is not overwritten by this loop.
n_batches = sum(1 for _ in islice(dataloader, N))
end = time.perf_counter()

elapsed = end - start
# Guard the rate against a zero-length interval (e.g. an empty dataloader).
rate = n_batches / elapsed if elapsed > 0 else float('nan')
print(f'{n_batches} batches in {elapsed:.2f} seconds ({rate:.2f} batches/s)')

9999 batches in 83.19 seconds (120.19 batches/s)
