In [3]:
import os
from datetime import datetime
from typing import Optional

import datasets
import numpy as np
import pytorch_lightning as pl
import torch
from pytorch_lightning import LightningDataModule
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from sentence_transformers import SentenceTransformer
from torch import optim, nn, utils, Tensor
from torch.utils.data import DataLoader
from torchmetrics.classification import BinaryAccuracy
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Probe the accelerator once so later cells can place tensors on `device`.
available = torch.cuda.is_available()
device = torch.device("cuda:0" if available else "cpu")
device_count = torch.cuda.device_count()
# current_device() and get_device_name() raise when no CUDA device can be
# initialised, so only query them when CUDA is available. This keeps the
# CPU fallback selected for `device` above actually reachable.
curr_device = torch.cuda.current_device() if available else None
device_name = torch.cuda.get_device_name(0) if available else None

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [5]:
# Load the pretrained tokenizer registered under the 'bert-base-uncased' name.
# Every later tokenization cell in this notebook reads this `tokenizer` global.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Show the special-token mapping; for this checkpoint it is
# {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [48]:
# Demo: split one sentence into fixed-size windows of 10 token ids.
# The tokenizer is called directly (the same way `tokenize` does further down)
# instead of through the deprecated `encode_plus` alias; the keyword arguments
# are unchanged.
#   - truncation + return_overflowing_tokens: text beyond max_length is
#     returned as additional windows rather than dropped.
#   - padding='max_length': the last, shorter window is padded up to 10 ids.
#   - the literal '[SEP]' in the text is there to see how a special token
#     written inside the raw string gets encoded.
tokenizer('This is a very long sentence that Im writing now[SEP]',
          add_special_tokens=True,
          padding='max_length',
          truncation=True,
          max_length=10,
          return_token_type_ids=False,
          return_overflowing_tokens=True,
          return_length=True,
          )

{'input_ids': [[101, 2023, 2003, 1037, 2200, 2146, 6251, 2008, 10047, 102], [101, 3015, 2085, 102, 102, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'length': [10, 10], 'overflow_to_sample_mapping': [0, 0]}

In [7]:
# Demo of additive masking before a softmax: positions whose mask value is 0
# receive a large negative offset, so they end up with ~0 probability and the
# remaining positions share the probability mass.
a = torch.tensor([0.25, 0.25, 0.25, 0.25])
# Mask of 1 = keep, 0 = drop; (1 - mask) * -10000 is 0 for kept slots and
# -10000 for dropped ones.
b = (1.0 - torch.tensor([1.0, 1.0, 0.0, 0.0])) * -10000
c = a + b
# `dim` is passed explicitly: relying on the implicit dimension is deprecated
# and makes softmax emit a warning.
nn.functional.softmax(c, dim=-1)

  nn.functional.softmax(c)


tensor([0.5000, 0.5000, 0.0000, 0.0000])

In [9]:
import json

#wikifile = '/home/bjlkeng/devel/cramming/data/wikipedia/wikipedia-en-0.json'
#
#with open(wikifile, 'r') as f:
#  data = json.load(f)

In [11]:
#dir(data[0])

In [12]:
from datasets import load_dataset

#filelist = '/home/bjlkeng/devel/cramming/data/files.csv'
#with open(filelist, "r") as f:
#    files = [prefix + fname.strip() for fname in f.readlines()]

# Root directory of the local text corpora. The default is the original
# location; set the CRAMMING_DATA_DIR environment variable to run the notebook
# against a copy of the data stored elsewhere without editing this cell.
DATA_DIR = os.environ.get('CRAMMING_DATA_DIR',
                          '/home/bjlkeng/devel/cramming/data/data')

# Glob patterns, one per corpus, all rooted at DATA_DIR.
files = [
    f'{DATA_DIR}/pg19_train/*',
    f'{DATA_DIR}/wikipedia/flat/*',
    f'{DATA_DIR}/books3/books3/the-eye.eu/public/Books/Bibliotik/**',
]
# sample_by='document' requests one example per document instead of the text
# loader's default; streaming=True gives back an IterableDataset, so nothing
# is materialised up front.
dataset = load_dataset('text', data_files=files, sample_by='document',
                       split='train', streaming=True)
#except Exception as e:
#    print(e, '\n', i, files[i*batch:(i+1)*batch])

Resolving data files: 100%|██████████| 255412/255412 [00:00<00:00, 367060.61it/s]


In [13]:
# The two sums below are left disabled: each one loops over every row of
# `dataset` just to produce a single number.
#sum([len(row['text']) for row in dataset])
#sum([1 for row in dataset])
# Bare expression so the cell displays the dataset object's repr.
dataset

<datasets.iterable_dataset.IterableDataset at 0x7f2bed1fcfd0>

In [148]:
def tokenize(examples, max_length=10):
    """Tokenize a batch of texts into fixed-length, padded windows.

    Every string in ``examples['text']`` is passed to the notebook-level
    ``tokenizer`` with truncation and ``return_overflowing_tokens=True``, so
    one text can yield several windows. The windows from all texts are
    concatenated in order, which means the output usually has a different
    number of rows than the input batch.

    Args:
        examples: Batch mapping with a ``'text'`` entry holding the strings
            to tokenize.
        max_length: Window length passed to the tokenizer (used for both
            truncation and padding). Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        dict with two parallel lists, ``'input_ids'`` and
        ``'attention_mask'``, containing one entry per window.
    """
    input_ids = []
    attention_masks = []
    for text in examples['text']:
        encoded = tokenizer(text,
                            add_special_tokens=True,
                            padding='max_length',
                            truncation=True,
                            max_length=max_length,
                            return_token_type_ids=False,
                            return_overflowing_tokens=True,)
        # Each call returns a list of windows; flatten them into the batch.
        input_ids.extend(encoded['input_ids'])
        attention_masks.extend(encoded['attention_mask'])
    return {'input_ids': input_ids, 'attention_mask': attention_masks}

def batch_examples(examples):
    """Collapse a batch of rows into one row that holds the whole batch.

    The incoming lists under ``'input_ids'`` and ``'attention_mask'`` are each
    wrapped in an outer one-element list, so the returned batch contains
    exactly one row whose values are the original lists.

    The former debug ``print`` was removed: it wrote a line for every batch
    and indexed the first element, which fails on an empty batch.

    Args:
        examples: Batch mapping with ``'input_ids'`` and ``'attention_mask'``
            lists.

    Returns:
        dict with the same two keys, each mapped to a one-element list
        wrapping the corresponding input list.
    """
    return {'input_ids': [examples['input_ids']],
            'attention_mask': [examples['attention_mask']]}

# Stage 1: tokenize. With batch_size=1 every call to `tokenize` receives a
# single example; the raw 'text' column is dropped from the result.
dataset2 = dataset.map(
    tokenize,
    batched=True,
    remove_columns=['text'],
    batch_size=1,
)
# Stage 2: regroup. `batch_examples` is called on up to 10 tokenized windows
# at a time and folds them into a single row.
dataset3 = dataset2.map(
    batch_examples,
    batched=True,
    batch_size=10,
)
# Display the resulting dataset object.
dataset3

<datasets.iterable_dataset.IterableDataset at 0x7f2c245710f0>

In [149]:
# Pull the first row out of the streaming pipeline as a smoke test.
x = next(iter(dataset3))
# Print the row's keys followed by the number of entries under each field.
print(x.keys(), *(len(x[field]) for field in ('input_ids', 'attention_mask')))
# Dump the full row for visual inspection.
print(x)

A 10
dict_keys(['input_ids', 'attention_mask']) 10 10
{'input_ids': [[101, 1996, 2214, 9025, 1997, 1996, 2332, 2508, 2544, 102], [101, 1997, 1996, 6331, 1996, 2034, 2338, 1997, 9952, 102], [101, 1024, 2170, 11046, 1015, 1024, 1015, 1999, 1996, 102], [101, 2927, 2643, 2580, 1996, 6014, 1998, 1996, 3011, 102], [101, 1012, 1015, 1024, 1016, 1998, 1996, 3011, 2001, 102], [101, 2302, 2433, 1010, 1998, 11675, 1025, 1998, 4768, 102], [101, 2001, 2588, 1996, 2227, 1997, 1996, 2784, 1012, 102], [101, 1998, 1996, 4382, 1997, 2643, 2333, 2588, 1996, 102], [101, 2227, 1997, 1996, 5380, 1012, 1015, 1024, 1017, 102], [101, 1998, 2643, 2056, 1010, 2292, 2045, 2022, 2422, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1,