In [5]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

from torch import optim, nn, utils, Tensor
from torchmetrics.classification import BinaryAccuracy
import pytorch_lightning as pl
from transformers import AutoTokenizer
from pytorch_lightning import LightningDataModule

from datetime import datetime
from typing import Optional

import datasets
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from torch.utils.data import DataLoader

In [6]:
available = torch.cuda.is_available()
curr_device = torch.cuda.current_device()
device = torch.device("cuda:0" if available else "cpu") 
device_count = torch.cuda.device_count() 
device_name =  torch.cuda.get_device_name(0)

print(f'Cuda available: {available}')
print(f'Current device: {curr_device}')
print(f'Device: {device}')
print(f'Device count: {device_count}')
print(f'Device name: {device_name}')


Cuda available: True
Current device: 0
Device: cuda:0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# returns {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [10]:
tokenizer.encode_plus('[SEP]', 
                      add_special_tokens=True,
                      padding='max_length',
                      truncation=True,
                      max_length=10,
                      return_token_type_ids=False,
                      return_overflowing_tokens=True,
                      return_length=True,
                      )

{'input_ids': [[101, 102, 102, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], 'length': [10], 'overflow_to_sample_mapping': [0]}

In [9]:
a = torch.Tensor([0.25, 0.25, 0.25, 0.25])
b = (1.0 - torch.Tensor([1, 1, 0, 0])) * -10000
c = a + b
nn.functional.softmax(c)

  nn.functional.softmax(c)


tensor([0.5000, 0.5000, 0.0000, 0.0000])

In [228]:
import json

wikifile = '/home/bjlkeng/devel/cramming/data/wikipedia/wikipedia-en-0.json'

with open(wikifile, 'r') as f:
  data = json.load(f)

In [234]:
dir(data[0])

603315

In [21]:
from datasets import load_dataset

filelist = '/home/bjlkeng/devel/cramming/data/files.csv'
with open(filelist, "r") as f:
    files = [prefix + fname.strip() for fname in f.readlines()]

data_dir = '/home/bjlkeng/devel/cramming/data/'
dataset = load_dataset('text', data_dir=data_dir, sample_by='document', streaming=True)
#except Exception as e:
#    print(e, '\n', i, files[i*batch:(i+1)*batch])

EmptyDatasetError: The directory at /home/bjlkeng/devel/cramming/data/**.txt doesn't contain any data files

In [17]:
with open(files[287], 'r') as f:
    print(f.readlines()[:10])

['\n', 'MAYAN DECEMBER\n', '\n', 'Brenda Cooper\n', '\n', 'To Russell Patrick Cooper  \n', 'Good journeys, little brother\n', '\n', 'Copyright © 2011 by Brenda Cooper.\n', '\n']


In [14]:
def tokenize(examples):
    input_ids = []
    for example in examples['text']:
        tokens = tokenizer.encode_plus(example, 
                                       add_special_tokens=True,
                                       padding='max_length',
                                       truncation=True,
                                       max_length=128,
                                       return_token_type_ids=False,
                                       return_overflowing_tokens=True,)
        input_ids += tokens['input_ids']
    return {'input_ids': input_ids}

dataset2 = dataset.map(tokenize, batched=True, remove_columns=['text'], batch_size=1000)
dataset2

{'train': <datasets.iterable_dataset.IterableDataset at 0x7f6abd906d40>}

In [16]:
#dataset2['train'][0]['input_ids']

batch = list(dataset2['train'])

In [18]:
print(len(batch))
batch[0]

39448


{'input_ids': [101,
  4067,
  2017,
  2005,
  8816,
  2075,
  2023,
  4079,
  1004,
  24253,
  26885,
  1012,
  1008,
  1008,
  1008,
  2131,
  1037,
  2489,
  26885,
  2043,
  2017,
  3693,
  2256,
  5653,
  2075,
  2862,
  1012,
  4606,
  1010,
  2131,
  14409,
  2006,
  2047,
  7085,
  1010,
  9144,
  1010,
  6749,
  9631,
  1010,
  1998,
  2062,
  2013,
  4079,
  1004,
  24253,
  1012,
  11562,
  2917,
  2000,
  3696,
  2039,
  1998,
  2156,
  3408,
  1998,
  3785,
  1012,
  11562,
  2182,
  2000,
  3696,
  2039,
  2525,
  1037,
  4942,
  29234,
  2099,
  1029,
  3073,
  2115,
  10373,
  2153,
  2061,
  2057,
  2064,
  4236,
  2023,
  26885,
  1998,
  4604,
  2017,
  2062,
  1997,
  2054,
  2017,
  2066,
  2000,
  3191,
  1012,
  2017,
  2097,
  3613,
  2000,
  4374,
  7262,
  4107,
  1999,
  2115,
  1999,
  8758,
  1012,
  1001,
  1001,
  8417,
  4958,
  8004,
  24342,
  3166,
  1005,
  1055,
  3602,
  18877,
  2190,
  7226,
  3127,
  1015,
  6315,
  3430,
  3127,
  1016,
  28618,