In [1]:
from transformers import AutoTokenizer

print("------Downloading Tokenizer------")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
print("------Downloaded Tokenizer------")

text = "The quick brown fox jumped over the fence"
tokens = tokenizer(text).input_ids

print(tokens)

  from .autonotebook import tqdm as notebook_tqdm


------Downloading Tokenizer------
------Downloaded Tokenizer------
[0, 133, 2119, 6219, 23602, 4262, 81, 5, 8146, 2]


In [2]:
from datasets import load_dataset

wiki2 = load_dataset('Salesforce/wikitext', 'wikitext-2-raw-v1')

Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 87664.63 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 546971.73 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 323651.84 examples/s]


In [19]:
def process(batch):
    # tokenizer returns a dict with 'input_ids', 'attention_mask', etc.
    tokens = tokenizer(batch['text'], padding=False, truncation=False)
    input_ids = tokens['input_ids']  # this is a list of lists
    return {
        'token_ids': input_ids,
        'len': [len(t) for t in input_ids]
    }

In [20]:
print(process(text))

TypeError: string indices must be integers, not 'str'

In [10]:
wiki2['train'].features

{'text': Value('string')}

In [21]:
wiki2 = wiki2.map(
    process,
    batched=True,
    remove_columns=['text']
)

Map:  46%|████▌     | 2000/4358 [00:00<00:00, 11865.61 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 4358/4358 [00:00<00:00, 12974.19 examples/s]
Map: 100%|██████████| 36718/36718 [00:03<00:00, 11059.48 examples/s]
Map: 100%|██████████| 3760/3760 [00:00<00:00, 10176.89 examples/s]


In [25]:
wiki2

DatasetDict({
    test: Dataset({
        features: ['token_ids', 'len'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['token_ids', 'len'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['token_ids', 'len'],
        num_rows: 3760
    })
})

In [35]:
import numpy as np
from tqdm import tqdm


for split, data in wiki2.items():
    tensor_length = tensor_length = sum(len(x) for x in data['token_ids'])
    filename = f"data/{split}.tokens"
    memmap_file = np.memmap(filename=filename, dtype=np.uint16, mode='w+', shape=(tensor_length,))
    idx = 0
    for batch_idx in tqdm(range(64), desc=f"Writing {filename}"):
        batch = data.shard(64, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['token_ids'])
        memmap_file[idx:idx+len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    memmap_file.flush()


Writing data/test.tokens: 100%|██████████| 64/64 [00:02<00:00, 29.06it/s]
Writing data/train.tokens: 100%|██████████| 64/64 [00:15<00:00,  4.14it/s]
Writing data/validation.tokens: 100%|██████████| 64/64 [00:01<00:00, 35.11it/s]


/home/tororo.in/Desktop/projects/bert
