In [1]:
from datasets import load_dataset

In [2]:
# Load the "mnli" configuration of the "glue" dataset; the result is a
# DatasetDict holding one Dataset per split.
raw_datasets = load_dataset("glue", "mnli")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [3]:
# Show the available splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [4]:
from transformers import AutoTokenizer

In [5]:
# bert-base-cased tokenizer loaded with default arguments; the identical call
# later in this notebook reports `is_fast == True`, hence the name.
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [10]:
def tokenize_with_fast(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level `fast_tokenizer`.

    `examples` is indexed by the "premise" and "hypothesis" keys; this notebook
    passes the function to `raw_datasets.map` both with and without
    `batched=True`. Truncation is enabled on the tokenizer call.

    Returns whatever `fast_tokenizer` returns for the pair.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    return fast_tokenizer(premises, hypotheses, truncation=True)

In [11]:
# Same checkpoint as the fast tokenizer, but requested with use_fast=False so
# the two implementations can be timed against each other.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

In [12]:
def tokenize_with_slow(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level `slow_tokenizer`.

    `examples` is indexed by the "premise" and "hypothesis" keys; this notebook
    passes the function to `raw_datasets.map` both with and without
    `batched=True`. Truncation is enabled on the tokenizer call.

    Returns whatever `slow_tokenizer` returns for the pair.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    return slow_tokenizer(premises, hypotheses, truncation=True)

In [13]:
# Timing 1/4: fast tokenizer, map called without batched=True.
# NOTE: every timing cell in this notebook assigns to `tokenized_datasets`,
# so only the most recently executed one survives.
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

CPU times: user 4min 50s, sys: 7.43 s, total: 4min 57s
Wall time: 5min 43s


In [14]:
# Timing 2/4: slow tokenizer, map called without batched=True.
# NOTE: rebinds `tokenized_datasets`, like the other timing cells.
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

CPU times: user 8min 30s, sys: 15.6 s, total: 8min 45s
Wall time: 9min 10s


In [15]:
# Timing 3/4: fast tokenizer with batched=True.
# NOTE: rebinds `tokenized_datasets`, like the other timing cells.
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

CPU times: user 3min 33s, sys: 1.67 s, total: 3min 34s
Wall time: 1min 16s


In [16]:
# Timing 4/4: slow tokenizer with batched=True.
# NOTE: rebinds `tokenized_datasets`, like the other timing cells.
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

CPU times: user 5min 25s, sys: 1.91 s, total: 5min 27s
Wall time: 5min 30s


In [17]:
from transformers import AutoTokenizer

In [33]:
# Encode one sentence and check what kind of object the tokenizer returns.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Anubhav and I live in Seattle"
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [34]:
# Whether the loaded tokenizer is the fast implementation.
tokenizer.is_fast

True

In [35]:
# The same flag is exposed on the encoding object itself.
encoding.is_fast

True

In [36]:
# Token strings for the encoded sentence, including [CLS]/[SEP] and the
# '##'-prefixed sub-word pieces.
encoding.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'An',
 '##ub',
 '##ha',
 '##v',
 'and',
 'I',
 'live',
 'in',
 'Seattle',
 '[SEP]']

In [37]:
# Word index for each token: pieces of one word share an index (the four
# pieces of "Anubhav" are all 3) and special tokens map to None.
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, None]

In [45]:
# Short input on which the BERT and RoBERTa tokenizers used below disagree
# about word boundaries.
example2 = "81s"

In [46]:
# Encode the short example with bert-base-cased (rebinds `tokenizer` and
# `encoding`).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer(example2)

In [47]:
# BERT pieces for "81s".
encoding.tokens()

['[CLS]', '81', '##s', '[SEP]']

In [48]:
# With BERT both pieces belong to the same word (index 0).
encoding.word_ids()

[None, 0, 0, None]

In [49]:
# Encode the same string with roberta-base for comparison (rebinds
# `tokenizer` and `encoding`).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
encoding = tokenizer(example2)

In [50]:
# RoBERTa pieces for "81s".
encoding.tokens()

['<s>', '81', 's', '</s>']

In [51]:
# With RoBERTa the two pieces are reported as separate words (0 and 1).
encoding.word_ids()

[None, 0, 1, None]

In [52]:
# Character span in the original string covered by word index 1.
start, end = encoding.word_to_chars(1)

In [53]:
# Slice the input with that span to recover the word's text.
example2[start:end]

's'

In [65]:
# Switch back to bert-base-cased for the sentence-pair example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [66]:
# Two sentences to encode as a pair. NOTE: `example2` is rebound here; it
# previously held "81s".
example1 = "I am in Seattle"
example2 = "I am Anubhav"

In [67]:
# Encode both sentences together as one sentence-pair input.
encoding = tokenizer(example1, example2)

In [68]:
# Raw encoding: input_ids, token_type_ids and attention_mask for the pair.
encoding

{'input_ids': [101, 146, 1821, 1107, 5160, 102, 146, 1821, 1760, 10354, 2328, 1964, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [69]:
# Token strings for the pair; a [SEP] closes each sentence.
encoding.tokens()

['[CLS]',
 'I',
 'am',
 'in',
 'Seattle',
 '[SEP]',
 'I',
 'am',
 'An',
 '##ub',
 '##ha',
 '##v',
 '[SEP]']

In [70]:
# Word indices restart at 0 for the second sentence, so an index alone does
# not identify which sentence a token came from.
encoding.word_ids()

[None, 0, 1, 2, 3, None, 0, 1, 2, 2, 2, 2, None]

In [71]:
# Character span of word index 3; only the first sentence has a word 3, and
# the span is used to slice `example1` below.
start, end = encoding.word_to_chars(3)

In [72]:
# Show the (start, end) character offsets.
start, end

(8, 15)

In [73]:
# Recover the word's text from the first sentence.
example1[start: end]

'Seattle'

In [75]:
# Which sentence each token belongs to: 0 for the first, 1 for the second,
# None for special tokens. This disambiguates the repeated word indices that
# word_ids() reports for a sentence pair.
# `sequence_ids` is a method: without the call parentheses the cell only
# displays the bound-method repr instead of the ids.
encoding.sequence_ids()

[None, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, None]