In [1]:
# Simplest possible tokenizer: break the sentence apart on whitespace.
tokenized_text = "My name is asif faisal shagoto chowdhury".split(sep=None)
# Show the word list together with how many words it contains.
(tokenized_text, len(tokenized_text))

(['My', 'name', 'is', 'asif', 'faisal', 'shagoto', 'chowdhury'], 7)

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 49.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 401 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [3]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint
# (its files are downloaded on first use, as the progress bars below show).
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Display the tokenizer's repr (vocab size, max length, special tokens).
tokenizer

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
# Same checkpoint loaded through AutoTokenizer; per the repr below this yields
# the fast implementation (is_fast=True) rather than the slow BertTokenizer.

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Calling the tokenizer on raw text returns the full encoding:
# input_ids, token_type_ids and attention_mask.
tokenizer("Using transformers model is easy!")

{'input_ids': [101, 7993, 11303, 1468, 2235, 1110, 3123, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# Write the tokenizer files (config, special-tokens map, vocab, ...) to a local
# folder; the call returns the paths of the files it saved.
tokenizer.save_pretrained("my-bert-tokenizer")

('my-bert-tokenizer/tokenizer_config.json',
 'my-bert-tokenizer/special_tokens_map.json',
 'my-bert-tokenizer/vocab.txt',
 'my-bert-tokenizer/added_tokens.json',
 'my-bert-tokenizer/tokenizer.json')

# The `Tokenization` Pipeline (*encoding*)

In [7]:
# 1. import the tokenizer class
from transformers import AutoTokenizer
# 2. instantiate the tokenizer from the pretrained "bert-base-cased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
# 3. split the raw input text into tokens
#    (words may be broken into sub-word pieces, e.g. 'token' + '##ize' below)
raw_input_text = "Let's try to tokenize!" 

tokens = tokenizer.tokenize(raw_input_text)
tokens

['Let', "'", 's', 'try', 'to', 'token', '##ize', '!']

In [9]:
# 4. convert each token to its integer id in the tokenizer's vocabulary

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2421, 112, 188, 2222, 1106, 22559, 3708, 106]

In [10]:
# 5. add the special tokens so the ids are ready for the model
#    (the output below gains 101 at the start and 102 at the end)
final_inputs = tokenizer.prepare_for_model(input_ids)
final_inputs['input_ids']

[101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102]

In [11]:
# Full encoding: input_ids together with token_type_ids and attention_mask.
final_inputs

{'input_ids': [101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Decode the ids back into text; the added special tokens appear as [CLS] and [SEP].
final_input_text = tokenizer.decode(final_inputs["input_ids"])
final_input_text

"[CLS] Let's try to tokenize! [SEP]"

Not all tokenizers use the same special tokens: RoBERTa, for example, wraps the text in `<s>` … `</s>` instead of BERT's `[CLS]` … `[SEP]`.

In [13]:
# Tokenize the same sentence with the RoBERTa tokenizer for comparison.
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
inputs = roberta_tokenizer("Let's try to tokenize!")

# Here the ids start with 0 and end with 2 (BERT used 101 and 102).
inputs['input_ids']

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

[0, 7939, 18, 860, 7, 19233, 2072, 328, 2]

In [14]:
# Decoding reveals RoBERTa's special tokens: <s> at the start and </s> at the end.
real_inputs = roberta_tokenizer.decode(inputs["input_ids"])
real_inputs

"<s>Let's try to tokenize!</s>"

In [15]:
# The whole pipeline (tokenize -> ids -> special tokens) done by a single call.

single_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
raw_texts = "Let's try to tokenize!"
final_inputs_single = single_tokenizer(raw_texts)

# Side by side: the one-call ids and the ids built step by step earlier.
(final_inputs_single["input_ids"], final_inputs["input_ids"])

([101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102],
 [101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102])

In [16]:
# Confirm that the one-call route and the step-by-step route give identical ids.
final_inputs_single["input_ids"]  == final_inputs["input_ids"]

True

# Multiple sequences (inputs)

In [17]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# One checkpoint name drives both the tokenizer and the classification model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Example sentence, split into tokens by the checkpoint's tokenizer.
sequence = "I want to play badminton tonight very badly!"
tokens = tokenizer.tokenize(sequence)
tokens

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

['i', 'want', 'to', 'play', 'badminton', 'tonight', 'very', 'badly', '!']

In [18]:
# Convert the tokens to vocabulary ids (one id per token; no special tokens
# are added at this step).
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1045, 2215, 2000, 2377, 14618, 3892, 2200, 6649, 999]

In [19]:
# The ids are still a plain Python list at this point, not a tensor.
type(ids)

list

In [20]:
# Wrap the ids in a tensor. Note the result is 1-D: a single sequence with
# no batch dimension.
input_ids = torch.tensor(ids)
input_ids

tensor([ 1045,  2215,  2000,  2377, 14618,  3892,  2200,  6649,   999])

In [21]:
# This is what should be passed to the model instead: the extra brackets add
# a batch dimension, because the transformers library expects a batch of
# sequences rather than a single bare sequence.
torch.tensor([ids])

tensor([[ 1045,  2215,  2000,  2377, 14618,  3892,  2200,  6649,   999]])

In [22]:
# Decoding the ids gives back the sentence (lower-cased, as the output shows).
text_decoded = tokenizer.decode(input_ids)
text_decoded

'i want to play badminton tonight very badly!'

In [23]:
# Passing the 1-D `input_ids` (no batch dimension) makes the model raise an
# IndexError. The error is the point of this cell, so catch it and print it:
# an uncaught exception would abort "Run All" and leave every later cell
# unexecuted.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: ignored

In [None]:
# Inspect the configuration object of the loaded model.
model.config


In [None]:
# Let the tokenizer produce tensors directly ("pt" selects PyTorch tensors),
# then show its output next to the hand-built 1-D `input_ids` for comparison.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
(tokenized_inputs, input_ids)

In [None]:
# Correct approach: hand the model a batch (here, a batch holding one sequence).

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "Tonight , I want to play badminton!"

tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# The extra brackets add the batch dimension the model requires.
# Named `batch_input_ids` rather than `input` so the Python builtin `input()`
# is not shadowed for the rest of the kernel session.
batch_input_ids = torch.tensor([ids])
logits = model(batch_input_ids).logits
print(logits)


In [None]:
# Turn the logits into probabilities by applying softmax over the last dimension.
logits.softmax(dim=-1)

In [None]:
# Label-name -> class-index mapping from the model config, used to read
# which output column belongs to which label.
model.config.label2id

In [None]:
# Full model configuration (the same inspection as earlier in the notebook).
model.config

In [None]:
# Token ids of the current sentence; reused below to build a batch.
ids

In [None]:
# Build a batch containing the same sequence twice (both rows have equal length).
batched_ids = torch.tensor([ids] * 2)
batched_ids

In [None]:

# `batched_ids` is already a tensor, so pass it straight to the model.
# Re-wrapping it in torch.tensor() would only make a needless copy and
# trigger PyTorch's copy-construct UserWarning.
output = model(batched_ids)
print(output.logits)

In [None]:
# Softmax over the last dimension converts each row of logits into probabilities.
prediction = output.logits.softmax(dim=-1)
prediction

In [None]:
# padding: look up the id of the tokenizer's padding token
tokenizer.pad_token_id

In [None]:
# Same batched prediction as a single expression: build the two-row batch,
# run the model, then softmax the logits over the last dimension.

model(torch.tensor([ids, ids])).logits.softmax(dim=-1)