# For Fine-Tuning, AutoTokenizer concepts are very IMPORTANT

In [2]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the uncased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

input_text = "Lets try to Tokenize!"

# Echo the raw text, followed by a blank separator line.
print("Input Text: ", input_text, "", sep="\n")

# Calling the tokenizer on a string encodes it; the result carries
# input_ids, token_type_ids and attention_mask.
tokens = tokenizer(input_text)
print("Tokenzied Output:", tokens, "", sep="\n")

# Map the ids back to text to see what the model actually receives,
# including the special tokens wrapped around the sentence.
print("Decoded Text Output:", tokenizer.decode(tokens["input_ids"]), sep="\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Input Text: 
Lets try to Tokenize!

Tokenzied Output:
{'input_ids': [101, 11082, 3046, 2000, 19204, 4697, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

Decoded Text Output:
[CLS] lets try to tokenize! [SEP]


**input_ids**: These are the integer IDs of tokens from the tokenizer's vocabulary. Each ID maps to a word or subword. The input IDs are often the only required input that must be passed to the model.

**attention_mask**: It tells the model which tokens to attend to (1) and which to ignore (0 for padding).

**token_type_ids:** It identifies which segment a token belongs to when there is more than one sequence.


**Special Token Properties:** Tokenizers automatically add special tokens such as [CLS], [SEP], and [PAD] during tokenization. These properties store the string representation of these tokens and their corresponding numerical IDs in the vocabulary.

In [3]:
# Encode two sequences together to see how token_type_ids separates them.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"

# Passing two strings encodes them as a single sentence pair.
encoded_dict = tokenizer(sequence_a, sequence_b)
decoded = tokenizer.decode(encoded_dict["input_ids"])

# Segment ids: 0 for the first sequence, 1 for the second.
print("token_type_ids:", encoded_dict["token_type_ids"], "", sep="\n")

# The pair decoded back to text, special tokens included.
print("Decoded Text Output:", decoded, sep="\n")

token_type_ids:
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

Decoded Text Output:
[CLS] huggingface is based in nyc [SEP] where is huggingface based? [SEP]


# return_tensors Argument
Finally, you want the tokenizer to return the actual tensors that get fed to the model.

To do that, we can set the `return_tensors` parameter to either 'pt' for PyTorch or 'tf' for TensorFlow. It also supports n-dimensional arrays ('np': returns NumPy `np.ndarray` objects).

Tensor: A tensor is similar to a multi-dimensional array.

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Sample sentence reused by the return_tensors cells that follow.
input_text = "Lets try to tokenize!"
print("Input Text: ", input_text, sep="\n")

Input Text: 
Lets try to tokenize!


In [5]:
# No return_tensors argument: the ids come back as a plain Python list.
encoded_input = tokenizer(input_text)

print("Input IDs", encoded_input["input_ids"], "", sep="\n")
print("output Type: ", type(encoded_input["input_ids"]), sep="\n")

Input IDs
[101, 11082, 3046, 2000, 19204, 4697, 999, 102]

output Type: 
<class 'list'>


In [9]:
# return_tensors="pt": the ids come back as a PyTorch tensor.
encoded_input = tokenizer(input_text, return_tensors="pt")

print("Input Ids", encoded_input["input_ids"], "", sep="\n")
print("output type: ", type(encoded_input["input_ids"]), sep="\n")

Input Ids
tensor([[  101, 11082,  3046,  2000, 19204,  4697,   999,   102]])

output type: 
<class 'torch.Tensor'>


In [11]:
# return_tensors="np": the ids come back as a NumPy ndarray.
encoded_input = tokenizer(input_text, return_tensors="np")

print("Input Ids", encoded_input["input_ids"], "", sep="\n")
print("output type: ", type(encoded_input["input_ids"]), sep="\n")

Input Ids
[[  101 11082  3046  2000 19204  4697   999   102]]

output type: 
<class 'numpy.ndarray'>


In [12]:
# return_tensors="tf": the ids come back as a TensorFlow tensor.
encoded_input = tokenizer(input_text, return_tensors="tf")

print("Input Ids", encoded_input["input_ids"], "", sep="\n")
print("output type: ", type(encoded_input["input_ids"]), sep="\n")

Input Ids
tf.Tensor([[  101 11082  3046  2000 19204  4697   999   102]], shape=(1, 8), dtype=int32)

output type: 
<class 'tensorflow.python.framework.ops.EagerTensor'>


# PROPERTIES mostly suited for BERT-based models

In [16]:
from transformers import AutoTokenizer

# Use the same fully qualified Hub id as the other BERT cells in this
# notebook instead of the bare legacy alias "bert-base-uncased".
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# General tokenizer settings: length limit, vocabulary size, implementation
# type, which side gets padded/truncated, and the input names it produces.
print(f"Model MAX Length: {tokenizer.model_max_length}")
print(f"Vocabularu Size: {tokenizer.vocab_size}")
print(f"Is FAST Tokenizer: {tokenizer.is_fast}")
print(f"Padding side: {tokenizer.padding_side}")
print(f"Truncation side: {tokenizer.truncation_side}")
print(f"Model Input Names: {tokenizer.model_input_names}")

# Each special token is exposed both as a string (`*_token`) and as its
# vocabulary id (`*_token_id`).
print("\n----SPECIAL TOKENS----")
print(f"[CLS] Token: '{tokenizer.cls_token}' (ID: {tokenizer.cls_token_id})")
print(f"[SEP] Token: '{tokenizer.sep_token}' (ID:{tokenizer.sep_token_id})")
print(f"[PAD] Token: '{tokenizer.pad_token}' (ID:{tokenizer.pad_token_id})")
print(f"[UNK] Token: '{tokenizer.unk_token}' (ID:{tokenizer.unk_token_id})")
print(f"[MASK] Token: '{tokenizer.mask_token}' (ID:{tokenizer.mask_token_id})")

Model MAX Length: 512
Vocabularu Size: 30522
Is FAST Tokenizer: True
Padding side: right
Truncation side: right
Model Input Names: ['input_ids', 'token_type_ids', 'attention_mask']

----SPECIAL TOKENS----
[CLS] Token: '[CLS]' (ID: 101)
[SEP] Token: '[SEP]' (ID:102)
[PAD] Token: '[PAD]' (ID:0)
[UNK] Token: '[UNK]' (ID:100)
[MASK] Token: '[MASK]' (ID:103)


# Inspecting Tokenization Step by Step

STEP1: split input_text to tokens

STEP2: Convert the tokens to numerical IDs

STEP3: Append special tokens the model expects

STEP4: Decoding input_ids back to words.

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Report the tokenizer's maximum sequence length, then leave a blank line.
print(
    f"Tokenizer's default max_length (model_max_length): {tokenizer.model_max_length}",
    end="\n\n",
)

# STEP 1: split the text into tokens only (no ids, no special tokens yet).
# `input_text` was defined in an earlier cell.
tokens = tokenizer.tokenize(input_text)
print(f"Tokens: {tokens}")

Tokenizer's default max_length (model_max_length): 512

Tokens:  ['lets', 'try', 'to', 'token', '##ize', '!']


In [19]:
# STEP 2: look up the vocabulary id of every token from the previous cell.
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f"Token ID: {input_ids}")

Token ID: [11082, 3046, 2000, 19204, 4697, 999]


In [20]:
# STEP 3: let the tokenizer wrap the raw ids with the special tokens the
# model expects (for BERT the result starts with 101 and ends with 102).
out = tokenizer.prepare_for_model(input_ids)
input_ids_with_special_tokens = out["input_ids"]
print(f"Token ID with special tokens: {input_ids_with_special_tokens}")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token ID with special tokens: [101, 11082, 3046, 2000, 19204, 4697, 999, 102]


In [21]:
# STEP 4: decode the final ids to check how the tokenizer output reads as text.
print(f"Decode Text Output: {tokenizer.decode(input_ids_with_special_tokens)}")

Decode Text Output: [CLS] lets try to tokenize! [SEP]


# EXAMPLE AUTO TOKENIZER FOR "roberta-base" MODEL

In [22]:
from transformers import AutoTokenizer

# Same walkthrough as for BERT, this time with the RoBERTa tokenizer.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Report the tokenizer's maximum sequence length, then leave a blank line.
print(
    f"Tokenizer's default max_length (model_max_length): {tokenizer.model_max_length}",
    end="\n\n",
)

# STEP 1: split the text into tokens. `input_text` comes from an earlier cell.
tokens = tokenizer.tokenize(input_text)
print(f"Tokens: {tokens}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer's default max_length (model_max_length): 512

Tokens: ['L', 'ets', 'Ġtry', 'Ġto', 'Ġtoken', 'ize', '!']


In [23]:
# STEP 2: convert the RoBERTa tokens to their vocabulary ids.
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f"Token ID: {input_ids}")

Token ID: [574, 2580, 860, 7, 19233, 2072, 328]


In [24]:
# STEP 3: add the special tokens the model expects around the raw ids
# (for RoBERTa the result starts with 0 and ends with 2).
out = tokenizer.prepare_for_model(input_ids)
input_ids_with_special_tokens = out["input_ids"]
print(f"Token ID with special tokens: {input_ids_with_special_tokens}")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token ID with special tokens: [0, 574, 2580, 860, 7, 19233, 2072, 328, 2]


In [25]:
# STEP 4: decode the ids (special tokens included) back to text.
print(f"Decode Text Output: {tokenizer.decode(input_ids_with_special_tokens)}")

Decode Text Output: <s>Lets try to tokenize!</s>


# Batching, Padding and Truncation
References: https://huggingface.co/docs/transformers/en/pad_truncation

These concepts are critical for preparing data efficiently for deep learning models, especially when dealing with variable-length sequences like text.

Batching: Rather than passing the entire dataset at once, we give the model a collection of several samples (a batch) at a time.

Padding: The process of adding padding tokens to shorter sequences so that all input sequences in a batch have the same length.

Truncation: The opposite of padding. It means cutting sequences that are too long down to a maximum length.

In [28]:
# Example batch: five sentences of deliberately different lengths, used by
# the padding and truncation cells that follow.
batched_sentences = [
    "But What about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
    # Longest entry of the batch.
    "This is a much longer sentence that will definitely need to be truncated.",
    "Another Sentence.",
]

# Bare final expression so the notebook echoes the list as the cell output.
batched_sentences

['But What about second breakfast?',
 "Don't think he knows about second breakfast, Pip.",
 'What about elevensies?',
 'This is a much longer sentence that will definitely need to be truncated.',
 'Another Sentence.']

In [29]:
from transformers import AutoTokenizer

# Switch back from RoBERTa to the BERT tokenizer for the batching examples.
# The fully qualified Hub id matches the other BERT cells in this notebook
# (previously the bare legacy alias "bert-base-uncased" was used here).
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Upper bound on sequence length that truncation falls back to when no
# explicit max_length is given.
print(f"Tokenizer's default max_length (model_max_length): {tokenizer.model_max_length}")

Tokenizer's default max_length (model_max_length): 512


In [30]:
# PADDING: with padding=True the shorter sentences are filled with [PAD]
# (id 0) so the whole batch fits into one rectangular tensor.
tokenized_inputs = tokenizer(
    batched_sentences,
    padding=True,
    return_tensors="pt",
)

# Inspect the padded ids and the resulting tensor shape.
print("\n---- Processed Inputs (Batch)----")
print("Input IDs:", tokenized_inputs["input_ids"], sep="\n")
print("Shape of Input IDs:", tokenized_inputs["input_ids"].shape)
# Expected shape: (batch_size, length of the longest sequence in the batch)


---- Processed Inputs (Batch)----
Input IDs:
tensor([[  101,  2021,  2054,  2055,  2117,  6350,  1029,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2123,  1005,  1056,  2228,  2002,  4282,  2055,  2117,  6350,
          1010, 28315,  1012,   102,     0,     0],
        [  101,  2054,  2055,  5408, 14625,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2023,  2003,  1037,  2172,  2936,  6251,  2008,  2097,  5791,
          2342,  2000,  2022, 25449,  1012,   102],
        [  101,  2178,  6251,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]])
Shape of Input IDs: torch.Size([5, 16])


In [32]:
# Decode every row of the padded batch to make the [PAD] tokens visible.
print("\n----Decode Sequences----")
for i, input_ids in enumerate(tokenized_inputs["input_ids"]):
    decoded_text = tokenizer.decode(input_ids)
    print("Sequence", f"{i + 1}:", decoded_text)

# The first tensor dimension is the number of sentences, the second the
# common sequence length after padding.
padded_shape = tokenized_inputs["input_ids"].shape
print("\n---- UNDERSTANDING THE SHAPES----")
print("Batch Size (Number of Sentences):", padded_shape[0])
print("Sequence Length (after padding/truncation):", padded_shape[1])


----Decode Sequences----
Sequence 1: [CLS] but what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Sequence 2: [CLS] don ' t think he knows about second breakfast, pip. [SEP] [PAD] [PAD]
Sequence 3: [CLS] what about elevensies? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Sequence 4: [CLS] this is a much longer sentence that will definitely need to be truncated. [SEP]
Sequence 5: [CLS] another sentence. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

---- UNDERSTANDING THE SHAPES----
Batch Size (Number of Sentences): 5
Sequence Length (after padding/truncation): 16


In [34]:
 # TRUNCATION only (no padding): this call is expected to fail.

# The sentences have different lengths, so without padding they cannot be
# stacked into a single tensor. The error is the point of this cell, so it
# is caught and displayed instead of aborting the notebook.
try:
    tokenized_inputs = tokenizer(
        batched_sentences,
        truncation=True,
        return_tensors="pt",
    )
except Exception as e:
    print(f"\nAn error occured: {e}")


An error occured: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


# FIXING THE ABOVE ERROR

When batched_sentences is a list of multiple sentences, they will almost certainly have different lengths. For the tokenizer to combine them into a single, uniform tensor (a batch), all sequences must be of the same length.

*truncation = True*, makes sure no sentence is too long.

*padding = True*, makes sure no sentence is too short (it adds padding tokens).

In [39]:
 # TRUNCATION together with PADDING: every row ends up with the same length.

try:
    tokenized_inputs = tokenizer(
        batched_sentences,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
except Exception as e:
    print(f"\nAn error occured: {e}")
else:
    # Report only when tokenization succeeded; otherwise `tokenized_inputs`
    # would still hold the result of an earlier cell and the output below
    # would be misleading.

    # Lets check the input_ids
    print("\n----Processed Inputs (Batch)----")
    print(f"Input IDs:\n{tokenized_inputs['input_ids']}")
    print(f"Shape of Input IDs: {tokenized_inputs['input_ids'].shape}")
    # Expected shape: (batch_size, common sequence length)

    # Decode each sequence to see the effect of padding/truncation.
    print("\n----Decoded Sequences----")
    for i, input_ids in enumerate(tokenized_inputs['input_ids']):
        decoded_text = tokenizer.decode(input_ids)
        print(f"Sequence {i + 1}: {decoded_text}")

    print("\n----Understanding the Shapes----")
    print(f"Batch Size (Number of Sentences): {tokenized_inputs['input_ids'].shape[0]}")
    print(f"Sequence Length (after padding/truncation): {tokenized_inputs['input_ids'].shape[1]}")


----Processed Inputs (Batch)----
Input IDs:
tensor([[  101,  2021,  2054,  2055,  2117,  6350,  1029,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2123,  1005,  1056,  2228,  2002,  4282,  2055,  2117,  6350,
          1010, 28315,  1012,   102,     0,     0],
        [  101,  2054,  2055,  5408, 14625,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2023,  2003,  1037,  2172,  2936,  6251,  2008,  2097,  5791,
          2342,  2000,  2022, 25449,  1012,   102],
        [  101,  2178,  6251,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]])
Shape of Input IDs: torch.Size([5, 16])

----Decoded Sequences----
Sequence 1: [CLS] but what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Sequence 2: [CLS] don ' t think he knows about second breakfast, pip. [SEP] [PAD] [PAD]
Sequence 3: [CLS] what about elevensies

In [41]:
# We'll use a common BERT tokenizer for demonstration.
from transformers import AutoTokenizer

# Fully qualified Hub id, consistent with the other BERT cells in this
# notebook (previously the bare legacy alias "bert-base-uncased").
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Sentences of different lengths for the fixed-length padding/truncation demo.
texts = [
    "But What about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
    "This is a much longer sentence that will definitely need to be truncated.",
    "Another Sentence.",
]

# A bare `texts` expression used to sit here; mid-cell it displayed nothing
# (only a cell's last expression is echoed), so it was removed.
print(f"Tokenizer's default max_length (model_max_length): {tokenizer.model_max_length}")

Tokenizer's default max_length (model_max_length): 512


In [44]:
# Pad AND truncate every sequence to one fixed, custom length. This is the
# usual setup when the model needs inputs of a fixed size.
print("\n---- Padding and Truncation to max_length=20 (most common)----")
inputs_combined = tokenizer(
    texts,
    max_length=20,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# Show each row's length, raw ids and decoded text; special tokens are kept
# in the decoded text so the padding stays visible.
print("\nInput IDs (combined- all same length):")
for i, ids in enumerate(inputs_combined["input_ids"]):
    print(f"Sequence {i + 1} Length: {len(ids)}. IDs: {ids}", end="\n\n")
    print(f"Decoded: {tokenizer.decode(ids, skip_special_tokens=False)}", end="\n\n")
print("\nFinal Tensor Shape:", inputs_combined["input_ids"].shape, sep="")


---- Padding and Truncation to max_length=20 (most common)----

Input IDs (combined- all same length):
Sequence 1 Length: 20. IDs: tensor([ 101, 2021, 2054, 2055, 2117, 6350, 1029,  102,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

Decoded: [CLS] but what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Sequence 2 Length: 20. IDs: tensor([  101,  2123,  1005,  1056,  2228,  2002,  4282,  2055,  2117,  6350,
         1010, 28315,  1012,   102,     0,     0,     0,     0,     0,     0])

Decoded: [CLS] don ' t think he knows about second breakfast, pip. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Sequence 3 Length: 20. IDs: tensor([  101,  2054,  2055,  5408, 14625,  1029,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

Decoded: [CLS] what about elevensies? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]