# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [4]:
# DistilBERT (uncased) demo: split a sentence into tokens, then look up the
# vocabulary ID of each token.
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Note: this sample sentence has no space after the comma.
text = "Hello,how are you?"

tokens = tokenizer.tokenize(text)                      # text   -> tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # tokens -> vocabulary IDs

# Print each result on its own labelled line.
for label, values in (("Tokens:", tokens), ("Token IDs:", token_ids)):
    print(label, values)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokens: ['hello', ',', 'how', 'are', 'you', '?']
Token IDs: [7592, 1010, 2129, 2024, 2017, 1029]


In [5]:
# RoBERTa demo: tokenize the sample sentence and map the tokens to vocabulary IDs.
# In the printed tokens, a leading 'Ġ' marks a token that followed a space.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Note: this sample sentence has no space after the comma.
text = "Hello,how are you?"

tokens = tokenizer.tokenize(text)                      # text   -> tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # tokens -> vocabulary IDs

# Print each result on its own labelled line.
for label, values in (("Tokens:", tokens), ("Token IDs:", token_ids)):
    print(label, values)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokens: ['Hello', ',', 'how', 'Ġare', 'Ġyou', '?']
Token IDs: [31414, 6, 9178, 32, 47, 116]


In [6]:
# GPT-2 demo: tokenize the sample sentence and map the tokens to vocabulary IDs.
# In the printed tokens, a leading 'Ġ' marks a token that followed a space.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text = "Hello, how are you?"
tokens = tokenizer.tokenize(text)                      # text   -> tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # tokens -> vocabulary IDs

# Print each result on its own labelled line.
for label, values in (("Tokens:", tokens), ("Token IDs:", token_ids)):
    print(label, values)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokens: ['Hello', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']
Token IDs: [15496, 11, 703, 389, 345, 30]


In [7]:
from transformers import BertTokenizer

# Load the uncased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step by step: text -> tokens -> vocabulary IDs.
# These two calls do not add the special [CLS]/[SEP] tokens.
text = "Hello, how are you?"
tokens = tokenizer.tokenize(text)                      # Breaks text into tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # Converts tokens into their IDs

# Print the results
print("BERT Tokenizer Output:")
print("Tokens:", tokens)
print("Token IDs:", token_ids)

# Full encoding in one step, returned as PyTorch tensors. Calling the tokenizer
# directly is the supported replacement for the deprecated `encode_plus`; the
# result wraps the sentence in the special IDs 101 ([CLS]) and 102 ([SEP]) and
# also carries `token_type_ids` and `attention_mask`.
print("Encoded:", tokenizer(text, return_tensors='pt'))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BERT Tokenizer Output:
Tokens: ['hello', ',', 'how', 'are', 'you', '?']
Token IDs: [7592, 1010, 2129, 2024, 2017, 1029]
Encoded: {'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [8]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example input sentences
sentence1 = "Hello, how are you?"
sentence2 = "I'm fine, thank you."

# Encode the two sentences as one pair: [CLS] sentence1 [SEP] sentence2 [SEP].
# Calling the tokenizer directly is the supported replacement for the
# deprecated `encode_plus` and accepts the same arguments.
#
# NOTE: this pair is longer than 12 tokens, so `truncation=True` shortens it to
# fit `max_length` (the trailing "?" of sentence1 and the end of sentence2 are
# dropped). Because the result already fills all 12 positions, no padding is added.
encoded = tokenizer(
    sentence1,
    sentence2,                     # Provide the second sentence
    add_special_tokens=True,       # Add [CLS], [SEP] tokens
    padding='max_length',          # Pad to a fixed length
    max_length=12,                 # Max length of the sequence
    truncation=True,               # Truncate if input exceeds max length
    return_tensors='pt'            # Return PyTorch tensors
)

# Print the encoded outputs
print("Encoded Input IDs:", encoded['input_ids'])
print("Token Type IDs:", encoded['token_type_ids'])
print("Attention Mask:", encoded['attention_mask'])


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Encoded Input IDs: tensor([[ 101, 7592, 1010, 2129, 2024, 2017,  102, 1045, 1005, 1049, 2986,  102]])
Token Type IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


### 📌 Reminder of Inputs

```python
sentence1 = "Hello, how are you?"
sentence2 = "I'm fine, thank you."
max_length = 12
```

---

## 🧾 1. **Encoded Input IDs**

```python
tensor([[ 101, 7592, 1010, 2129, 2024, 2017,  102, 1045, 1005, 1049, 2986,  102]])
```

These are **token IDs** — numeric representations of subwords, based on BERT's vocabulary.

| Token   | ID   | Meaning                              |
| ------- | ---- | ------------------------------------ |
| `[CLS]` | 101  | Special token marking sentence start |
| `hello` | 7592 | Word from `sentence1`                |
| `,`     | 1010 | Comma                                |
| `how`   | 2129 | Word                                 |
| `are`   | 2024 | Word                                 |
| `you`   | 2017 | Word                                 |
| `[SEP]` | 102  | Separator between sentence1 and 2    |
| `i`     | 1045 | Word from `sentence2`                |
| `'`     | 1005 | Apostrophe split off from `I'm`      |
| `m`     | 1049 | Remaining `m` of `I'm`               |
| `fine`  | 2986 | Word                                 |
| `[SEP]` | 102  | End of second sentence               |

➡️ The sequence includes **\[CLS] + sentence1 + \[SEP] + sentence2 + \[SEP]**.
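➡️ It's exactly 12 tokens because truncation cut the pair down to `max_length=12`: the `?` of sentence1 and the `, thank you.` of sentence2 were dropped.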

---

## 🧭 2. **Token Type IDs**

```python
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])
```

This tells the model **which token belongs to which sentence**:

* **0** = sentence1 (including `[CLS]` and its `[SEP]`)
* **1** = sentence2 (up to its own `[SEP]`)

So:

* Positions 0–6: `[CLS]`, `hello , how are you` (the `?` was truncated), `[SEP]` → sentence 1
* Positions 7–11: `i ' m fine` (the rest was truncated) and `[SEP]` → sentence 2

---

## 🕶️ 3. **Attention Mask**

```python
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
```

* **1** = real token
* **0** = padding

The sentence pair is longer than `max_length=12`, so it was truncated down to exactly 12 tokens. Every position therefore holds a real token: there's **no padding**, and all values are `1`.

---

## ⚠️ Overflowing Tokens Message

> *"Overflowing tokens are not returned..."*

This means that because you're using `truncation=True` with `max_length=12`, any extra tokens are simply cut off. The tokenizer doesn't store or return what it cut. That’s why you see no overflow tokens.

For example, if the input had 14 tokens, the tokenizer would:

* Truncate to 12
* Discard 2
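* **But not tell you which ones were discarded.** For sentence pairs with the default `'longest_first'` strategy, the overflow list stays empty even if you set `return_overflowing_tokens=True`; use `truncation='only_first'` or `truncation='only_second'` together with `return_overflowing_tokens=True` to get the removed tokens back.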

---

### ✅ Summary

| Component        | Meaning                                       |
| ---------------- | --------------------------------------------- |
| `input_ids`      | Numeric form of tokenized text                |
| `token_type_ids` | Distinguishes sentence1 (0) and sentence2 (1) |
| `attention_mask` | 1 for real tokens, 0 for padding              |


In [9]:
# Round trip: turn a list of BERT token IDs back into text.
# Relies on `tokenizer` still being the BERT tokenizer loaded in the previous cell.
token_ids = [
    101,                                  # [CLS]
    7592, 1010, 2129, 2024, 2017, 1029,   # hello , how are you ?
    102,                                  # [SEP]
]

# decode() maps the IDs back to a single string, special tokens included.
decoded_text = tokenizer.decode(token_ids)

print("Decoded Text:", decoded_text)

Decoded Text: [CLS] hello, how are you? [SEP]
