# Tokenizer & Training Data Inspection

In [1]:
from transformers import AutoTokenizer
import pyarrow.parquet as pq
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## Special Tokens

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Total tokens (with special): {len(tokenizer)}")
print()

special = {
    "bos_token": tokenizer.bos_token,
    "eos_token": tokenizer.eos_token,
    "pad_token": tokenizer.pad_token,
    "unk_token": tokenizer.unk_token,
    "sep_token": tokenizer.sep_token,
    "cls_token": tokenizer.cls_token,
    "mask_token": tokenizer.mask_token,
}

for name, tok in special.items():
    if tok is not None:
        tok_id = tokenizer.convert_tokens_to_ids(tok)
        print(f"{name:>12}: {tok!r:>15}  (id={tok_id})")
    else:
        print(f"{name:>12}: None")

print(f"\nAll special tokens: {tokenizer.all_special_tokens}")
print(f"All special token IDs: {tokenizer.all_special_ids}")

Vocab size: 50257
Total tokens (with special): 50257

   bos_token: '<|endoftext|>'  (id=50256)
   eos_token: '<|endoftext|>'  (id=50256)
   pad_token: None
   unk_token: '<|endoftext|>'  (id=50256)
   sep_token: None
   cls_token: None
  mask_token: None

All special tokens: ['<|endoftext|>']
All special token IDs: [50256]


## Training Data Example

In [3]:
# Load first row from the first training shard (shard_00001, since shard_00000 is val)
train_shard = Path("../data/base_data/shard_00001.parquet")
pf = pq.ParquetFile(train_shard)
table = pf.read_row_group(0, columns=["text"])
raw_text = table["text"][0].as_py()

print(f"Raw text ({len(raw_text)} chars):")
print("=" * 80)
print(raw_text[:1000])
print("=" * 80)

Raw text (978 chars):
How to run or embed an applet
The previous page showed the skeleton for a basic Java applet
from the point of view of the code itself. But we still haven't seen how to run a Java applet,
which usually means how to embed the applet in a web page. For this, the procedure is usually
- compile the applet into a jar, using the jar tool supplied
with the JDK;
- put the jar next to the HTML file of the web page
(i.e. in the same directory) in which the applet will be embedded;
- include an appropriate applet tag in your web page's HTML definition.
Compiling the jar
A jar or Java archive is essentially a glorified ZIP file (in fact,
it is a ZIP file) containing
your Java classes plus other resources used by the applet or application.
If you enjoy this Java programming article, please share with friends and colleagues. Follow the author on Twitter for the latest news and rants.
Editorial page content written by Neil Coffey. Copyright © Javamex UK 2021. All rights reserved.

In [4]:
# Tokenize (same as StreamingParquetDataset)
tokens = tokenizer.encode(raw_text, add_special_tokens=False)
tokens.append(tokenizer.eos_token_id)  # EOS appended between documents

SEQ_LEN = 2048
input_ids = tokens[:SEQ_LEN]
labels = tokens[1:SEQ_LEN + 1]

print(f"Total tokens in document: {len(tokens)}")
print(f"Input sequence length: {len(input_ids)}")
print(f"Label sequence length: {len(labels)}")

Total tokens in document: 237
Input sequence length: 237
Label sequence length: 236


In [5]:
# Show first 50 tokens: ID, token string, and the label it predicts
N = 50
print(f"{'idx':>4}  {'input_id':>9}  {'input_token':>20}  {'label_id':>9}  {'label_token':>20}")
print("-" * 70)
for i in range(N):
    in_id = input_ids[i]
    lb_id = labels[i]
    in_tok = repr(tokenizer.decode([in_id]))
    lb_tok = repr(tokenizer.decode([lb_id]))
    print(f"{i:>4}  {in_id:>9}  {in_tok:>20}  {lb_id:>9}  {lb_tok:>20}")

 idx   input_id           input_token   label_id           label_token
----------------------------------------------------------------------
   0       2437                 'How'        284                 ' to'
   1        284                 ' to'       1057                ' run'
   2       1057                ' run'        393                 ' or'
   3        393                 ' or'      11525              ' embed'
   4      11525              ' embed'        281                 ' an'
   5        281                 ' an'        598                ' app'
   6        598                ' app'       1616                 'let'
   7       1616                 'let'        198                  '\n'
   8        198                  '\n'        464                 'The'
   9        464                 'The'       2180           ' previous'
  10       2180           ' previous'       2443               ' page'
  11       2443               ' page'       3751             ' showed'
  12  

In [6]:
# Decode full input and label sequences back to text
print("INPUT SEQUENCE (decoded):")
print("=" * 80)
print(tokenizer.decode(input_ids))
print("=" * 80)
print()
print("LABEL SEQUENCE (decoded, shifted by 1):")
print("=" * 80)
print(tokenizer.decode(labels))
print("=" * 80)

INPUT SEQUENCE (decoded):
How to run or embed an applet
The previous page showed the skeleton for a basic Java applet
from the point of view of the code itself. But we still haven't seen how to run a Java applet,
which usually means how to embed the applet in a web page. For this, the procedure is usually
- compile the applet into a jar, using the jar tool supplied
with the JDK;
- put the jar next to the HTML file of the web page
(i.e. in the same directory) in which the applet will be embedded;
- include an appropriate applet tag in your web page's HTML definition.
Compiling the jar
A jar or Java archive is essentially a glorified ZIP file (in fact,
it is a ZIP file) containing
your Java classes plus other resources used by the applet or application.
If you enjoy this Java programming article, please share with friends and colleagues. Follow the author on Twitter for the latest news and rants.
Editorial page content written by Neil Coffey. Copyright © Javamex UK 2021. All rights reser