In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the tokenizer whose chat template is explored in the cells below
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# A one-turn conversation: a single user message with a newline inside its content.
messages = [dict(role="user", content="What is 2+2?\nWhat is 4+4?")]

In [3]:
# 2. Render the conversation both ways: as a plain string and as token ids
template_options = {"add_generation_prompt": True}

# tokenize=False -> the formatted prompt as a string
text = tokenizer.apply_chat_template(messages, tokenize=False, **template_options)
print("Raw text:\n", text)

# tokenize=True -> the token ids for the same prompt
token_ids = tokenizer.apply_chat_template(messages, tokenize=True, **template_options)
id_count = len(token_ids)
print("Token count:", id_count)
print("Decoded:", tokenizer.decode(token_ids))

Raw text:
 <|user|>
What is 2+2?
What is 4+4?<|end|>
<|assistant|>

Token count: 18
Decoded: <|user|> What is 2+2?
What is 4+4?<|end|><|assistant|>


In [4]:
# Decode the same ids twice, toggling only whether special tokens are dropped.
decode_variants = (
    ("With special tokens:", False),
    ("Without special tokens:", True),
)
for label, skip_special in decode_variants:
    print(label, tokenizer.decode(token_ids, skip_special_tokens=skip_special))

With special tokens: <|user|> What is 2+2?
What is 4+4?<|end|><|assistant|>
Without special tokens: What is 2+2?
What is 4+4?


In [5]:
# Investigate newline normalization around special tokens.
# Each section prints a heading, the repr() of a string (so "\n" stays visible),
# and then a blank separator line.
repr_sections = (
    ("=== Raw text repr ===", text),
    ("=== Decoded repr ===", tokenizer.decode(token_ids)),
)
for heading, value in repr_sections:
    print(heading)
    print(repr(value))
    print()

# Decode every id on its own to see which tokens carry the newlines.
# NOTE(review): decoding one id at a time appears to hide leading whitespace
# (the recorded output shows '' for id 29871); tokenizer.convert_ids_to_tokens(token_ids)
# may show the raw pieces more faithfully — confirm before relying on this view.
print("=== Token-by-token breakdown ===")
for i, tid in enumerate(token_ids):
    decoded = tokenizer.decode([tid])
    print(f"Token {i}: id={tid:5d} -> {repr(decoded)}")

=== Raw text repr ===
'<|user|>\nWhat is 2+2?\nWhat is 4+4?<|end|>\n<|assistant|>\n'

=== Decoded repr ===
'<|user|> What is 2+2?\nWhat is 4+4?<|end|><|assistant|>'

=== Token-by-token breakdown ===
Token 0: id=32010 -> '<|user|>'
Token 1: id= 1724 -> 'What'
Token 2: id=  338 -> 'is'
Token 3: id=29871 -> ''
Token 4: id=29906 -> '2'
Token 5: id=29974 -> '+'
Token 6: id=29906 -> '2'
Token 7: id=29973 -> '?'
Token 8: id=   13 -> '\n'
Token 9: id= 5618 -> 'What'
Token 10: id=  338 -> 'is'
Token 11: id=29871 -> ''
Token 12: id=29946 -> '4'
Token 13: id=29974 -> '+'
Token 14: id=29946 -> '4'
Token 15: id=29973 -> '?'
Token 16: id=32007 -> '<|end|>'
Token 17: id=32001 -> '<|assistant|>'


In [6]:
# Does decoding the token ids give back exactly the templated string?
roundtrip_text = tokenizer.decode(token_ids)
print("Are they equal?", text == roundtrip_text)
print()
# repr() makes whitespace differences between the two strings explicit.
print("repr of text:", repr(text), "", sep="\n")
print("repr of decoded:", repr(roundtrip_text), sep="\n")

Are they equal? False

repr of text:
'<|user|>\nWhat is 2+2?\nWhat is 4+4?<|end|>\n<|assistant|>\n'

repr of decoded:
'<|user|> What is 2+2?\nWhat is 4+4?<|end|><|assistant|>'


In [7]:
# 3. Load and process a dataset
# Only the first 100 training rows of the "main" configuration are requested.
dataset = load_dataset(
    path="gsm8k",
    name="main",
    split="train[:100]",
)
column_names = dataset.column_names
print("Columns:", column_names)

def add_prompt(row):
    """Attach a single-turn chat conversation built from the row's question.

    Stores a ``"messages"`` entry on ``row``: a one-element list holding a
    user message whose content is ``row["question"]``. The same ``row``
    object is returned, so the input is modified in place.

    Raises:
        KeyError: if ``row`` has no ``"question"`` entry.
    """
    user_turn = {"role": "user", "content": row["question"]}
    row["messages"] = [user_turn]
    return row

# Apply add_prompt to every row, then preview the conversation built for the first one.
dataset = dataset.map(add_prompt)
first_example = dataset[0]
print("First example messages:", first_example["messages"])

Columns: ['question', 'answer']
First example messages: [{'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]


In [8]:
# Show the complete first record (question, answer, and the added messages column);
# left as a bare last expression so the notebook displays it.
dataset[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'messages': [{'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'role': 'user'}]}

In [9]:
# Format the first dataset example with the chat template (string form, generation prompt included).
# Note: this rebinds `text`, which In [3] set to the toy prompt; re-running In [5]/In [6]
# after this cell would compare against this new string instead.
first_conversation = dataset[0]["messages"]
text = tokenizer.apply_chat_template(
    first_conversation,
    tokenize=False,
    add_generation_prompt=True,
)
print("Raw text:\n", text)

Raw text:
 <|user|>
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|end|>
<|assistant|>

