In [None]:
%pip install tiktoken

# GPT-2 Training Data Preparation

Dataset: [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories)

This notebook prepares training data for GPT-2 pretraining.

In [25]:
# notebook cell
from datasets import load_dataset
ds = load_dataset("roneneldan/TinyStories")
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


#### ============ STEP 1: Single Story Memorization Test ============

In [26]:
test_story = df.iloc[0]['text']
print("Test story:")
print(test_story)
print(f"\nLength: {len(test_story)} characters")

Test story:
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.

Length: 701 characters


##### ============ STEP 2: Tokenize with GPT-2 ============

In [27]:
import tiktoken
from pathlib import Path
import struct

# Load GPT-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Tokenize the story
tokens = tokenizer.encode(test_story)

print(f"Token count: {len(tokens)}")
print(f"First 20 tokens: {tokens[:1024]}")

# Verify decoding works
decoded = tokenizer.decode(tokens[:1024])
print(f"\nDecoded first 20: {decoded}")

Token count: 162
First 20 tokens: [3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257, 17598, 287, 607, 2119, 13, 1375, 2993, 340, 373, 2408, 284, 711, 351, 340, 780, 340, 373, 7786, 13, 20037, 2227, 284, 2648, 262, 17598, 351, 607, 1995, 11, 523, 673, 714, 34249, 257, 4936, 319, 607, 10147, 13, 198, 198, 43, 813, 1816, 284, 607, 1995, 290, 531, 11, 366, 29252, 11, 314, 1043, 428, 17598, 13, 1680, 345, 2648, 340, 351, 502, 290, 34249, 616, 10147, 1701, 2332, 1995, 13541, 290, 531, 11, 366, 5297, 11, 20037, 11, 356, 460, 2648, 262, 17598, 290, 4259, 534, 10147, 526, 198, 198, 41631, 11, 484, 4888, 262, 17598, 290, 384, 19103, 262, 4936, 319, 20037, 338, 10147, 13, 632, 373, 407, 2408, 329, 606, 780, 484, 547, 7373, 290, 5742, 1123, 584, 13, 2293, 484, 5201, 11, 20037, 26280, 607, 1995, 329, 7373, 262, 17598, 290, 18682, 607, 10147, 13, 1119, 1111, 2936, 3772, 780, 484, 550, 4888, 290, 3111, 1978, 13]

Decoded first 20: One day, a little girl named Lily found a needle in her room. S

#### Pack stories until you hit a target token count

In [28]:
target_tokens = 10000  # Aim for 10k tokens

packed_stories = []
current_tokens = 0

for idx, story in enumerate(df['text']):
    packed_stories.append(story)
    current_tokens = len(tokenizer.encode("\n\n".join(packed_stories)))
    
    if current_tokens >= target_tokens:
        break

packed_text = "\n\n".join(packed_stories)
tokens = tokenizer.encode(packed_text)

print(f"Packed {len(packed_stories)} stories")
print(f"Total tokens: {len(tokens)}")
print(f"Training pairs: {len(tokens) - 1024}")

Packed 57 stories
Total tokens: 10054
Training pairs: 9030


In [29]:
# Preview the packed text
print("First 500 characters:")
print(packed_text[:500])
print("\n...")
print("\nLast 500 characters:")
print(packed_text[-500:])

First 500 characters:
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them b

...

Last 500 characters:

The lawyer was very confused. He had never seen a little girl so eager to talk to him. He tried to explain but Mia kept on jumping and interrupting.

Soon enough the store manager got involved. He explained to Mia that it was not appropriate to engage with strangers and it was wrong to interrupt people when they were talking.

Mia was very sorry for her behaviour and decided to never do something like this again. She had learned her lesson that 

In [30]:
# ============ STEP 3: Create Training Pairs ============

context_length = 1024

training_pairs = []

# Slide window by 1 token
for i in range(len(tokens) - context_length):
    # Extract 1025 tokens: [input[0:1024], target[1:1025]]
    sequence = tokens[i:i+context_length+1]
    
    training_pairs.append({
        'pair_id': i,
        'tokens': sequence  # 1025 tokens
    })

df_pairs = pd.DataFrame(training_pairs)

print(f"Created {len(df_pairs)} training pairs")
print(f"Each pair: 1025 tokens")
print(f"  - Input:   tokens[0:1024]")
print(f"  - Targets: tokens[1:1025]")

Created 9030 training pairs
Each pair: 1025 tokens
  - Input:   tokens[0:1024]
  - Targets: tokens[1:1025]


In [31]:
# Inspect first training pair
pair = df_pairs.iloc[0]

print("Training Pair 0:")
print("="*60)

# Show input tokens (first 1024)
input_tokens = pair['tokens'][:1024]
print(f"Input: {len(input_tokens)} tokens")
print(f"First 10 tokens: {input_tokens[:10]}")
print(f"Decoded (first 100 chars): {tokenizer.decode(input_tokens[:50])}")

print("\n" + "="*60)

# Show what model should predict at each position
target_tokens = pair['tokens'][1:1025]
print(f"Targets: {len(target_tokens)} tokens")
print(f"First 10 targets: {target_tokens[:10]}")
print(f"Decoded (first 100 chars): {tokenizer.decode(target_tokens[:50])}")

print("\n" + "="*60)
print("Model predicts:")
print(f"  After input[0] → should predict target[0] (token {target_tokens[0]})")
print(f"  After input[1] → should predict target[1] (token {target_tokens[1]})")
print(f"  ...")
print(f"  After input[1023] → should predict target[1023] (token {target_tokens[1023]})")

Training Pair 0:
Input: 1024 tokens
First 10 tokens: [3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257]
Decoded (first 100 chars): One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.


Targets: 1024 tokens
First 10 targets: [1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257, 17598]
Decoded (first 100 chars):  day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.



Model predicts:
  After input[0] → should predict target[0] (token 1110)
  After input[1] → should predict target[1] (token 11)
  ...
  After input[1023] → should predict target[1023] (token 679)


In [32]:
# ============ STEP 4: Save Training Pairs as Binary Files ============

from pathlib import Path
import struct

pairs_dir = Path("./data/training_pairs")
pairs_dir.mkdir(parents=True, exist_ok=True)

print(f"Saving {len(df_pairs)} pairs to {pairs_dir}")

for idx, row in df_pairs.iterrows():
    sequence = row['tokens']  # 1025 tokens
    
    # Save as binary: unsigned 32-bit integers
    pair_file = pairs_dir / f"pair_{idx:05d}.bin"
    with open(pair_file, 'wb') as f:
        f.write(struct.pack(f'{len(sequence)}I', *sequence))
    
    if idx % 1000 == 0:
        print(f"  Saved {idx}/{len(df_pairs)}")

print(f"\n✓ Complete! Saved {len(df_pairs)} binary files")

Saving 9030 pairs to data/training_pairs
  Saved 0/9030
  Saved 1000/9030
  Saved 2000/9030
  Saved 3000/9030
  Saved 4000/9030
  Saved 5000/9030
  Saved 6000/9030
  Saved 7000/9030
  Saved 8000/9030
  Saved 9000/9030

✓ Complete! Saved 9030 binary files


In [33]:
# ============ STEP 5: Verify Binary Files ============

# Read back first file to verify
test_file = pairs_dir / "pair_00000.bin"

with open(test_file, 'rb') as f:
    loaded = list(struct.unpack('1025I', f.read()))

print(f"Loaded {len(loaded)} tokens from {test_file.name}")
print(f"First 10: {loaded[:10]}")
print(f"Matches original: {loaded == df_pairs.iloc[0]['tokens']}")

Loaded 1025 tokens from pair_00000.bin
First 10: [3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257]
Matches original: True


In [34]:
# ============ SUMMARY FOR C CODE ============

print("="*60)
print("TRAINING DATA READY")
print("="*60)
print(f"Total stories packed:  {len(packed_stories)}")
print(f"Total tokens:          {len(tokens)}")
print(f"Training pairs:        {len(df_pairs)}")
print(f"Context length:        {context_length}")
print(f"Tokens per file:       1025")
print(f"Files location:        {pairs_dir}")
print()
print("C Code should:")
print("  1. Load pair_XXXXX.bin (1025 tokens)")
print("  2. int input[1024] = tokens[0:1024]")
print("  3. int targets[1024] = tokens[1:1025]")
print("  4. forward(model, input, 1024)")
print("  5. loss = cross_entropy(model->logits, targets, 1024)")
print("  6. backward(model, targets, 1024)")
print("  7. optimizer_step(model)")
print()
print(f"Expected epochs: 10-50")
print(f"Expected time per epoch: ~{len(df_pairs)} steps")

TRAINING DATA READY
Total stories packed:  57
Total tokens:          10054
Training pairs:        9030
Context length:        1024
Tokens per file:       1025
Files location:        data/training_pairs

C Code should:
  1. Load pair_XXXXX.bin (1025 tokens)
  2. int input[1024] = tokens[0:1024]
  3. int targets[1024] = tokens[1:1025]
  4. forward(model, input, 1024)
  5. loss = cross_entropy(model->logits, targets, 1024)
  6. backward(model, targets, 1024)
  7. optimizer_step(model)

Expected epochs: 10-50
Expected time per epoch: ~9030 steps


In [35]:
# ============ SAVE METADATA ============

# Save metadata for reference
metadata = {
    'num_stories': len(packed_stories),
    'num_tokens': len(tokens),
    'num_pairs': len(df_pairs),
    'context_length': context_length,
    'vocab_size': 50257,
    'tokenizer': 'gpt2'
}

import json
metadata_file = pairs_dir / "metadata.json"
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Saved metadata to {metadata_file}")
print(json.dumps(metadata, indent=2))

Saved metadata to data/training_pairs/metadata.json
{
  "num_stories": 57,
  "num_tokens": 10054,
  "num_pairs": 9030,
  "context_length": 1024,
  "vocab_size": 50257,
  "tokenizer": "gpt2"
}
