# Simple Data Processing for Character-level GPT
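This notebook demonstrates clean and simple steps for preparing text data for a character-level GPT model (word-level variants of the same steps, plus verbose debugging versions, follow later in the notebook):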
- Build vocabulary from text
- Tokenize and pad batches
- Prepare input-target pairs for training

In [30]:
# Import Required Libraries
import os
import numpy as np
import tensorflow as tf

In [31]:
# Build vocabulary from text file
def build_vocab(text_path):
    """Build a character-level vocabulary from a UTF-8 text file.

    Args:
        text_path: Path of the text file to read.

    Returns:
        A ``(token_to_id, id_to_token, vocab)`` tuple. ``vocab`` is the sorted
        list of distinct characters found in the file, ``token_to_id`` maps
        each character to its position in ``vocab`` and ``id_to_token`` is the
        inverse mapping.
    """
    with open(text_path, encoding='utf-8') as handle:
        corpus = handle.read()
    # Deduplicate, then sort in place so ids follow the characters' sort order.
    vocab = list(set(corpus))
    vocab.sort()
    id_to_token = dict(enumerate(vocab))
    token_to_id = {symbol: index for index, symbol in id_to_token.items()}
    return token_to_id, id_to_token, vocab
# Location of the training corpus. The original absolute path stays as the
# default; set the GPT_TEXT_PATH environment variable to point the notebook at
# a different file without editing this cell.
text_path = os.environ.get(
    'GPT_TEXT_PATH',
    '/home/akshat/GPT_from_scratch/text_data/jane_austen_clean.txt',
)
token_to_id, id_to_token, vocab = build_vocab(text_path)
print(f'Vocab size: {len(vocab)}')

Vocab size: 38


In [32]:
# Tokenize and pad batches of text
def tokenize_and_pad(text_batch, token_to_id, max_seq_len, pad_value=0):
    """Convert a batch of strings into fixed-length id rows plus an attention mask.

    Every string is mapped character by character through ``token_to_id``,
    truncated to ``max_seq_len`` and right-padded with ``pad_value``.
    Characters missing from ``token_to_id`` are replaced by ``pad_value`` and
    masked out.

    The mask is tracked per position instead of being derived from
    ``token_ids != pad_value``. That comparison wrongly masks any in-vocabulary
    character whose id happens to equal ``pad_value`` (for example, whichever
    character was assigned id 0 when the default ``pad_value=0`` is used).

    Args:
        text_batch: Iterable of strings to tokenize.
        token_to_id: Mapping from character to integer id.
        max_seq_len: Length of every output row.
        pad_value: Id used for padding and for out-of-vocabulary characters.

    Returns:
        ``(token_ids, attention_mask)``, two ``np.int32`` arrays with one row
        per input string. ``attention_mask`` is 1 where the position holds a
        character found in ``token_to_id`` and 0 for padding and unknown
        characters.
    """
    batch_token_ids = []
    batch_mask = []
    for text in text_batch:
        ids, mask = [], []
        for c in text:
            if c in token_to_id:
                ids.append(token_to_id[c])
                mask.append(1)
            else:
                ids.append(pad_value)
                mask.append(0)
        if len(ids) > max_seq_len:
            ids = ids[:max_seq_len]
            mask = mask[:max_seq_len]
        else:
            n_pad = max_seq_len - len(ids)
            ids += [pad_value] * n_pad
            mask += [0] * n_pad
        batch_token_ids.append(ids)
        batch_mask.append(mask)
    token_ids = np.array(batch_token_ids, dtype=np.int32)
    attention_mask = np.array(batch_mask, dtype=np.int32)
    return token_ids, attention_mask
# Demo: tokenize three short strings to a common length of 10 and print the
# resulting id matrix and attention mask.
batch_text = ['Hello', 'World!', 'GPT']
max_seq_len = 10
token_ids, attention_mask = tokenize_and_pad(batch_text, token_to_id, max_seq_len)
print('Token IDs:', token_ids)
print('Attention Mask:', attention_mask)

Token IDs: [[ 0 16 23 23 26  0  0  0  0  0]
 [ 0 26 29 23 15  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
Attention Mask: [[0 1 1 1 1 0 0 0 0 0]
 [0 1 1 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [33]:
# Prepare input-target pairs for training with sliding window
def prepare_training_data(text, token_to_id, context_length=10, pad_value=0):
    """Build next-character training pairs with a stride-1 sliding window.

    The text is mapped character by character through ``token_to_id``
    (characters missing from the mapping become ``pad_value``). Window ``i``
    uses ids ``[i, i + context_length)`` as the input and the same span shifted
    right by one position as the target.

    The result is always two-dimensional: when the text has no more than
    ``context_length`` characters, both arrays have shape
    ``(0, context_length)`` rather than the 1-D empty array ``(0,)`` that
    stacking an empty list would give.

    Args:
        text: String to turn into training windows.
        token_to_id: Mapping from character to integer id.
        context_length: Number of ids per window (expected to be positive).
        pad_value: Id substituted for out-of-vocabulary characters.

    Returns:
        ``(inputs, targets)``, two ``np.int32`` arrays of shape
        ``(max(len(text) - context_length, 0), context_length)``.
    """
    token_ids = np.array(
        [token_to_id.get(c, pad_value) for c in text], dtype=np.int32
    )
    num_windows = max(len(token_ids) - context_length, 0)
    # window_index[i, j] == i + j, i.e. row i selects ids i .. i+context_length-1.
    window_index = (
        np.arange(num_windows)[:, None] + np.arange(context_length)[None, :]
    )
    inputs = token_ids[window_index]
    # Largest index touched is len(token_ids) - 1, so the shift stays in bounds.
    targets = token_ids[window_index + 1]
    return inputs, targets
# Demo: build sliding windows of 5 characters over a short string and print
# the input rows next to their one-step-shifted target rows.
inputs, targets = prepare_training_data('Hello GPT World!', token_to_id, context_length=5)
print('Inputs:', inputs)
print('Targets:', targets)

Inputs: [[ 0 16 23 23 26]
 [16 23 23 26  1]
 [23 23 26  1  0]
 [23 26  1  0  0]
 [26  1  0  0  0]
 [ 1  0  0  0  1]
 [ 0  0  0  1  0]
 [ 0  0  1  0 26]
 [ 0  1  0 26 29]
 [ 1  0 26 29 23]
 [ 0 26 29 23 15]]
Targets: [[16 23 23 26  1]
 [23 23 26  1  0]
 [23 26  1  0  0]
 [26  1  0  0  0]
 [ 1  0  0  0  1]
 [ 0  0  0  1  0]
 [ 0  0  1  0 26]
 [ 0  1  0 26 29]
 [ 1  0 26 29 23]
 [ 0 26 29 23 15]
 [26 29 23 15  2]]


In [34]:
# Word-level vocabulary and tokenization (with max_words limit)
from collections import Counter
def build_word_vocab(text_path, max_words=10000):
    """Build a frequency-ranked word vocabulary from a UTF-8 text file.

    The file is split on whitespace and the ``max_words`` most frequent words
    are kept; a word's id is its frequency rank (0 = most frequent).

    Args:
        text_path: Path of the text file to read.
        max_words: Maximum number of words to keep.

    Returns:
        A ``(token_to_id, id_to_token, vocab)`` tuple: the word -> id mapping,
        its inverse, and the list of kept words in rank order.
    """
    with open(text_path, encoding='utf-8') as handle:
        corpus = handle.read()
    ranked = Counter(corpus.split()).most_common(max_words)
    vocab = [entry[0] for entry in ranked]
    id_to_token = dict(enumerate(vocab))
    token_to_id = {word: index for index, word in id_to_token.items()}
    print(f'Vocabulary size (limited): {len(vocab)}')
    return token_to_id, id_to_token, vocab
# Build the word-level vocabulary from the same corpus file, capped at the
# 10,000 most frequent words, and report its size.
word_token_to_id, word_id_to_token, word_vocab = build_word_vocab(text_path, max_words=10000)
print(f'Word vocab size: {len(word_vocab)}')

def tokenize_and_pad_words(text_batch, token_to_id, max_seq_len, pad_value=0):
    """Convert a batch of strings into fixed-length word-id rows plus a mask.

    Every string is split on whitespace, each word is mapped through
    ``token_to_id``, and the row is truncated to ``max_seq_len`` and
    right-padded with ``pad_value``. Words missing from ``token_to_id`` are
    replaced by ``pad_value`` and masked out.

    The mask is tracked per position instead of being derived from
    ``token_ids != pad_value``. That comparison wrongly masks any in-vocabulary
    word whose id equals ``pad_value``; with a frequency-ranked vocabulary and
    the default ``pad_value=0`` that is the most frequent word.

    Args:
        text_batch: Iterable of strings to tokenize.
        token_to_id: Mapping from word to integer id.
        max_seq_len: Length of every output row.
        pad_value: Id used for padding and for out-of-vocabulary words.

    Returns:
        ``(token_ids, attention_mask)``, two ``np.int32`` arrays with one row
        per input string. ``attention_mask`` is 1 where the position holds a
        word found in ``token_to_id`` and 0 for padding and unknown words.
    """
    batch_token_ids = []
    batch_mask = []
    for text in text_batch:
        ids, mask = [], []
        for w in text.split():
            if w in token_to_id:
                ids.append(token_to_id[w])
                mask.append(1)
            else:
                ids.append(pad_value)
                mask.append(0)
        if len(ids) > max_seq_len:
            ids = ids[:max_seq_len]
            mask = mask[:max_seq_len]
        else:
            n_pad = max_seq_len - len(ids)
            ids += [pad_value] * n_pad
            mask += [0] * n_pad
        batch_token_ids.append(ids)
        batch_mask.append(mask)
    token_ids = np.array(batch_token_ids, dtype=np.int32)
    attention_mask = np.array(batch_mask, dtype=np.int32)
    return token_ids, attention_mask
# Demo: tokenize three short phrases at word level to a common length of 6 and
# print the resulting id matrix and attention mask.
batch_text_words = ['Hello world', 'GPT is great', 'Word level']
max_seq_len_words = 6
word_token_ids, word_attention_mask = tokenize_and_pad_words(batch_text_words, word_token_to_id, max_seq_len_words)
print('Word Token IDs:', word_token_ids)
print('Word Attention Mask:', word_attention_mask)

Vocabulary size (limited): 10000
Word vocab size: 10000
Word Token IDs: [[   0  416    0    0    0    0]
 [   0   23   92    0    0    0]
 [   0 8602    0    0    0    0]]
Word Attention Mask: [[0 1 0 0 0 0]
 [0 1 1 0 0 0]
 [0 1 0 0 0 0]]


In [35]:
# Prepare word-level input-target pairs for training with sliding window (verbose for debugging)
def prepare_word_training_data(text, token_to_id, context_length=6, pad_value=0):
    """Build next-word training pairs with a stride-1 sliding window (verbose).

    The text is split on whitespace and each word is mapped through
    ``token_to_id`` (words missing from the mapping become ``pad_value``).
    Window ``i`` uses ids ``[i, i + context_length)`` as the input and the same
    span shifted right by one position as the target. Every intermediate step
    is printed for debugging.

    The returned arrays are always two-dimensional: when the text has no more
    than ``context_length`` words, both have shape ``(0, context_length)``
    rather than the 1-D empty array ``(0,)``.

    Args:
        text: String to turn into training windows.
        token_to_id: Mapping from word to integer id.
        context_length: Number of ids per window (expected to be positive).
        pad_value: Id substituted for out-of-vocabulary words.

    Returns:
        ``(inputs, targets)``, two ``np.int32`` arrays of shape
        ``(num_windows, context_length)``.
    """
    print(f'Input text: {text}')
    words = text.split()
    print(f'Split words: {words}')
    word_ids = [token_to_id.get(w, pad_value) for w in words]
    print(f'Word IDs: {word_ids}')
    inputs, targets = [], []
    for i in range(0, len(word_ids) - context_length):
        input_seq = word_ids[i:i+context_length]
        target_seq = word_ids[i+1:i+context_length+1]
        print(f'Window {i}:')
        print(f'  Input IDs: {input_seq}')
        print(f'  Target IDs: {target_seq}')
        inputs.append(input_seq)
        targets.append(target_seq)
    num_windows = len(inputs)
    # Explicit reshape keeps the (num_windows, context_length) layout even when
    # no window was produced; it is a no-op otherwise.
    inputs = np.array(inputs, dtype=np.int32).reshape(num_windows, context_length)
    targets = np.array(targets, dtype=np.int32).reshape(num_windows, context_length)
    print('Final input array:', inputs)
    print('Final target array:', targets)
    return inputs, targets
# Demo: build sliding windows of 4 words over a short sentence; the function
# itself prints every intermediate step before the final arrays are shown.
word_inputs, word_targets = prepare_word_training_data('Hello world this is a word-level GPT example', word_token_to_id, context_length=4)
print('Word Inputs:', word_inputs)
print('Word Targets:', word_targets)

Input text: Hello world this is a word-level GPT example
Split words: ['Hello', 'world', 'this', 'is', 'a', 'word-level', 'GPT', 'example']
Word IDs: [0, 416, 42, 23, 4, 0, 0, 3357]
Window 0:
  Input IDs: [0, 416, 42, 23]
  Target IDs: [416, 42, 23, 4]
Window 1:
  Input IDs: [416, 42, 23, 4]
  Target IDs: [42, 23, 4, 0]
Window 2:
  Input IDs: [42, 23, 4, 0]
  Target IDs: [23, 4, 0, 0]
Window 3:
  Input IDs: [23, 4, 0, 0]
  Target IDs: [4, 0, 0, 3357]
Final input array: [[  0 416  42  23]
 [416  42  23   4]
 [ 42  23   4   0]
 [ 23   4   0   0]]
Final target array: [[ 416   42   23    4]
 [  42   23    4    0]
 [  23    4    0    0]
 [   4    0    0 3357]]
Word Inputs: [[  0 416  42  23]
 [416  42  23   4]
 [ 42  23   4   0]
 [ 23   4   0   0]]
Word Targets: [[ 416   42   23    4]
 [  42   23    4    0]
 [  23    4    0    0]
 [   4    0    0 3357]]


In [36]:
# Debugging: Show every step for character-level processing
def build_vocab_debug(text_path):
    """Build a character-level vocabulary, printing every intermediate step.

    Args:
        text_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(token_to_id, id_to_token, vocab)`` tuple. ``vocab`` is the sorted
        list of distinct characters in the file, ``token_to_id`` maps each
        character to its position in ``vocab`` and ``id_to_token`` is the
        inverse mapping.
    """
    print(f'Reading file: {text_path}')
    with open(text_path, encoding='utf-8') as handle:
        corpus = handle.read()
    print('Raw text sample:', corpus[:100])
    # Deduplicate, then sort in place so ids follow the characters' sort order.
    vocab = list(set(corpus))
    vocab.sort()
    print('Vocabulary:', vocab)
    id_to_token = dict(enumerate(vocab))
    token_to_id = {symbol: index for index, symbol in id_to_token.items()}
    print('Token to ID mapping:', token_to_id)
    print('ID to Token mapping:', id_to_token)
    print(f'Vocab size: {len(vocab)}')
    return token_to_id, id_to_token, vocab
# Rebuild the character vocabulary with the verbose variant; this rebinds the
# notebook-level token_to_id / id_to_token / vocab names.
token_to_id, id_to_token, vocab = build_vocab_debug(text_path)

def tokenize_and_pad_debug(text_batch, token_to_id, max_seq_len, pad_value=0):
    """Verbose character-level tokenize-and-pad that prints every step.

    Every string is mapped character by character through ``token_to_id``,
    truncated to ``max_seq_len`` and right-padded with ``pad_value``.
    Characters missing from ``token_to_id`` are replaced by ``pad_value`` and
    masked out.

    The mask is tracked per position instead of being derived from
    ``token_ids != pad_value``. That comparison wrongly masks any in-vocabulary
    character whose id happens to equal ``pad_value``.

    Args:
        text_batch: Iterable of strings to tokenize.
        token_to_id: Mapping from character to integer id.
        max_seq_len: Length of every output row.
        pad_value: Id used for padding and for out-of-vocabulary characters.

    Returns:
        ``(token_ids, attention_mask)``, two ``np.int32`` arrays with one row
        per input string. ``attention_mask`` is 1 where the position holds a
        character found in ``token_to_id`` and 0 for padding and unknown
        characters.
    """
    print('Batch to tokenize:', text_batch)
    batch_token_ids = []
    batch_mask = []
    for text in text_batch:
        ids, mask = [], []
        for c in text:
            if c in token_to_id:
                ids.append(token_to_id[c])
                mask.append(1)
            else:
                ids.append(pad_value)
                mask.append(0)
        print(f'Text: {text} -> IDs: {ids}')
        if len(ids) > max_seq_len:
            ids = ids[:max_seq_len]
            mask = mask[:max_seq_len]
            print(f'Truncated IDs: {ids}')
        else:
            n_pad = max_seq_len - len(ids)
            ids += [pad_value] * n_pad
            mask += [0] * n_pad
            print(f'Padded IDs: {ids}')
        batch_token_ids.append(ids)
        batch_mask.append(mask)
    token_ids = np.array(batch_token_ids, dtype=np.int32)
    attention_mask = np.array(batch_mask, dtype=np.int32)
    print('Token IDs array:', token_ids)
    print('Attention mask array:', attention_mask)
    return token_ids, attention_mask
# Demo: run the verbose tokenizer on three short strings padded to length 10;
# all output comes from the prints inside the function.
batch_text = ['Hello', 'World!', 'GPT']
max_seq_len = 10
token_ids, attention_mask = tokenize_and_pad_debug(batch_text, token_to_id, max_seq_len)

Reading file: /home/akshat/GPT_from_scratch/text_data/jane_austen_clean.txt
Raw text sample:  sir walter elliot, of kellynch hall, in somersetshire, was a man who,
for his own amusement, never 
Vocabulary: ['\n', ' ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Token to ID mapping: {'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'k': 22, 'l': 23, 'm': 24, 'n': 25, 'o': 26, 'p': 27, 'q': 28, 'r': 29, 's': 30, 't': 31, 'u': 32, 'v': 33, 'w': 34, 'x': 35, 'y': 36, 'z': 37}
ID to Token mapping: {0: '\n', 1: ' ', 2: '!', 3: "'", 4: '(', 5: ')', 6: ',', 7: '-', 8: '.', 9: ':', 10: ';', 11: '?', 12: 'a', 13: 'b', 14: 'c', 15: 'd', 16: 'e', 17: 'f', 18: 'g', 19: 'h', 20: 'i', 21: 'j', 22: 'k', 23: 'l', 24: 'm', 25: 'n', 

In [37]:
# Debugging: Show every step for word-level processing
def build_word_vocab_debug(text_path, max_words=10000):
    """Build a frequency-ranked word vocabulary, printing every step.

    The file is split on whitespace and the ``max_words`` most frequent words
    are kept; a word's id is its frequency rank (0 = most frequent). Samples of
    the first 20 entries are printed along the way.

    Args:
        text_path: Path of the UTF-8 text file to read.
        max_words: Maximum number of words to keep.

    Returns:
        A ``(token_to_id, id_to_token, vocab)`` tuple: the word -> id mapping,
        its inverse, and the list of kept words in rank order.
    """
    print(f'Reading file: {text_path}')
    with open(text_path, encoding='utf-8') as handle:
        corpus = handle.read()
    print('Raw text sample:', corpus[:100])
    tokens = corpus.split()
    print(f'Total words: {len(tokens)}')
    ranked = Counter(tokens).most_common(max_words)
    vocab = [entry[0] for entry in ranked]
    # The 20 highest-ranked words are exactly the head of the vocabulary list.
    head = vocab[:20]
    print(f'Most common words: {head}')
    print(f'Vocabulary (limited to {max_words}):', head, '...')
    id_to_token = dict(enumerate(vocab))
    token_to_id = {word: index for index, word in id_to_token.items()}
    print('Token to ID mapping (sample):', {word: index for index, word in enumerate(head)})
    print('ID to Token mapping (sample):', dict(enumerate(head)))
    print(f'Vocabulary size (limited): {len(vocab)}')
    return token_to_id, id_to_token, vocab
# Rebuild the word vocabulary with the verbose variant; this rebinds the
# notebook-level word_token_to_id / word_id_to_token / word_vocab names.
word_token_to_id, word_id_to_token, word_vocab = build_word_vocab_debug(text_path, max_words=10000)

def tokenize_and_pad_words_debug(text_batch, token_to_id, max_seq_len, pad_value=0):
    """Verbose word-level tokenize-and-pad that prints every step.

    Every string is split on whitespace, each word is mapped through
    ``token_to_id``, and the row is truncated to ``max_seq_len`` and
    right-padded with ``pad_value``. Words missing from ``token_to_id`` are
    replaced by ``pad_value`` and masked out.

    The mask is tracked per position instead of being derived from
    ``token_ids != pad_value``. That comparison wrongly masks any in-vocabulary
    word whose id equals ``pad_value``; with a frequency-ranked vocabulary and
    the default ``pad_value=0`` that is the most frequent word.

    Args:
        text_batch: Iterable of strings to tokenize.
        token_to_id: Mapping from word to integer id.
        max_seq_len: Length of every output row.
        pad_value: Id used for padding and for out-of-vocabulary words.

    Returns:
        ``(token_ids, attention_mask)``, two ``np.int32`` arrays with one row
        per input string. ``attention_mask`` is 1 where the position holds a
        word found in ``token_to_id`` and 0 for padding and unknown words.
    """
    print('Batch to tokenize:', text_batch)
    batch_token_ids = []
    batch_mask = []
    for text in text_batch:
        words = text.split()
        print(f'Text: {text} -> Words: {words}')
        ids, mask = [], []
        for w in words:
            if w in token_to_id:
                ids.append(token_to_id[w])
                mask.append(1)
            else:
                ids.append(pad_value)
                mask.append(0)
        print(f'Word IDs: {ids}')
        if len(ids) > max_seq_len:
            ids = ids[:max_seq_len]
            mask = mask[:max_seq_len]
            print(f'Truncated IDs: {ids}')
        else:
            n_pad = max_seq_len - len(ids)
            ids += [pad_value] * n_pad
            mask += [0] * n_pad
            print(f'Padded IDs: {ids}')
        batch_token_ids.append(ids)
        batch_mask.append(mask)
    token_ids = np.array(batch_token_ids, dtype=np.int32)
    attention_mask = np.array(batch_mask, dtype=np.int32)
    print('Token IDs array:', token_ids)
    print('Attention mask array:', attention_mask)
    return token_ids, attention_mask
# Demo: run the verbose word-level tokenizer on three short phrases padded to
# length 6; all output comes from the prints inside the function.
batch_text_words = ['Hello world', 'GPT is great', 'Word level']
max_seq_len_words = 6
word_token_ids, word_attention_mask = tokenize_and_pad_words_debug(batch_text_words, word_token_to_id, max_seq_len_words)

Reading file: /home/akshat/GPT_from_scratch/text_data/jane_austen_clean.txt
Raw text sample:  sir walter elliot, of kellynch hall, in somersetshire, was a man who,
for his own amusement, never 
Total words: 779561
Most common words: ['the', 'to', 'and', 'of', 'a', 'i', 'her', 'in', 'was', 'she', 'not', 'that', 'it', 'be', 'as', 'had', 'he', 'you', 'for', 'his']
Vocabulary (limited to 10000): ['the', 'to', 'and', 'of', 'a', 'i', 'her', 'in', 'was', 'she', 'not', 'that', 'it', 'be', 'as', 'had', 'he', 'you', 'for', 'his'] ...
Token to ID mapping (sample): {'the': 0, 'to': 1, 'and': 2, 'of': 3, 'a': 4, 'i': 5, 'her': 6, 'in': 7, 'was': 8, 'she': 9, 'not': 10, 'that': 11, 'it': 12, 'be': 13, 'as': 14, 'had': 15, 'he': 16, 'you': 17, 'for': 18, 'his': 19}
ID to Token mapping (sample): {0: 'the', 1: 'to', 2: 'and', 3: 'of', 4: 'a', 5: 'i', 6: 'her', 7: 'in', 8: 'was', 9: 'she', 10: 'not', 11: 'that', 12: 'it', 13: 'be', 14: 'as', 15: 'had', 16: 'he', 17: 'you', 18: 'for', 19: 'his'}
Vocabula