In [1]:
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

# Seed TensorFlow's global random generator so repeated runs start from the same state.
# NOTE(review): NumPy / Python `random` are not seeded in this cell — confirm whether they need to be.
tf.random.set_seed(42)
# Show which GPU devices TensorFlow can see.
print("GPUs:", tf.config.list_physical_devices('GPU'))
CONTEXT_LEN = 384 # Sequence length in tokens; 3x the context length of the previous configuration.
VOCAB_SIZE = 20000 # Requested tokenizer vocabulary size (original note read "rx previous"; exact factor unclear — TODO confirm).
# NOTE(review): this import sits mid-cell; it would conventionally live with the imports at the top.
from keras import mixed_precision
# Make 'mixed_float16' the global Keras dtype policy (mixed-precision training).
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

2025-08-24 09:44:11.442859: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-24 09:44:11.500812: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-24 09:44:12.703834: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import os
from typing import List, Dict, Tuple
import numpy as np
import tensorflow as tf
import sentencepiece as spm


# NOTE(review): registering a plain helper function as Keras-serializable looks
# unnecessary — kept as-is so module-level behaviour does not change; confirm.
@keras.saving.register_keras_serializable()
def train_sentencepiece_tokenizer(file_path_list: List[str], 
                                vocab_size: int = 2000,
                                model_prefix: str = 'spm_gpt') -> spm.SentencePieceProcessor:
    """
    Train a SentencePiece BPE tokenizer on text files and load the result.

    The trainer is configured with fixed special-token IDs: pad=0, unk=1,
    bos=2, eos=3. After training, ``<model_prefix>.model`` is loaded into a
    fresh processor, which is returned.

    Args:
        file_path_list: Paths of the corpus files. A single path may be passed
            on its own instead of a list. Each path may be ``str``, ``bytes``
            or ``os.PathLike``; all are converted to ``str`` before use.
        vocab_size: Size of the subword vocabulary (default: 2000).
        model_prefix: Prefix for output model files.

    Returns:
        sp: Trained SentencePieceProcessor object.

    Raises:
        ValueError: If no file path is supplied.
        IsADirectoryError: If a path points to a directory.
        FileNotFoundError: If a path does not point to an existing file.
    """
    # A lone path (in any supported form) is treated as a one-item list.
    if isinstance(file_path_list, (str, bytes, os.PathLike)):
        file_path_list = [file_path_list]

    # Materialise once and normalise to ``str``: the paths are iterated for
    # validation and then joined, so a one-shot iterable would otherwise be
    # exhausted, and ``','.join`` rejects ``bytes`` / ``PathLike`` entries.
    corpus_paths = [os.fsdecode(path) for path in file_path_list]
    if not corpus_paths:
        raise ValueError("file_path_list must contain at least one file path")

    # Fail early with a precise error instead of a trainer-side failure.
    for file_name in corpus_paths:
        if os.path.isdir(file_name):
            raise IsADirectoryError(f"Expected file path, got directory: {file_name}")
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"File not found: {file_name}")

    # The trainer receives all corpus files as one comma-separated string.
    input_files = ','.join(corpus_paths)

    # Train SentencePiece model
    spm.SentencePieceTrainer.train(
        input=input_files,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=0.9995,
        model_type='bpe',
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
        num_threads = 8
    )

    # Load and return processor
    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model')
    return sp

@keras.saving.register_keras_serializable()
def tokenize_and_build_token_id_sp(sp: spm.SentencePieceProcessor, 
                                 text_batch: List[str], 
                                 max_seq_len: int, 
                                 pad_value: int = 0) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Encode a batch of strings into fixed-length ID and attention-mask tensors.

    Each text is encoded with ``sp.encode_as_ids``. Sequences longer than
    ``max_seq_len`` keep only their last ``max_seq_len`` IDs; shorter ones are
    right-padded with ``pad_value``.

    Args:
        sp: Trained SentencePieceProcessor object.
        text_batch: List of text strings to tokenize. A single string is
            treated as a batch of one rather than being split into characters.
        max_seq_len: Sequence length after padding/truncation (must be >= 0).
        pad_value: Integer ID used for padding tokens (should match sp.pad_id()).

    Returns:
        token_ids: tf.Tensor of shape (batch_size, max_seq_len), dtype tf.int32.
        attention_mask: tf.Tensor of shape (batch_size, max_seq_len), dtype
            tf.int32; 1 marks a real token and 0 marks padding.

    Raises:
        ValueError: If ``max_seq_len`` is negative.
    """
    if max_seq_len < 0:
        raise ValueError(f"max_seq_len must be non-negative, got {max_seq_len}")

    # Iterating a bare string would tokenize it one character per row.
    if isinstance(text_batch, str):
        text_batch = [text_batch]
    texts = list(text_batch)

    # Pre-allocating fixes the output shape at (batch, max_seq_len) even when
    # the batch is empty, and starts every row fully padded.
    token_ids = np.full((len(texts), max_seq_len), pad_value, dtype=np.int32)
    attention_mask = np.zeros((len(texts), max_seq_len), dtype=np.int32)

    for row, text in enumerate(texts):
        # Encode text to subword IDs
        ids = sp.encode_as_ids(text)

        # Keep the end (recent context). The zero case is explicit because
        # ``ids[-0:]`` is the whole list, not an empty one.
        kept = ids[-max_seq_len:] if max_seq_len > 0 else []

        if kept:
            token_ids[row, :len(kept)] = kept
            # The mask comes from the real length, so a genuine token whose ID
            # happens to equal pad_value is still marked as attended.
            attention_mask[row, :len(kept)] = 1

    return tf.constant(token_ids), tf.constant(attention_mask) # type: ignore

In [None]:
# New SentencePiece approach: train a BPE tokenizer on the corpus, then
# smoke-test the batch encoder on a few short strings.
# NOTE(review): this looks like a leftover "did the cell start?" debug print — confirm before removing.
print('hello')
# NOTE(review): absolute, machine-specific corpus path; the notebook will not run elsewhere without editing it.
sp = train_sentencepiece_tokenizer([r'/home/akshat/GPT_from_scratch/text_data/wikitext_full.txt'], vocab_size=VOCAB_SIZE)
# Overwrite the requested size with the piece count reported by the trained model.
VOCAB_SIZE = sp.get_piece_size()  
batch_text = ['yo','Akshat Khatri', 'Hello World','Me']
# pad_value is left at its default of 0, matching the pad_id=0 used during training.
token_ids, attention_mask = tokenize_and_build_token_id_sp(sp, batch_text, CONTEXT_LEN)
# Bare last expression so the notebook displays the padded ID tensor.
token_ids