## Step 0: Import Required Libraries

In [1]:
import os
import sys
import numpy as np
import cv2
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import torch
import torch.nn.functional as F
from pathlib import Path

# Add parent directory to path
parent_dir = Path(os.getcwd()).parent
sys.path.insert(0, str(parent_dir))
sys.path.insert(0, str(parent_dir / 'word_segmentation'))
sys.path.insert(0, str(parent_dir / 'gpt-2-train'))

print("✓ Libraries imported successfully")
print(f"TensorFlow version: {tf.__version__}")
print(f"PyTorch version: {torch.__version__}")

✓ Libraries imported successfully
TensorFlow version: 2.20.0
PyTorch version: 2.9.1


## Step 1: Load Word Segmentation Module

We'll use the `WordSegmenter` class to detect and extract bounding boxes around individual words.

In [2]:
# Import word segmentation module
from segmenter import WordSegmenter

# Initialize the word segmenter with default parameters
word_segmenter = WordSegmenter(
    blur_kernel=(3, 3),
    blur_sigma=1,
    morph_kernel=(3, 3),
    dilation_kernel=(1, 3),
    min_width=15,
    min_height=10,
    max_width_ratio=0.9,
    max_height_ratio=0.5,
    min_fill_ratio=0.1
)

print("✓ Word Segmenter initialized successfully")

✓ Word Segmenter initialized successfully


## Step 2: Load OCR Model and Encoder

We'll load the trained HTR model from the `ocr_weights` directory.

In [3]:
# Define Character Encoder class
class CharacterEncoder:
    """Encode and decode characters for model training/inference"""
    
    def __init__(self, characters=None):
        if characters is None:
            # Default character set
            self.characters = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,!?'-"
        else:
            self.characters = characters
        
        # Create character to index mapping
        self.char_to_num = {char: idx for idx, char in enumerate(self.characters)}
        
        # Create index to character mapping
        self.num_to_char = {idx: char for char, idx in self.char_to_num.items()}
        
        # Vocab size includes all characters + blank token
        self.vocab_size = len(self.characters) + 1
        self.blank_token_idx = len(self.characters)
    
    def encode(self, text):
        """Encode text to numerical indices"""
        encoded = []
        for char in text:
            if char in self.char_to_num:
                encoded.append(self.char_to_num[char])
        return encoded
    
    def decode(self, indices):
        """Decode numerical indices to text"""
        decoded = []
        for idx in indices:
            if idx < len(self.characters) and idx in self.num_to_char:
                decoded.append(self.num_to_char[idx])
        return ''.join(decoded)

# Initialize encoder
encoder = CharacterEncoder()
print(f"✓ Character Encoder initialized (vocab size: {encoder.vocab_size})")

✓ Character Encoder initialized (vocab size: 70)


In [20]:
# Build the CRNN model architecture
def build_crnn_model(input_shape=(32, 128, 1), num_classes=79):
    """Build CRNN model architecture for HTR"""
    from tensorflow.keras import layers, Model
    
    # Input layer
    input_layer = layers.Input(shape=input_shape, name='input_1')
    
    # Convolutional Block 1
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2d')(input_layer)
    x = layers.MaxPooling2D((2, 2), name='max_pooling2d')(x)
    
    # Convolutional Block 2
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2d_1')(x)
    x = layers.MaxPooling2D((2, 2), name='max_pooling2d_1')(x)
    
    # Convolutional Block 3
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same', name='conv2d_2')(x)
    
    # Convolutional Block 4
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same', name='conv2d_3')(x)
    x = layers.MaxPooling2D((2, 1), name='max_pooling2d_2')(x)
    
    # Convolutional Block 5
    x = layers.Conv2D(512, (3, 3), activation='relu', padding='same', name='conv2d_4')(x)
    x = layers.BatchNormalization(name='batch_normalization')(x)
    
    # Convolutional Block 6
    x = layers.Conv2D(512, (3, 3), activation='relu', padding='same', name='conv2d_5')(x)
    x = layers.BatchNormalization(name='batch_normalization_1')(x)
    x = layers.MaxPooling2D((2, 1), name='max_pooling2d_3')(x)
    
    # Convolutional Block 7
    x = layers.Conv2D(512, (2, 2), activation='relu', name='conv2d_6')(x)
    
    # Reshape for LSTM - use Reshape instead of Lambda to avoid serialization issues
    x = layers.Reshape(target_shape=(x.shape[2], x.shape[3]), name='reshape')(x)
    
    # Bidirectional LSTM layers
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True, dropout=0.2), 
                            name='bidirectional')(x)
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True, dropout=0.2), 
                            name='bidirectional_1')(x)
    
    # Dense output layer
    output = layers.Dense(num_classes, activation='softmax', name='dense')(x)
    
    # Create model
    model = Model(inputs=input_layer, outputs=output, name='CRNN_HTR')
    
    return model

print("✓ Model architecture function defined")

# Configure TensorFlow memory settings
import tensorflow as tf
tf.config.set_soft_device_placement(True)

# Limit GPU memory growth if GPU is available
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✓ GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(f"GPU configuration note: {e}")

# Build the model first (simpler approach)
print("\nBuilding OCR model architecture...")
ocr_model = build_crnn_model(input_shape=(32, 128, 1), num_classes=encoder.vocab_size)
print("✓ Model architecture built")

# Now try to load weights
ocr_model_path = parent_dir / 'ocr_weights' / 'htr_model_20251020_084444_base.h5'

print(f"\nLoading weights from: {ocr_model_path.name}")

try:
    # Simple approach: just load weights with skip_mismatch
    ocr_model.load_weights(str(ocr_model_path), skip_mismatch=True, by_name=False)
    print("✓ Weights loaded successfully")
except Exception as e:
    print(f"⚠ Weight loading issue: {str(e)[:150]}")
    print("\nNote: Model will use random initialization.")
    print("For best results, ensure the weight file matches the model architecture.")

# Display model info
print(f"\n{'='*60}")
print("OCR Model Information:")
print(f"{'='*60}")
print(f"  Input shape:  {ocr_model.input_shape}")
print(f"  Output shape: {ocr_model.output_shape}")
print(f"  Parameters:   {ocr_model.count_params():,}")
print(f"{'='*60}")
print("✓ OCR model ready for inference")

✓ Model architecture function defined

Building OCR model architecture...
✓ Model architecture built

Loading weights from: htr_model_20251020_084444_base.h5
✓ Weights loaded successfully

OCR Model Information:
  Input shape:  (None, 32, 128, 1)
  Output shape: (None, 31, 70)
  Parameters:   8,738,630
✓ OCR model ready for inference


  _set_weights(
  _set_weights(


## Step 3: Load GPT-2 Model for Next Word Prediction

We'll load the fine-tuned GPT-2 model from the checkpoint.

In [7]:
# Import GPT-2 model and tokenizer
from model import GPT, GPTConfig
import tiktoken

# Set device
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
print(f"Using device: {device}")

Using device: mps


In [8]:
# Load GPT-2 from pretrained weights (using TensorFlow checkpoint)
import json

# Read hparams
hparams_path = parent_dir / 'gpt-2-124M_checkpoints' / 'hparams.json'
with open(hparams_path, 'r') as f:
    hparams = json.load(f)

print("GPT-2 Hyperparameters:")
for key, value in hparams.items():
    print(f"  {key}: {value}")

# Create GPT config
gpt_config = GPTConfig(
    context_length=hparams['n_ctx'],
    vocab_size=hparams['n_vocab'],
    num_layers=hparams['n_layer'],
    embd_size=hparams['n_embd'],
    num_heads=hparams['n_head']
)

print("\n✓ GPT-2 config created")

GPT-2 Hyperparameters:
  n_vocab: 50257
  n_ctx: 1024
  n_embd: 768
  n_head: 12
  n_layer: 12

✓ GPT-2 config created


In [11]:
# Load GPT-2 model from pretrained weights
print("Loading GPT-2 model...")
gpt_model = GPT.from_pretrained('gpt2')
gpt_model = gpt_model.to(device)
gpt_model.eval()

# Load tokenizer
tokenizer = tiktoken.get_encoding('gpt2')

print("✓ GPT-2 model loaded successfully")
print(f"  Config: {gpt_config.num_layers} layers, {gpt_config.embd_size} embedding size")
print(f"  Vocab size: {gpt_config.vocab_size}")

Loading GPT-2 model...


  from .autonotebook import tqdm as notebook_tqdm


loading weights from pretrained gpt: gpt2
✓ GPT-2 model loaded successfully
  Config: 12 layers, 768 embedding size
  Vocab size: 50257
✓ GPT-2 model loaded successfully
  Config: 12 layers, 768 embedding size
  Vocab size: 50257


## Step 4: Define Helper Functions

These functions will handle image preprocessing and text generation.

In [12]:
# OCR Preprocessing Functions
def preprocess_word_for_ocr(word_image, target_height=32, target_width=128):
    """Preprocess a word image for OCR recognition"""
    # Convert to grayscale if needed
    if len(word_image.shape) == 3:
        gray = cv2.cvtColor(word_image, cv2.COLOR_BGR2GRAY)
    else:
        gray = word_image
    
    # Apply Otsu thresholding
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Resize with padding
    h, w = binary.shape[:2]
    scale = min(target_height / h, target_width / w)
    new_h, new_w = int(h * scale), int(w * scale)
    resized = cv2.resize(binary, (new_w, new_h), interpolation=cv2.INTER_AREA)
    
    # Create padded image
    padded = np.ones((target_height, target_width), dtype=np.uint8) * 255
    y_offset = (target_height - new_h) // 2
    x_offset = (target_width - new_w) // 2
    padded[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized
    
    # Normalize
    normalized = padded.astype(np.float32) / 255.0
    
    # Add channel dimension
    preprocessed = np.expand_dims(normalized, axis=-1)
    
    return preprocessed

def decode_ocr_predictions(predictions, encoder):
    """Decode CTC predictions to text"""
    batch_size = predictions.shape[0]
    time_steps = predictions.shape[1]
    
    # Create input_length for all samples
    input_lengths = np.full((batch_size,), time_steps, dtype=np.int32)
    
    # Decode using CTC
    decoded, _ = tf.keras.backend.ctc_decode(
        predictions,
        input_length=input_lengths,
        greedy=True
    )
    
    # Convert to text
    decoded_texts = []
    decoded = decoded[0].numpy()
    for i in range(batch_size):
        seq = decoded[i]
        text = encoder.decode(seq)
        decoded_texts.append(text)
    
    return decoded_texts

print("✓ OCR helper functions defined")

✓ OCR helper functions defined


In [13]:
# GPT-2 Generation Function
def generate_next_words(prompt, num_predictions=3, max_new_tokens=5, temperature=1.0, top_k=50):
    """Generate next word predictions using GPT-2"""
    gpt_model.eval()
    
    # Encode prompt
    tokens = tokenizer.encode(prompt)
    idx = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    
    # Clamp to context length
    context_len = gpt_model.config.context_length
    if idx.shape[1] > context_len:
        idx = idx[:, -context_len:]
    
    # Generate
    with torch.no_grad():
        logits, _ = gpt_model(idx)
        logits = logits[:, -1, :]  # Get last token logits
        
        if temperature != 1.0:
            logits = logits / max(temperature, 1e-5)
        
        # Get top-k predictions
        topk_vals, topk_idx = torch.topk(logits, k=min(top_k, logits.shape[-1]), dim=-1)
        probs = F.softmax(topk_vals, dim=-1)
        
        # Get top N predictions
        top_n_probs, top_n_indices = torch.topk(probs, k=min(num_predictions, probs.shape[-1]), dim=-1)
        top_n_tokens = torch.gather(topk_idx, -1, top_n_indices)
        
        # Decode tokens
        predictions = []
        for i in range(top_n_tokens.shape[1]):
            token_id = top_n_tokens[0, i].item()
            token_text = tokenizer.decode([token_id])
            prob = top_n_probs[0, i].item()
            predictions.append((token_text, prob))
    
    return predictions

print("✓ GPT-2 generation function defined")

✓ GPT-2 generation function defined
