# Setup

In [None]:
# Colab/TF setup
import os, sys, zipfile, tarfile, random, math, io
import numpy as np
import tensorflow as tf

# Seed Python's `random`, NumPy and TensorFlow in one call so that dataset
# shuffling, weight initialisation and dropout start from the same state on
# every Restart & Run All (GPU kernels may still introduce small run-to-run
# differences).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Report the runtime so logged results can be tied to a TF build and device list.
print("TF version:", tf.__version__)
device_names = tf.config.list_physical_devices()
print("Devices:", device_names)

TF version: 2.19.0
Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Get IIIT-5K and read annotations

In [None]:
# Download IIIT-5K (official mirror)
# NOTE: the archive (~101 MB according to the wget log) is fetched again every
# time this cell is run; `-O` always writes to the same local file name.
!wget -O IIIT5K.tar.gz https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz
# Unpack into IIIT5K/. `--strip-components=1` drops the archive's top-level
# folder so its contents land directly under IIIT5K/.
!mkdir -p IIIT5K && tar -xzf IIIT5K.tar.gz -C IIIT5K --strip-components=1

import scipy.io as sio
import os
import numpy as np

# After extraction the annotation .mat files sit directly under IIIT5K/.
# Load the train file first, then the test file.
train_mat, test_mat = (
    sio.loadmat(os.path.join("IIIT5K", mat_name))
    for mat_name in ("trainCharBound.mat", "testCharBound.mat")
)

def load_split(mat_key, base_dir="IIIT5K"):
    """Extract image paths and ground-truth words from a loaded .mat dictionary.

    Despite its name, `mat_key` is the whole mapping returned by
    `scipy.io.loadmat`. The first value that is a NumPy structured array
    (an ndarray whose dtype has field names) is taken as the annotation table.

    Args:
        mat_key: Mapping of variable name -> value, as produced by `loadmat`.
        base_dir: Directory that each record's `ImgName` is joined onto.

    Returns:
        `(images, labels)`: two parallel lists, one entry per record in row 0
        of the structured array. `images` holds joined file paths, `labels`
        holds the `chars` field converted with `str()`.

    Raises:
        ValueError: If no structured array is present in `mat_key`.
    """
    # Pick the first structured ndarray, in the mapping's iteration order.
    records = next(
        (
            mat_key[name]
            for name in mat_key
            if isinstance(mat_key[name], np.ndarray) and mat_key[name].dtype.names
        ),
        None,
    )
    if records is None:
        raise ValueError("Structured array not found in .mat file")

    # Only row 0 is read; each field is a length-1 container, hence the [0].
    first_row = records[0]
    images = [os.path.join(base_dir, entry['ImgName'][0]) for entry in first_row]  # e.g. 'train/123_1.png'
    labels = [str(entry['chars'][0]) for entry in first_row]                      # ground truth word
    return images, labels

# Parse both annotation files into parallel (path, word) lists.
train_images, train_labels = load_split(train_mat, base_dir="IIIT5K")
test_images, test_labels = load_split(test_mat, base_dir="IIIT5K")

# Quick sanity check: split sizes and the first training sample.
print(len(train_images), "train |", len(test_images), "test")
print(train_images[0], "->", train_labels[0])

--2025-08-29 10:28:43--  https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz
Resolving cvit.iiit.ac.in (cvit.iiit.ac.in)... 14.139.82.25
Connecting to cvit.iiit.ac.in (cvit.iiit.ac.in)|14.139.82.25|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://cdn.iiit.ac.in/cdn/cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz [following]
--2025-08-29 10:28:45--  https://cdn.iiit.ac.in/cdn/cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz
Resolving cdn.iiit.ac.in (cdn.iiit.ac.in)... 14.139.82.19
Connecting to cdn.iiit.ac.in (cdn.iiit.ac.in)|14.139.82.19|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 105861943 (101M) [application/octet-stream]
Saving to: ‘IIIT5K.tar.gz’


2025-08-29 10:28:56 (11.3 MB/s) - ‘IIIT5K.tar.gz’ saved [105861943/105861943]

2000 train | 3000 test
IIIT5K/train/1009_2.png -> You


# Vocabulary & preprocessing pipeline

In [None]:
# Character vocabulary: every distinct character seen in either split, sorted.
all_texts = train_labels + test_labels
vocab = sorted({ch for text in all_texts for ch in text})
print("Vocab size:", len(vocab))
print("Sample vocab:", vocab[:60])

# Char<->index lookups. With mask_token=None the characters occupy ids 1..N,
# and the inverse layer is built from the forward layer's own vocabulary so
# both directions agree on the numbering.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True,
)

# Image size & label padding
IMG_H = 32
IMG_W = 128
MAX_LABEL_LEN = max(map(len, all_texts))  # longest word across both splits
PAD_TOKEN = 0  # id 0 (the lookup's UNK slot) doubles as the padding id

def encode_sample(img_path, label):
    """Turn one (image path, word) pair into model-ready tensors.

    Args:
        img_path: Scalar string tensor with the path of a PNG file.
        label: Scalar string tensor with the ground-truth word.

    Returns:
        `(img, ids)` where `img` is a float32 tensor of shape
        `(IMG_W, IMG_H, 1)` scaled by 1/255, and `ids` is the word's character
        ids right-padded with `PAD_TOKEN` up to `MAX_LABEL_LEN`.
    """
    # Image branch: read -> grayscale decode -> fixed size -> scale.
    raw_bytes = tf.io.read_file(img_path)
    gray = tf.io.decode_png(raw_bytes, channels=1)
    resized = tf.image.resize(gray, [IMG_H, IMG_W])
    scaled = tf.cast(resized, tf.float32) / 255.0
    # Swap height and width so the leading (time) axis runs along image width.
    width_major = tf.transpose(scaled, [1, 0, 2])  # (W, H, 1)

    # Label branch: split into characters and map them to ids 1..N.
    token_ids = char_to_num(tf.strings.unicode_split(label, "UTF-8"))
    # Right-pad with PAD_TOKEN so every label has length MAX_LABEL_LEN.
    n_missing = MAX_LABEL_LEN - tf.shape(token_ids)[0]
    padded_ids = tf.pad(token_ids, [[0, n_missing]], constant_values=PAD_TOKEN)
    return (width_major, padded_ids)

BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: shuffle the (path, word) pairs over the whole split each
# epoch, then decode/encode in parallel, batch and prefetch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(buffer_size=len(train_images), reshuffle_each_iteration=True)
    .map(encode_sample, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Test pipeline: same encoding, original order (no shuffle).
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .map(encode_sample, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)


Vocab size: 62
Sample vocab: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x']


# CRNN model (CNN + BiLSTM) that outputs logits

In [None]:
def build_crnn(num_classes):
    """Build the CNN + BiLSTM recogniser that emits per-time-step logits.

    Args:
        num_classes: Size of the output layer (width of the last axis).

    Returns:
        A `tf.keras.Model` named "crnn_logits" taking images of shape
        `(IMG_W, IMG_H, 1)` and returning unnormalised scores of shape
        `(batch, IMG_W // 4, num_classes)`.
    """
    layers = tf.keras.layers
    image_in = tf.keras.Input(shape=(IMG_W, IMG_H, 1), name="image")

    # Two conv + 2x2 max-pool stages; each stage halves both spatial axes.
    features = image_in
    for n_filters in (32, 64):
        features = layers.Conv2D(n_filters, 3, padding="same", activation="relu")(features)
        features = layers.MaxPooling2D(2)(features)

    # Feature map is now (W/4, H/4, 64). Keep W/4 as the time axis and fold
    # the height and channel axes into one feature vector per time step.
    time_steps = IMG_W // 4
    feat_per_step = (IMG_H // 4) * 64
    seq = layers.Reshape((time_steps, feat_per_step))(features)
    seq = layers.Dense(128, activation="relu")(seq)
    seq = layers.Dropout(0.2)(seq)

    # Two stacked bidirectional LSTMs, both returning the full sequence.
    for n_units in (128, 64):
        seq = layers.Bidirectional(
            layers.LSTM(n_units, return_sequences=True, dropout=0.25)
        )(seq)

    # logits (no softmax here; CTC expects logits)
    logits = layers.Dense(num_classes, name="logits")(seq)
    return tf.keras.Model(image_in, logits, name="crnn_logits")

# Output width = every id the lookup can produce (its vocabulary, including
# the id-0 slot) plus one extra class for the CTC blank, which takes the last index.
NUM_CLASSES = 1 + len(char_to_num.get_vocabulary())
base_model = build_crnn(num_classes=NUM_CLASSES)
base_model.summary()

# Custom Model with CTC train_step

In [None]:
class CTCModel(tf.keras.Model):
    """Wrap a logits-producing network so `fit`/`evaluate` optimise CTC loss.

    Each batch is an `(images, labels_padded)` pair whose labels are
    right-padded with `PAD_TOKEN`. Only the wrapped `logits_model`'s trainable
    variables are updated.

    Args:
        logits_model: Keras model mapping images to unnormalised scores of
            shape (batch, time, classes).
        blank_index: Class id reserved for the CTC blank. Defaults to
            `NUM_CLASSES - 1` (the last class).
        **kwargs: Forwarded to `tf.keras.Model`.
    """

    def __init__(self, logits_model, blank_index=None, **kwargs):
        super().__init__(**kwargs)
        self.logits_model = logits_model
        self.blank_index = blank_index if blank_index is not None else (NUM_CLASSES - 1)
        # Running mean of the batch losses; this is the value reported as "loss".
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Metrics tracked by this model (only the running loss)."""
        return [self.loss_tracker]

    def call(self, inputs, training=False):
        """Forward pass: delegate to the wrapped network and return its logits.

        Lets the wrapper itself be called (`ctc_model(images)`) instead of
        only supporting `fit`/`evaluate`.
        """
        return self.logits_model(inputs, training=training)

    def _ctc_loss(self, images, labels_padded, training):
        """Mean CTC loss of one batch; shared by `train_step` and `test_step`."""
        # Label length = number of non-padding tokens in each row.
        label_lens = tf.reduce_sum(
            tf.cast(tf.not_equal(labels_padded, PAD_TOKEN), tf.int32), axis=1
        )
        logits = self.logits_model(images, training=training)  # (B, T, C)
        # Every sample uses the full T time steps.
        logit_lens = tf.fill([tf.shape(logits)[0]], tf.shape(logits)[1])
        # CTC loss (uses logits directly)
        per_sample = tf.nn.ctc_loss(
            labels=self._dense_to_sparse(labels_padded, label_lens),
            logits=logits,
            label_length=label_lens,
            logit_length=logit_lens,
            logits_time_major=False,
            blank_index=self.blank_index
        )
        return tf.reduce_mean(per_sample)

    def train_step(self, data):
        """Run one optimisation step on `data = (images, labels_padded)`.

        Returns:
            `{"loss": <running mean loss>}`.
        """
        images, labels_padded = data

        with tf.GradientTape() as tape:
            loss = self._ctc_loss(images, labels_padded, training=True)

        variables = self.logits_model.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate one batch `data = (images, labels_padded)` without updating weights.

        Returns:
            `{"loss": <running mean loss>}`.
        """
        images, labels_padded = data
        loss = self._ctc_loss(images, labels_padded, training=False)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    @staticmethod
    def _dense_to_sparse(dense_labels, label_lens):
        """
        Convert padded dense labels (B, L) to SparseTensor using provided lengths.
        Padding value is 0 and must be excluded.

        The first `label_lens[i]` entries of row `i` are kept, so padding is
        expected to sit at the end of each row.
        """
        batch_size = tf.shape(dense_labels)[0]
        max_len = tf.shape(dense_labels)[1]
        # Build indices for non-pad positions
        mask = tf.sequence_mask(label_lens, max_len)  # (B, L) True for real tokens
        indices = tf.where(mask)
        values = tf.gather_nd(dense_labels, indices)
        sparse = tf.SparseTensor(
            indices=tf.cast(indices, tf.int64),
            values=tf.cast(values, tf.int32),
            dense_shape=tf.cast([batch_size, max_len], tf.int64)
        )
        return sparse

# Wrap the logits network in the CTC trainer and attach the optimiser.
# No `loss=` is passed: the loss is computed inside the model's own steps.
ctc_model = CTCModel(logits_model=base_model)
ctc_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))


# Train

In [None]:
EPOCHS = 50

# `validation_data` is only the first test batch (quick sanity), so the
# reported val_loss is a smoke signal, not a full evaluation.
history = ctc_model.fit(
    x=train_ds,
    epochs=EPOCHS,
    validation_data=test_ds.take(1),
    verbose=1,
)


Epoch 1/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - loss: 41.9588 - val_loss: 21.8490
Epoch 2/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 20.2721 - val_loss: 21.7336
Epoch 3/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 20.5187 - val_loss: 21.4347
Epoch 4/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 20.4471 - val_loss: 21.3372
Epoch 5/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 20.0922 - val_loss: 21.2694
Epoch 6/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 19.8493 - val_loss: 21.0228
Epoch 7/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 19.3314 - val_loss: 20.8666
Epoch 8/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 19.4391 - val_loss: 21.2424
Epoch 9/50
[1m63/63[0m [32m━

# Inference helper (greedy CTC decode) + metrics (Accuracy & CER)

In [None]:
# A separate inference model: image -> logits.
# It reuses base_model's own input and output tensors, so it shares the trained
# weights and returns raw logits; no softmax layer is added here.
image_infer = base_model.input
logits_out = base_model.output
infer_model = tf.keras.Model(inputs=image_infer, outputs=logits_out)

def greedy_decode(logits, blank_index=NUM_CLASSES-1):
    """
    Greedy CTC decode: argmax per time step, then collapse repeats and drop blanks.

    Args:
        logits: Array-like or eager tensor of shape (B, T, C) with per-class scores.
        blank_index: Class id of the CTC blank.

    Returns:
        list[str] of decoded words, one per batch row. Ids are mapped with
        `num_to_char` as-is, so a predicted id 0 yields that lookup's
        out-of-vocabulary token.
    """
    # argmax over raw scores equals argmax over their softmax, so no softmax is needed.
    best_ids = np.argmax(np.asarray(logits), axis=-1)  # (B, T)

    # A step is emitted when it is not blank and differs from the previous step.
    # This matches "collapse repeats, then drop blanks": a blank between two
    # equal ids breaks the run, so the second id is emitted again.
    starts_run = np.ones(best_ids.shape, dtype=bool)
    starts_run[:, 1:] = best_ids[:, 1:] != best_ids[:, :-1]
    keep = starts_run & (best_ids != blank_index)

    decoded = []
    for seq, seq_keep in zip(best_ids, keep):
        kept_ids = seq[seq_keep]
        if kept_ids.size == 0:
            decoded.append("")
            continue
        # One lookup call per sequence instead of one per character.
        chars = num_to_char(tf.constant(kept_ids, dtype=tf.int64))
        decoded.append(tf.strings.reduce_join(chars).numpy().decode("utf-8"))
    return decoded

def ids_to_text_batch(padded_ids):
    """Convert a batch of padded label ids back into Python strings.

    Args:
        padded_ids: Iterable of 1-D id rows that use `PAD_TOKEN` as filler.

    Returns:
        list[str] with one decoded word per row; padding entries are removed
        before the ids are mapped to characters.
    """
    texts = []
    for row in padded_ids:
        # Positions of the real (non-pad) tokens, as a flat index vector.
        real_positions = tf.where(tf.not_equal(row, PAD_TOKEN))[:, 0]
        real_ids = tf.gather(row, real_positions)
        joined = tf.strings.reduce_join(num_to_char(tf.cast(real_ids, tf.int32)))
        texts.append(joined.numpy().decode("utf-8"))
    return texts

def char_error_rate(pred, truth):
    """Mean character error rate over paired predictions and references.

    For each `(prediction, reference)` pair the Levenshtein edit distance is
    divided by the reference length (or by 1 when the reference is empty);
    the per-pair ratios are then averaged.

    Args:
        pred: Sequence of predicted strings.
        truth: Sequence of reference strings, paired with `pred` by position.

    Returns:
        The averaged ratio as a Python float.
    """
    def lev(a, b):
        # Rolling-row edit distance: `above` holds the distances for the
        # previous prefix of `a`, `row` is filled in for the current one.
        above = list(range(len(b) + 1))
        for i, ch_a in enumerate(a, start=1):
            row = [i]
            for j, ch_b in enumerate(b, start=1):
                substitution = above[j - 1] + (0 if ch_a == ch_b else 1)
                deletion = above[j] + 1
                insertion = row[j - 1] + 1
                row.append(min(deletion, insertion, substitution))
            above = row
        return above[-1]

    ratios = [lev(p, t) / max(1, len(t)) for p, t in zip(pred, truth)]
    return float(np.mean(ratios))

# Evaluate on the full test set: decode each batch greedily, compare with the
# ground-truth strings, and accumulate exact-match counts plus all text pairs.
total, correct = 0, 0
all_pred, all_true = [], []

for imgs, lbls in test_ds:
    logits = infer_model(imgs, training=False)
    pred_texts = greedy_decode(logits)
    true_texts = ids_to_text_batch(lbls)

    all_pred += pred_texts
    all_true += true_texts

    # A word counts as correct only on an exact string match.
    hits = [p == t for p, t in zip(pred_texts, true_texts)]
    correct += sum(hits)
    total += len(hits)

word_acc = 100.0 * correct / total
cer = char_error_rate(all_pred, all_true)
print(f"Test Word Accuracy: {word_acc:.2f}%  |  CER: {cer:.4f}")


Test Word Accuracy: 10.47%  |  CER: 0.5952
