## Step 1 : Data Generator for Model Training

In [3]:
import numpy as np
import cv2

# Every character the recogniser can emit; the position in this string fixes
# the character's class index.
alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,;:!?\"'()[]{}-"

# Characters map to 1..len(alphabet); index 0 is left free because the data
# generator pads label arrays with zeros.
char_to_index = dict(zip(alphabet, range(1, len(alphabet) + 1)))

def encode_text(text):
    """Convert a string into a list of integer class indices.

    Characters that are not keys of ``char_to_index`` are dropped silently,
    so the result may be shorter than ``text``.
    """
    indices = []
    for symbol in text:
        code = char_to_index.get(symbol)
        if code is not None:
            indices.append(code)
    return indices

def decode_text(encoded_text):
    """Convert a sequence of class indices back into a string.

    Indices without an entry in ``char_to_index`` are skipped. This covers
    the padding value 0 and also any other out-of-alphabet index (for
    example a CTC blank or a negative pad value from a decoder), mirroring
    how ``encode_text`` silently drops unknown characters instead of raising.

    Args:
        encoded_text: Iterable of integer indices.

    Returns:
        The decoded string.
    """
    index_to_char = {index: char for char, index in char_to_index.items()}
    return ''.join(index_to_char[idx] for idx in encoded_text if idx in index_to_char)

In [None]:
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of images plus CTC training inputs."""

    def __init__(self, dataframe, batch_size, image_size, max_text_length, input_length=None):
        """
        Args:
            dataframe: DataFrame containing 'image_path' and 'label' columns.
            batch_size: Number of samples per batch.
            image_size: Tuple (height, width) for resizing images.
            max_text_length: Maximum length of text labels (padded if shorter,
                truncated if longer).
            input_length: Number of time steps the model emits per image, as
                reported to the CTC loss. Defaults to ``image_size[0] // 8``,
                which is the sequence length produced by ``build_htr_model``
                (three 2x2 poolings, first image axis used as the time axis).
        """
        # Lets the Keras base class initialise itself where it defines an
        # __init__; a no-op otherwise.
        super().__init__()
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.image_size = image_size
        self.max_text_length = max_text_length
        self.input_length = image_size[0] // 8 if input_length is None else input_length
        self.indices = np.arange(len(self.dataframe))

    def __len__(self):
        """Number of batches per epoch; the last batch may be smaller."""
        return int(np.ceil(len(self.dataframe) / self.batch_size))

    def _load_image(self, path):
        """Read one image as grayscale, resize it and scale pixels to [0, 1].

        Raises:
            FileNotFoundError: If OpenCV cannot read ``path``.
        """
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {path}")
        # cv2.resize takes (width, height), the reverse of image_size's order.
        img = cv2.resize(img, (self.image_size[1], self.image_size[0])) / 255.0
        return np.expand_dims(img, axis=-1)

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            A tuple ``(inputs, targets)``. ``inputs`` is a dict with the keys
            "input_image", "input_length", "label" and "label_length";
            ``targets`` is an all-zero array with one entry per sample.
        """
        start = idx * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]
        batch_data = self.dataframe.iloc[batch_indices]

        images = [self._load_image(path) for path in batch_data['image_path']]

        # Truncate over-long labels so they fit the fixed-width padded array
        # instead of crashing the assignment below.
        labels = [
            encode_text(text)[:self.max_text_length]
            for text in batch_data['label']
        ]
        label_lengths = [len(label) for label in labels]
        input_lengths = [self.input_length] * len(images)

        # Zero is the padding value; encode_text never produces index 0.
        labels_padded = np.zeros((len(labels), self.max_text_length), dtype=np.int32)
        for i, label in enumerate(labels):
            labels_padded[i, :len(label)] = label

        return (
            {
                "input_image": np.array(images, dtype=np.float32),
                "input_length": np.array(input_lengths),
                "label": labels_padded,
                "label_length": np.array(label_lengths),
            },
            np.zeros(len(images)),
        )

    def on_epoch_end(self):
        """Reshuffle the sample order after every epoch."""
        np.random.shuffle(self.indices)


In [None]:
import pandas as pd

# Settings shared by the training and validation generators.
batch_size = 32
image_size = (128, 32)
max_text_length = 100

# Paths to the CSV files describing the train / validation splits.
train_csv = "/content/drive/MyDrive/MoroccoAI-Hackathon-Project/data/IAM-Dataset/splits/train.csv"
val_csv = "/content/drive/MyDrive/MoroccoAI-Hackathon-Project/data/IAM-Dataset/splits/val.csv"

train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_csv, val_csv))

# One generator per split, configured identically.
train_gen, val_gen = (
    DataGenerator(split_df, batch_size, image_size, max_text_length)
    for split_df in (train_df, val_df)
)

print(f"Train generator batches: {len(train_gen)}")
print(f"Validation generator batches: {len(val_gen)}")


Train generator batches: 263
Validation generator batches: 33


## Step 2 : Define and Build the Handwritten Text Recognition (HTR) Model


In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Dense, Bidirectional, LSTM, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [1]:
def build_htr_model(image_size, max_text_length, num_classes):
    """Build and compile the CNN + BiLSTM training model with a built-in CTC loss.

    The network takes four inputs ("input_image", "label", "input_length",
    "label_length") and its single output is the per-sample CTC loss computed
    by a Lambda layer. The softmax predictions are therefore not an output of
    the returned model.

    Args:
        image_size: Two-element image size; a trailing channel axis of 1 is
            appended. Both dimensions are assumed divisible by 8 because of
            the three 2x2 poolings.
        max_text_length: Width of the padded label input.
        num_classes: Size of the softmax output. ``ctc_batch_cost`` reserves
            the last class (``num_classes - 1``) for the CTC blank, so this
            must be larger than every label index by at least one.

    Returns:
        A compiled Keras ``Model`` whose output is the CTC loss.
    """
    input_image = Input(shape=(*image_size, 1), name="input_image")
    labels = Input(shape=(max_text_length,), name="label")
    input_length = Input(shape=(1,), name="input_length")
    label_length = Input(shape=(1,), name="label_length")

    x = Conv2D(32, (3, 3), activation="relu", padding="same")(input_image)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = BatchNormalization()(x)

    x = Conv2D(64, (3, 3), activation="relu", padding="same")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = BatchNormalization()(x)

    x = Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = BatchNormalization()(x)

    # Three poolings shrink both spatial axes by 8. The first image axis
    # becomes the sequence (time) axis and the second is folded into the
    # 128-channel feature vector.
    new_shape = (image_size[0] // 8, (image_size[1] // 8) * 128)
    x = Reshape(target_shape=new_shape)(x)

    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Dropout(0.25)(x)
    x = Bidirectional(LSTM(256, return_sequences=True))(x)

    x = Dense(num_classes, activation="softmax")(x)

    def ctc_loss(args):
        y_pred, labels, input_length, label_length = args
        return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

    output = tf.keras.layers.Lambda(ctc_loss, name="ctc_loss")(
        [x, labels, input_length, label_length]
    )

    model = Model(
        inputs=[input_image, labels, input_length, label_length],
        outputs=output,
    )

    def passthrough_loss(y_true, y_pred):
        # The model output already is the CTC loss, so the compiled loss just
        # forwards it. Keras calls losses as loss(y_true, y_pred); passing the
        # one-argument `ctc_loss` helper here would raise a TypeError.
        return y_pred

    model.compile(optimizer=Adam(learning_rate=1e-4), loss=passthrough_loss)

    return model


In [7]:
image_size = (128, 32)
max_text_length = 100
# Class layout: 0 is unused (label padding value), 1..len(alphabet) are the
# characters, and the last class is the CTC blank. ctc_batch_cost always
# treats index num_classes - 1 as the blank, so one extra class is needed
# beyond the highest character index; with only +1 the final alphabet
# character would collide with the blank.
num_classes = len(alphabet) + 2

htr_model = build_htr_model(image_size, max_text_length, num_classes)
htr_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_image (InputLayer)    [(None, 128, 32, 1)]         0         []                            
                                                                                                  
 conv2d_3 (Conv2D)           (None, 128, 32, 32)          320       ['input_image[0][0]']         
                                                                                                  
 max_pooling2d_3 (MaxPoolin  (None, 64, 16, 32)           0         ['conv2d_3[0][0]']            
 g2D)                                                                                             
                                                                                                  
 batch_normalization_3 (Bat  (None, 64, 16, 32)           128       ['max_pooling2d_3[0][0]'

## Step 3 : Training the Handwritten Text Recognition Model

In [None]:
# DataGenerator expects the split DataFrame (it indexes it with .iloc and
# reads the 'image_path' / 'label' columns), not another generator.
train_generator = DataGenerator(
    train_df,
    batch_size=batch_size,
    image_size=image_size,
    max_text_length=max_text_length
)

val_generator = DataGenerator(
    val_df,
    batch_size=batch_size,
    image_size=image_size,
    max_text_length=max_text_length
)

In [None]:
# One step per generator batch, for training and validation alike.
steps_per_epoch, validation_steps = len(train_gen), len(val_gen)

history = htr_model.fit(
    x=train_gen,
    epochs=30,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
)

In [None]:
# Persist the trained model in HDF5 format.
htr_model.save('/content/drive/MyDrive/IAM-Dataset/handwritten_text_recognition_model.h5')

import matplotlib.pyplot as plt

# Draw both loss curves on the current axes so they share one figure.
loss_axes = plt.gca()
loss_axes.plot(history.history['loss'], label='Training loss')
loss_axes.plot(history.history['val_loss'], label='Validation loss')
loss_axes.set_xlabel('Epochs')
loss_axes.set_ylabel('Loss')
loss_axes.legend()
loss_axes.set_title('Training vs Validation Loss')
plt.show()