This notebook creates a TFRecord file of the dataset so that preprocessing does not have to be repeated on every run, which saves time and memory.

# Imports

In [None]:
"""
    This will create a ground truth file that I will use to
    feed the ground truths to my model.

    I will get the images and labels as a tf record file.
"""

import tensorflow as tf
import os
import cv2
import numpy as np
import re
import csv
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing the data

In [None]:

# Function to preprocess the image
# Pixel values are not normalized here; you can normalize the images after
# they have been loaded from the TFRecord file, e.g.:
# image = image / 255.0  # Normalize image to have values between 0 and 1
def preprocess_image(image_path):
    """Read an image as grayscale, resize it to 1000x64 and add a channel axis.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The resized grayscale image with a trailing channel dimension added.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Directly read as grayscale
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque error inside cv2.resize.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    new_size = (1000, 64)  # width, height
    image = cv2.resize(image, new_size)
    return np.expand_dims(image, axis=-1)  # Add channel dimension


# Function to preprocess the text
# Characters that are always emitted on their own with an "_isolated" suffix:
# Latin letters, ASCII digits and the extended Arabic-Indic digits.
_ISOLATED_CHAR_RE = re.compile('[A-Za-z0-9۱۲۳۴۵۶۷۸۹۰]')
# A ligature is closed right after any of these characters.
_NON_JOINERS = frozenset(
    ['آ', 'ا', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'ں', 'و', 'ے', '\"', '،', '(', ')', '؟', '۔', '!', ':']
)
# Characters that are removed from a ligature and produce no token.
_EXTRA_CHARS = frozenset(['\"', '،', '(', ')', '؟', '۔', '!', ':', 'ء'])


def _split_into_ligatures(text):
    """Split space-separated words into ligatures, cutting after each non-joiner."""
    ligatures = []
    for word in text.split(' '):
        current = ''
        for char in word:
            current += char
            if char in _NON_JOINERS:
                ligatures.append(current)
                current = ''
        if current:
            ligatures.append(current)
    return ligatures


def _ligature_tokens(ligature):
    """Return the positional tokens (``<char>_<position>``) for one ligature."""
    # Latin letters / digits come first, each as its own isolated token.
    tokens = [char + '_isolated' for char in ligature if _ISOLATED_CHAR_RE.match(char)]
    # What is left after dropping those characters and the extra characters.
    remaining = ''.join(
        char for char in ligature
        if char not in _EXTRA_CHARS and not _ISOLATED_CHAR_RE.match(char)
    )
    if len(remaining) == 1:
        tokens.append(remaining + '_isolated')
    elif remaining:
        tokens.append(remaining[0] + '_initial')
        tokens.extend(middle + '_middle' for middle in remaining[1:-1])
        tokens.append(remaining[-1] + '_final')
    return tokens


def preprocess_text(txt_pth, lt_pth):
    """Convert a ground-truth text file into a list of integer labels.

    The text is split on spaces into words and each word is cut into
    ligatures after every non-joiner character. Every ligature is turned
    into positional tokens (``_isolated``, ``_initial``, ``_middle``,
    ``_final``), and each token is looked up in the label table.

    Args:
        txt_pth: Path of the UTF-8 text file holding the ground truth.
        lt_pth: Path of a CSV file whose rows are ``token,integer_label``.
            It is read as UTF-8 (a leading BOM is ignored); blank rows are
            skipped.

    Returns:
        A list of integer labels, one per token; tokens missing from the
        label table map to 0. If anything fails after the text file has
        been opened, the error is printed and an empty list is returned.
    """
    with open(txt_pth, mode='r', encoding='utf-8-sig') as f:
        try:
            text = f.read()

            lig_list = []
            for ligature in _split_into_ligatures(text):
                lig_list.extend(_ligature_tokens(ligature))

            # Load the label dictionary from the CSV file. The keys contain
            # Urdu characters, so the encoding is fixed instead of relying on
            # the platform default.
            with open(lt_pth, mode='r', encoding='utf-8-sig', newline='') as lt_file:
                label_dict = {row[0]: int(row[1]) for row in csv.reader(lt_file) if row}

            # Convert the ligature tokens to labels
            return [label_dict.get(lig, 0) for lig in lig_list]

        except Exception as e:
            # Best effort: report the problem and hand back an empty label list.
            print("Exception occurred")
            print(e)
            return []


# Function to pad labels
def pad_single_label(label, max_label_length, pad_value=999):
    """Pad one label sequence at the end up to ``max_label_length``.

    Args:
        label: The label sequence to pad.
        max_label_length: Target length passed to ``pad_sequences`` as ``maxlen``.
        pad_value: Value used to fill the padded positions.

    Returns:
        The single padded row produced by ``pad_sequences``.
    """
    # pad_sequences works on a batch, so wrap the label in a one-element list
    # and unwrap the only row afterwards.
    batch_of_one = pad_sequences(
        [label],
        maxlen=max_label_length,
        padding='post',
        value=pad_value,
    )
    return batch_of_one[0]


def pad_labels(labels, value=999):
    """Pad every label sequence at the end to the length of the longest one.

    Args:
        labels: A collection of label sequences.
        value: Value used to fill the padded positions.

    Returns:
        The padded batch produced by ``pad_sequences``.
    """
    longest = max(map(len, labels))
    return pad_sequences(labels, maxlen=longest, padding='post', value=value)


# Create TFRecord File

In [None]:
# Function to parse TFRecord file
def parse_tfrecord_fn(example):
    """Decode one serialized example into ``(image, label, max_label_length)``.

    Args:
        example: A serialized example holding the features ``image``
            (encoded bytes), ``label`` (variable-length int64) and
            ``max_label_length`` (scalar int64).

    Returns:
        A tuple of the decoded single-channel image converted to float32,
        the dense label tensor, and the stored maximum label length.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.VarLenFeature(tf.int64),
        'max_label_length': tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, schema)

    # Encoded JPEG bytes -> single-channel tensor -> float32
    decoded = tf.io.decode_jpeg(parsed['image'], channels=1)
    pixels = tf.image.convert_image_dtype(decoded, tf.float32)

    # The variable-length label is parsed as a sparse tensor; densify it.
    dense_label = tf.sparse.to_dense(parsed['label'])

    return pixels, dense_label, parsed['max_label_length']


# Load TFRecord dataset
def load_tfrecord_dataset(file_path):
    """Open a TFRecord file and parse every record with ``parse_tfrecord_fn``.

    Args:
        file_path: Path of the TFRecord file.

    Returns:
        A dataset yielding ``(image, label, max_label_length)`` tuples.
    """
    raw_records = tf.data.TFRecordDataset(file_path)
    return raw_records.map(parse_tfrecord_fn)


# Function to load the maximum sequence length
def load_max_sequence_length(file_path):
    """Read the maximum sequence length from the first line of a text file.

    The first line is expected to look like ``<description>: <integer>``;
    the piece right after the first ``": "`` separator is parsed as an int.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The integer found after the ``": "`` separator on the first line.
    """
    with open(file_path, "r") as handle:
        first_line = handle.readline()
    fields = first_line.strip().split(": ")
    return int(fields[1])


# Function to display images and print labels
def display_image(image, label, max_label_length):
    """Print an example's label information and show its image with OpenCV.

    Args:
        image: Image tensor; it is multiplied by 255 and cast to uint8
            before being shown.
        label: Label tensor, printed as a NumPy array.
        max_label_length: Tensor holding the maximum label length, printed
            as a NumPy value.
    """
    # Convert image back to uint8 for display
    scaled = image.numpy().squeeze() * 255
    frame = scaled.astype(np.uint8)
    label_values = label.numpy()

    print(f"Label: {label_values}")
    print(f"Max Label Length: {max_label_length.numpy()}")

    cv2.imshow('Image', frame)
    # Only the low byte of the key code is compared; the windows are closed
    # only when that key is 'q'.
    pressed_key = cv2.waitKey(0) & 0xFF
    if pressed_key == ord('q'):
        cv2.destroyAllWindows()


# Define paths
# tfrecord_file = '/home/cle-dl-05/Documents/3.PdfOCR/preprocessing_utils/dataset_30k.tfrecord'
# max_label_length_file = '/home/cle-dl-05/Documents/3.PdfOCR/preprocessing_utils/max_label_length.txt'

# # Load TFRecord dataset
# dataset = load_tfrecord_dataset(tfrecord_file)

# # Load maximum sequence length
# max_sequence_length = load_max_sequence_length(max_label_length_file)
# print(f"Max Sequence Length: {max_sequence_length}")

# # Display some images from the first batch
# for image, label, max_label_length in dataset.take(5):
#     display_image(image, label, max_label_length)

# Load the TFRecord File

In [None]:
import tensorflow as tf

# Function to parse TFRecord file
def parse_tfrecord_fn(example):
    """Parse a serialized example and return its image, label and max label length.

    Args:
        example: A serialized example with the features ``image``,
            ``label`` and ``max_label_length``.

    Returns:
        ``(image, label, max_label_length)`` where the image is decoded as a
        single-channel JPEG and converted to float32, and the label is
        converted from its sparse form to a dense tensor.
    """
    features = tf.io.parse_single_example(
        example,
        {
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.VarLenFeature(tf.int64),
            'max_label_length': tf.io.FixedLenFeature([], tf.int64)
        },
    )
    grayscale = tf.io.decode_jpeg(features['image'], channels=1)
    image = tf.image.convert_image_dtype(grayscale, tf.float32)  # Convert image to float32
    label = tf.sparse.to_dense(features['label'])
    return image, label, features['max_label_length']

# Load TFRecord file
def load_tfrecord_dataset(file_path):
    """Build a dataset from a TFRecord file, parsing each record.

    Args:
        file_path: Path of the TFRecord file.

    Returns:
        The record dataset mapped through ``parse_tfrecord_fn``.
    """
    return tf.data.TFRecordDataset(file_path).map(parse_tfrecord_fn)

# Define paths
tfrecord_file = '/home/cle-dl-05/Documents/3.PdfOCR/preprocessing_utils/30k_dataset.tfrecord'

# Load TFRecord dataset
dataset = load_tfrecord_dataset(tfrecord_file)

# Determine the number of examples.
# tf.data cannot infer the cardinality of a file-backed TFRecord dataset and
# reports the negative UNKNOWN_CARDINALITY sentinel. Feeding that sentinel
# into the split below would yield a train_size of -1, so take() would keep
# every example and skip() would drop every example. Count the raw records
# (no image decoding needed) when the cardinality is unknown.
dataset_size = int(tf.data.experimental.cardinality(dataset).numpy())
if dataset_size == tf.data.experimental.UNKNOWN_CARDINALITY:
    dataset_size = sum(1 for _ in tf.data.TFRecordDataset(tfrecord_file))

# Split dataset into train and test (80% / 20%, in file order)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size
train_dataset = dataset.take(train_size)
test_dataset = dataset.skip(train_size)

# Print sizes of train and test datasets
print("Train Dataset Size:", train_size)
print("Test Dataset Size:", test_size)

# Print one example from the training data (the dataset is not batched)
for image, label, max_label_length in train_dataset.take(1):
    print("Image Shape:", image.shape)
    print("Label:", label)
    print("Max Label Length:", max_label_length)
