# Dataset: IAM Handwriting Database
Source: https://fki.tic.heia-fr.ch/databases/iam-handwriting-database

In [None]:
import os
import tarfile

download_dir = '/content/drive/MyDrive/Text-Recogntion/dataset/'

# Archive to unpack and the folder it is unpacked into.
tgz_path = os.path.join(download_dir, 'lines.tar')
extract_to = os.path.join(download_dir, 'Dataset-Lines')
# To unpack the XML annotations instead, swap in these two lines and re-run:
# tgz_path = os.path.join(download_dir, 'xml.tgz')
# extract_to = os.path.join(download_dir, 'Dataset-XML')

# 'r:*' lets tarfile detect the compression on its own, so this cell works
# whether the archive is a plain .tar or a gzip-compressed .tgz. A hard-coded
# 'r:gz' raises tarfile.ReadError when the file is not actually gzipped.
with tarfile.open(tgz_path, 'r:*') as tar:
    tar.extractall(path=extract_to)

In [None]:
# Sanity check: list the top-level folders of the extracted line-image archive.
!ls /content/drive/MyDrive/Text-Recogntion/dataset/Dataset-Lines

a01  a05  b03  c01  c06  d05  e02  f01	f07  g04  h01  h06  j06  k03  l03  m02	n01  n06  p06
a02  a06  b04  c02  d01  d06  e04  f02	g01  g05  h02  h07  j07  k04  l04  m03	n02  p01  r02
a03  b01  b05  c03  d03  d07  e06  f03	g02  g06  h04  j01  k01  k07  l07  m04	n03  p02  r03
a04  b02  b06  c04  d04  e01  e07  f04	g03  g07  h05  j04  k02  l01  m01  m06	n04  p03  r06


# Parse

In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_single_xml(xml_path, image_folder):
    """Collect (image path, transcription) pairs from one form XML file.

    Args:
        xml_path: Path to the XML annotation file of a single form.
        image_folder: Root of the extracted line images. Images are looked up
            as ``<image_folder>/<prefix>/<form-id>/<line-id>.png``, e.g.
            ``a01/a01-000u/a01-000u-00.png``.

    Returns:
        A list of ``{"image_path": ..., "label": ...}`` dicts, one for every
        ``<line>`` element that has a non-empty ``text`` and ``id`` attribute
        and whose image file exists on disk. Lines whose image is missing are
        skipped silently.
    """
    root = ET.parse(xml_path).getroot()
    data = []

    for line in root.findall(".//line"):
        text = line.attrib.get("text")  # transcription of the line
        img_name = line.attrib.get("id")  # line id, e.g. "a01-000u-00"

        if not (text and img_name):
            continue

        folder_name = img_name.split("-")[0]
        # The form id is the line id without its trailing "-NN" line number.
        # Form ids are not fixed-width ("a01-003" vs "a01-000u"), so slicing a
        # fixed 7 characters would miss every form with a suffix letter.
        subfolder = img_name.rsplit("-", 1)[0]
        img_path = os.path.join(image_folder, folder_name, subfolder, f"{img_name}.png")

        if os.path.exists(img_path):
            data.append({"image_path": img_path, "label": text})

    return data

In [None]:
images_folder = "/content/drive/MyDrive/Text-Recogntion/dataset/Dataset-Lines"
xml_folder = "/content/drive/MyDrive/Text-Recogntion/dataset/Dataset-XML"

# Parse every form XML and accumulate its (image path, label) records.
all_data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order,
# and therefore the seeded train/val/test split, reproducible across runs.
for xml_file in sorted(os.listdir(xml_folder)):
    if xml_file.endswith(".xml"):
        xml_path = os.path.join(xml_folder, xml_file)
        # Pass `images_folder` as defined above; the previous `image_folder`
        # was never defined and raised NameError on a fresh kernel.
        all_data.extend(parse_single_xml(xml_path, images_folder))

In [None]:
# Turn the parsed records into a table and print its first rows as a sanity check.
df = pd.DataFrame(data=all_data)
print(df.head(n=5))

                                          image_path  \
0  /content/drive/MyDrive/Text-Recogntion/dataset...   
1  /content/drive/MyDrive/Text-Recogntion/dataset...   
2  /content/drive/MyDrive/Text-Recogntion/dataset...   
3  /content/drive/MyDrive/Text-Recogntion/dataset...   
4  /content/drive/MyDrive/Text-Recogntion/dataset...   

                                              label  
0  Though they may gather some Left-wing support, a  
1       large majority of Labour M Ps are likely to  
2      turn down the Foot-Griffiths resolution. Mr.  
3           Foot's line will be that as Labour M Ps  
4         opposed the Government Bill which brought  


In [None]:
# Persist the image-path/label table as CSV (without the index column) so it
# can be reloaded later instead of re-parsing the XML annotations.
output_csv_path = "/content/drive/MyDrive/Text-Recogntion/dataset/iam_labels.csv"
df.to_csv(path_or_buf=output_csv_path, index=False)
print(f"Saved labels and image paths to: {output_csv_path}")

Saved labels and image paths to: /content/drive/MyDrive/Text-Recogntion/dataset/iam_labels.csv


# Split

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Location of the saved label table and of the folder that receives the split CSVs.
csv_path = "/content/drive/MyDrive/Text-Recogntion/dataset/iam_labels.csv"
output_folder = "/content/drive/MyDrive/Text-Recogntion/dataset/Split"

# The split below operates on the in-memory `df`. To start from the saved
# label file instead, load it first:
# df = pd.read_csv(csv_path)

# Hold out 20% of the rows, then halve the hold-out into validation and test,
# which yields an 80/10/10 split. The fixed seed makes the partition
# repeatable for the same input rows.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 42

train_df, temp_df = train_test_split(
    df, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=SPLIT_SEED
)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

Dataset splits saved to: /content/drive/MyDrive/Text-Recogntion/dataset/Split
Train size: 8407
Validation size: 1051
Test size: 1051


In [None]:
# Print the first rows of each split to eyeball the paths and labels.
print('Train :\n', train_df.head())
print('\n\nVal :\n', val_df.head())
print('\n\nTest :\n', test_df.head())

Train :
                                              image_path  \
9964  /content/drive/MyDrive/Text-Recogntion/dataset...   
2531  /content/drive/MyDrive/Text-Recogntion/dataset...   
3320  /content/drive/MyDrive/Text-Recogntion/dataset...   
9461  /content/drive/MyDrive/Text-Recogntion/dataset...   
617   /content/drive/MyDrive/Text-Recogntion/dataset...   

                                                  label  
9964                      him suddenly, making him say:  
2531         worst aspects of the American cinema. From  
3320  Self-control gives us: Freedom to worship God....  
9461         been attending some kind of fancy dress do  
617                                     in Algoma East.  


Val :
                                              image_path  \
7048  /content/drive/MyDrive/Text-Recogntion/dataset...   
3328  /content/drive/MyDrive/Text-Recogntion/dataset...   
4412  /content/drive/MyDrive/Text-Recogntion/dataset...   
248   /content/drive/MyDrive/Text-Recogntio

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (a no-op when it is already there).
os.makedirs(output_folder, exist_ok=True)

# Write each split to its own CSV, without the DataFrame index column.
train_df.to_csv(os.path.join(output_folder, "train.csv"), index=False)
val_df.to_csv(os.path.join(output_folder, "val.csv"), index=False)
test_df.to_csv(os.path.join(output_folder, "test.csv"), index=False)

print(f"Dataset splits saved to: {output_folder}")

Dataset splits saved to: /content/drive/MyDrive/Text-Recogntion/dataset/Split
