### Manually Splitting IAM Dataset

Create our own train, validation (two separate sets), and test splits from the IAM dataset.


In [23]:
import random

# Build train / val 1 / val 2 / test file lists from the names in all.txt.
#
# The split is made reproducible on purpose: the names are sorted (set
# iteration order for strings varies between interpreter runs) and shuffled
# with a dedicated, fixed-seed RNG, so re-running this cell never silently
# moves a file from one split to another.
SPLIT_SEED = 42
IAM_DIR = "../../datasets/IAM"

# Read all the file names, skipping blank lines and reporting duplicates.
label_file_names = set()
with open(f"{IAM_DIR}/all.txt", encoding="utf-8") as f:
    for line in f:
        name = line.strip()
        if not name:
            # A blank line is not a file name; keep "" out of the splits.
            continue
        if name in label_file_names:
            print(f"Duplicate file name in all.txt: {name}")
        label_file_names.add(name)

if not label_file_names:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"No file names found in {IAM_DIR}/all.txt")

# Shuffle the file names deterministically. A private Random instance is used
# so the global random state seen by other cells is left untouched.
label_file_names = sorted(label_file_names)
random.Random(SPLIT_SEED).shuffle(label_file_names)

# Split the file names into sets - 70% train, 10% val 1, 10% val 2, 10% test.
num_files = len(label_file_names)
train_end = int(num_files * 0.7)
val_1_end = int(num_files * 0.8)
val_2_end = int(num_files * 0.9)

train_file_names = label_file_names[:train_end]
val_1_file_names = label_file_names[train_end:val_1_end]
val_2_file_names = label_file_names[val_1_end:val_2_end]
test_file_names = label_file_names[val_2_end:]

# (report label, output file, names) for each split, in report order.
splits = [
    ("Train", "trainset.txt", train_file_names),
    ("Val 1", "valset1.txt", val_1_file_names),
    ("Val 2", "valset2.txt", val_2_file_names),
    ("Test", "testset.txt", test_file_names),
]

# Print percentages of each set.
print(
    "\n".join(
        f"{label}: {len(names) / num_files * 100:.2f}% ({len(names)})"
        for label, _, names in splits
    )
)

# Write the file names to text files, one name per line.
for _, out_name, names in splits:
    with open(f"{IAM_DIR}/{out_name}", "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in names)

Train: 69.95% (1208)
Val 1: 10.02% (173)
Val 2: 10.02% (173)
Test: 10.02% (173)
