# Dataset Preparation

## Organize files by writer

In [2]:
import os
import shutil
import xml.etree.ElementTree as ET
import random
import numpy as np
from keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split

In [3]:
# Dataset locations, all rooted at the top-level data folder
data_dir = 'data'
xml_dir, lines_dir, output_dir = (
    os.path.join(data_dir, child) for child in ('xml', 'lines', 'writers')
)

# Create the per-writer output root up front; a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Look up the writer of a form from its XML metadata
def get_author_id(xml_file):
    """Return the ``writer-id`` attribute of the XML document's root element.

    ``xml_file`` is handed directly to ``ET.parse``. The result is ``None``
    when the root element has no ``writer-id`` attribute.
    """
    return ET.parse(xml_file).getroot().attrib.get('writer-id')

In [5]:
# Move every .png found under lines/ into an output folder named after the
# writer recorded in the image's XML file.
# os.walk already yields the file names of each directory, so no second
# directory listing is needed.
for subdir, _, file_names in os.walk(lines_dir):
    for file_name in file_names:
        if not file_name.endswith('.png'):
            continue

        # The XML file is named like the image minus its last '-'-separated
        # field, e.g. 'x-y-00.png' -> 'x-y.xml'
        base_name = '-'.join(file_name.split('-')[:-1]) + '.xml'
        xml_file = os.path.join(xml_dir, base_name)

        if not os.path.exists(xml_file):
            print(f'XML file {xml_file} not found for image {file_name}')
            continue

        author_id = get_author_id(xml_file)
        if not author_id:
            # get_author_id returns None when the attribute is absent, and
            # os.path.join(output_dir, None) would raise TypeError; an empty
            # id would drop the image directly into the output root.
            print(f'No writer-id in {xml_file}; skipping image {file_name}')
            continue

        author_dir = os.path.join(output_dir, author_id)
        os.makedirs(author_dir, exist_ok=True)

        src_image_path = os.path.join(subdir, file_name)
        dst_image_path = os.path.join(author_dir, file_name)
        shutil.move(src_image_path, dst_image_path)
        print(f'Moved {src_image_path} to {dst_image_path}')

## Set up Inputs for training

In [27]:
# Configuration for pair generation
# NOTE: this rebinds data_dir, which the path-setup cell earlier set to 'data'
pairs_per_author = 10  # positive pairs and negative pairs drawn per author
img_size = (105, 105)  # target size passed to load_img when reading an image
data_dir = os.path.join('data', 'writers')  # root holding one subdirectory per author

# Functions
def load_images_from_folder(folder):
    """Load every ``.png`` file directly inside ``folder``.

    Each file is read with ``load_img`` in grayscale at ``img_size``,
    converted with ``img_to_array`` and divided by 255.0. Files with any
    other extension are ignored.

    Returns:
        A list with one array per ``.png`` file, in ``os.listdir`` order
        (empty when the folder holds no ``.png`` files).
    """
    png_paths = [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith('.png')
    ]
    return [
        img_to_array(load_img(path, target_size=img_size, color_mode='grayscale')) / 255.0
        for path in png_paths
    ]


def create_pairs():
    """Build labelled image pairs from the author folders under ``data_dir``.

    For each author subdirectory that contains images this draws:

    * ``pairs_per_author`` positive pairs: two distinct images of that
      author, label 1. Skipped when the author has fewer than two images,
      because ``random.sample(images, 2)`` would raise ``ValueError``.
    * ``pairs_per_author`` negative pairs: one image of that author and one
      image of a randomly chosen other author, label 0. Skipped when no
      other author has images.

    Every author folder is loaded exactly once through
    ``load_images_from_folder`` and kept in memory for the duration of the
    call, instead of re-reading the other author's folder for each negative
    pair.

    Returns:
        ``(pairs, labels)``: a list of ``(img1, img2)`` tuples and a
        parallel list of integer labels. Both are empty when ``data_dir``
        holds no author folder with images.
    """
    # Sorted so a seeded ``random`` gives the same pairs on every run;
    # os.listdir order is arbitrary.
    author_names = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    # Only authors with at least one image can take part in any pair.
    images_by_author = {}
    for author in author_names:
        author_images = load_images_from_folder(os.path.join(data_dir, author))
        if author_images:
            images_by_author[author] = author_images

    pairs = []
    labels = []

    for author, images in images_by_author.items():
        # Positive pairs need two distinct images of the same author
        if len(images) >= 2:
            for _ in range(pairs_per_author):
                img1, img2 = random.sample(images, 2)
                pairs.append((img1, img2))
                labels.append(1)

        # Negative pairs need at least one other author with images
        other_authors = [name for name in images_by_author if name != author]
        if not other_authors:
            continue
        for _ in range(pairs_per_author):
            other_images = images_by_author[random.choice(other_authors)]
            pairs.append((random.choice(images), random.choice(other_images)))
            labels.append(0)

    return pairs, labels

def save_arrays(X_train_1, X_train_2, labels_train, X_val_1, X_val_2, labels_val, prefix='dataset'):
    """Write the six train/validation arrays as ``.npy`` files.

    Files go to ``data/tf_inputs`` (created if missing) and are named
    ``<prefix>_<name>.npy``, where ``<name>`` is the parameter name of the
    array (``X_train_1``, ``X_train_2``, ``labels_train``, ``X_val_1``,
    ``X_val_2``, ``labels_val``). A confirmation line is printed afterwards.
    """
    input_folder = os.path.join('data', 'tf_inputs')
    os.makedirs(input_folder, exist_ok=True)

    # Keyed by the suffix used in the file name; written in this order
    named_arrays = {
        'X_train_1': X_train_1,
        'X_train_2': X_train_2,
        'labels_train': labels_train,
        'X_val_1': X_val_1,
        'X_val_2': X_val_2,
        'labels_val': labels_val,
    }
    for suffix, array in named_arrays.items():
        np.save(os.path.join(input_folder, f'{prefix}_{suffix}.npy'), array)

    print(f'Data saved in "{input_folder}" with prefix "{prefix}"')

In [25]:
pairs, labels = create_pairs()

# Stop with an explicit message when nothing was generated; otherwise the
# failure only shows up later as an error from train_test_split.
if not pairs:
    raise ValueError(
        f'No image pairs were generated; check that "{data_dir}" contains '
        'author subdirectories with .png images.'
    )

# Stacking the (img1, img2) tuples puts the pair member on axis 1
pairs = np.array(pairs)
labels = np.array(labels)

# Split into training and validation sets (80/20, fixed seed)
pairs_train, pairs_val, labels_train, labels_val = train_test_split(
    pairs, labels, test_size=0.2, random_state=42
)

# Separate the two members of each pair by slicing axis 1 instead of
# rebuilding the arrays one pair at a time in Python.
X_train_1, X_train_2 = pairs_train[:, 0], pairs_train[:, 1]
X_val_1, X_val_2 = pairs_val[:, 0], pairs_val[:, 1]

print(f'X_train_1 shape: {X_train_1.shape}')
print(f'X_train_2 shape: {X_train_2.shape}')
print(f'labels_train shape: {labels_train.shape}')
print(f'X_val_1 shape: {X_val_1.shape}')
print(f'X_val_2 shape: {X_val_2.shape}')
print(f'labels_val shape: {labels_val.shape}')

# Save data
save_arrays(X_train_1, X_train_2, labels_train, X_val_1, X_val_2, labels_val)


test
Error: The pairs or labels array is empty.
X_train_1 shape: (0,)
X_train_2 shape: (0,)


AttributeError: 'list' object has no attribute 'shape'