In [3]:
import os
import random
import json
from PIL import Image
import numpy as np

def load_line_info(file_path, base_dir):
    line_info = {}
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('#') or line.strip() == '':
                continue
            parts = line.strip().split()
            line_id = parts[0]
            transcription = ' '.join(parts[8:]).replace('|', ' ')
            
            # Construct the image path more flexibly
            form_id = line_id[:3]
            line_group = line_id[:8]
            image_filename = f"{line_id}.png"
            
            # Check for both possible directory structures
            possible_paths = [
                os.path.join(base_dir, "lines", form_id, line_group, image_filename),
                os.path.join(base_dir, "lines", form_id, f"{line_group}-", image_filename)
            ]
            
            image_path = next((path for path in possible_paths if os.path.exists(path)), None)
            
            if image_path:
                line_info[line_id] = {
                    'transcription': transcription,
                    'bounding_box': list(map(int, parts[4:8])),
                    'image_path': image_path
                }
            else:
                print(f"Warning: Image not found for line ID {line_id}")
    
    return line_info

def create_paragraph_image(line_info, num_lines):
    selected_lines = random.sample(list(line_info.keys()), num_lines)
    images = []
    transcriptions = []
    max_width = 0
    total_height = 0
    
    for line_id in selected_lines:
        img_path = line_info[line_id]['image_path']
        try:
            img = Image.open(img_path)
            images.append(img)
            transcriptions.append(line_info[line_id]['transcription'])
            max_width = max(max_width, img.width)
            total_height += img.height
        except Exception as e:
            print(f"Error opening image {img_path}: {e}")
            continue

    if not images:
        raise ValueError("No valid images found to create paragraph")

    # Add some padding
    max_width += 20
    total_height += 20 * (len(images) + 1)

    paragraph_image = Image.new('L', (max_width, total_height), color=255)
    y_offset = 10
    
    for img in images:
        x_offset = random.randint(0, 20)
        paragraph_image.paste(img, (x_offset, y_offset))
        y_offset += img.height + random.randint(5, 15)

    return paragraph_image, '\n'.join(transcriptions)

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        try:
            image, transcription = create_paragraph_image(line_info, num_lines)
            
            image_filename = f"synthetic_paragraph_{i+1}.png"
            image_path = os.path.join(output_path, image_filename)
            image.save(image_path)
            
            data_entry = {
                "query": "<image>what does this say?",
                "response": transcription,
                "images": [image_path]
            }
            dataset.append(data_entry)
        except Exception as e:
            print(f"Error generating paragraph {i+1}: {e}")
            continue

    return dataset

# Main execution
if __name__ == "__main__":
    base_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator"
    line_info_path = os.path.join(base_dir, "lines.txt")
    output_path = os.path.join(base_dir, "synthetic_paragraphs")
    num_samples = 10000  # Adjust as needed

    dataset = generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples)

    # Save dataset to JSON file
    json_path = os.path.join(base_dir, 'synthetic_paragraph_dataset.json')
    with open(json_path, 'w') as f:
        json.dump(dataset, f, indent=4)

    print(f"{len(dataset)} synthetic paragraph images have been generated and saved in {output_path}.")
    print(f"Dataset information has been saved to {json_path}.")

100 synthetic paragraph images have been generated and saved in C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs.
Dataset information has been saved to C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraph_dataset.json.


In [15]:
import os
import random
import json
from PIL import Image
import numpy as np

def load_line_info(file_path, base_dir):
    line_info = {}
    for line in open(file_path, 'r'):
        if line.startswith('#') or line.strip() == '':
            continue
        parts = line.strip().split()
        line_id = parts[0]
        transcription = ' '.join(parts[8:]).replace('|', ' ')
        
        form_id = line_id[:3]
        line_group = line_id[:8]
        image_filename = f"{line_id}.png"
        
        possible_paths = [
            os.path.join(base_dir, "lines", form_id, line_group, image_filename),
            os.path.join(base_dir, "lines", form_id, f"{line_group}-", image_filename)
        ]
        
        image_path = next((path for path in possible_paths if os.path.exists(path)), None)
        
        if image_path:
            try:
                # Attempt to open the image to ensure it's valid
                with Image.open(image_path) as img:
                    line_info[line_id] = {
                        'transcription': transcription,
                        'bounding_box': list(map(int, parts[4:8])),
                        'image_path': image_path
                    }
            except Exception:
                # If the image can't be opened, we simply don't add it to line_info
                pass
    
    return line_info

def create_paragraph_image(line_info, num_lines):
    selected_lines = random.sample(list(line_info.keys()), num_lines)
    images = []
    transcriptions = []
    
    for line_id in selected_lines:
        img_path = line_info[line_id]['image_path']
        img = Image.open(img_path)
        images.append(img)
        transcriptions.append(line_info[line_id]['transcription'])

    max_width = max(img.width for img in images) + 20
    total_height = sum(img.height for img in images) + 20 * (len(images) + 1)

    paragraph_image = Image.new('L', (max_width, total_height), color=255)
    y_offset = 10
    
    for img in images:
        x_offset = random.randint(0, 20)
        paragraph_image.paste(img, (x_offset, y_offset))
        y_offset += img.height + random.randint(5, 15)

    return paragraph_image, '\n'.join(transcriptions)

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    print(f"Loaded {len(line_info)} valid line entries.")

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        image, transcription = create_paragraph_image(line_info, num_lines)
        
        image_filename = f"synthetic_paragraph_{i+1}.png"
        image_path = os.path.join(output_path, image_filename)
        image.save(image_path)
        
        data_entry = {
            "query": "<image>what does this say?",
            "response": transcription,
            "images": [image_path]
        }
        dataset.append(data_entry)

    return dataset

# Main execution
if __name__ == "__main__":
    base_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator"
    line_info_path = os.path.join(base_dir, "lines.txt")
    output_path = os.path.join(base_dir, "synthetic_paragraphs")
    num_samples = 1000  # Adjust as needed

    dataset = generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples)

    print(f"\nGenerated {len(dataset)} synthetic paragraph images.")

    # Save dataset to JSON file
    json_path = os.path.join(base_dir, 'synthetic_paragraph_dataset.json')
    with open(json_path, 'w') as f:
        json.dump(dataset, f, indent=4)

    print(f"\nDataset information has been saved to {json_path}.")

Loaded 2844 valid line entries.

Generated 1000 synthetic paragraph images.

Dataset information has been saved to C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraph_dataset.json.


In [16]:
# BIG FIX
import os
import random
import json
from PIL import Image

def load_line_info(file_path, base_dir):
    line_info = {}
    for line in open(file_path, 'r'):
        if line.startswith('#') or line.strip() == '':
            continue
        parts = line.strip().split()
        line_id = parts[0]
        transcription = ' '.join(parts[8:]).replace('|', ' ')
        
        form_id = line_id[:3]
        line_group = line_id[:8]
        image_filename = f"{line_id}.png"
        
        possible_paths = [
            os.path.join(base_dir, "lines", form_id, line_group, image_filename),
            os.path.join(base_dir, "lines", form_id, f"{line_group}-", image_filename)
        ]
        
        image_path = next((path for path in possible_paths if os.path.exists(path)), None)
        
        if image_path:
            try:
                with Image.open(image_path) as img:
                    line_info[line_id] = {
                        'transcription': transcription,
                        'bounding_box': list(map(int, parts[4:8])),
                        'image_path': image_path
                    }
            except Exception:
                pass
    
    return line_info

def process_paragraph_image(img, canvas_size=(512, 512), target_size=350):
    if img.width > img.height:
        scale_factor = target_size / img.width
    else:
        scale_factor = target_size / img.height
    
    new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
    
    img = img.resize(new_size, Image.Resampling.LANCZOS)
    
    rotation_angle = random.uniform(-20, 20)
    img = img.rotate(rotation_angle, expand=True, fillcolor=255)
    
    canvas = Image.new('L', canvas_size, color=255)
    
    max_x_offset = canvas_size[0] - img.width
    max_y_offset = canvas_size[1] - img.height
    x_offset = random.randint(0, max_x_offset)
    y_offset = random.randint(0, max_y_offset)
    
    canvas.paste(img, (x_offset, y_offset), img if img.mode == 'RGBA' else None)
    
    return canvas

def create_paragraph_image(line_info, num_lines):
    selected_lines = random.sample(list(line_info.keys()), num_lines)
    images = []
    transcriptions = []
    
    for line_id in selected_lines:
        img_path = line_info[line_id]['image_path']
        img = Image.open(img_path)
        images.append(img)
        transcriptions.append(line_info[line_id]['transcription'])

    max_width = max(img.width for img in images) + 20
    total_height = sum(img.height for img in images) + 20 * (len(images) + 1)

    paragraph_image = Image.new('L', (max_width, total_height), color=255)
    y_offset = 10
    
    for img in images:
        x_offset = random.randint(0, 20)
        paragraph_image.paste(img, (x_offset, y_offset))
        y_offset += img.height + random.randint(5, 15)

    return paragraph_image, '\n'.join(transcriptions)

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples, transformations_per_image=5):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    print(f"Loaded {len(line_info)} valid line entries.")

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        original_image, transcription = create_paragraph_image(line_info, num_lines)
        
        for j in range(transformations_per_image):
            image_filename = f"good_paragraph_{i+1}_{j+1}.png"
            image_path = os.path.join(output_path, image_filename)
            
            # Process the original image with augmentation
            processed_image = process_paragraph_image(original_image)
            processed_image.save(image_path)
            
            # Store relative path and ensure consistent format
            image_paths = [f"synthetic_paragraphs/{image_filename}"]
            
            # Create a new data entry for each augmented image
            data_entry = {
                "query": "<image>what does this say?",
                "response": transcription,
                "images": image_paths
            }
            dataset.append(data_entry)

    return dataset

# Main execution
if __name__ == "__main__":
    base_dir = os.getcwd()
    line_info_path = os.path.join(base_dir, "lines.txt")
    output_path = os.path.join(base_dir, "synthetic_paragraphs")
    num_samples = 1000  # Adjust as needed

    dataset = generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples)

    print(f"\nGenerated {len(dataset)} synthetic paragraph images with augmentations.")

    # Save dataset to JSON file
    json_path = os.path.join(base_dir, 'synthetic_notes_dataset.json')
    with open(json_path, 'w') as f:
        json.dump(dataset, f, indent=4)

    print(f"\nDataset information has been saved to {json_path}.")


Loaded 2844 valid line entries.

Generated 5000 synthetic paragraph images with augmentations.

Dataset information has been saved to c:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_notes_dataset.json.


In [17]:
import json
import random

# Load the dataset.json
with open('dataset.json', 'r') as f:
    dataset = json.load(f)

# Load the synthetic_notes_dataset.json
with open('synthetic_notes_dataset.json', 'r') as f:
    synthetic_notes_data = json.load(f)

# Combine the two datasets
combined_dataset = dataset + synthetic_notes_data

# Shuffle the combined dataset to ensure randomness
random.shuffle(combined_dataset)

# Split the combined dataset into 90% train and 10% validation
split_index = int(0.9 * len(combined_dataset))
train_data = combined_dataset[:split_index]
val_data = combined_dataset[split_index:]

# Save the train.json
with open('train.json', 'w') as f:
    json.dump(train_data, f, indent=4)

# Save the val.json
with open('val.json', 'w') as f:
    json.dump(val_data, f, indent=4)

print(f"Train and validation sets created: {len(train_data)} train samples, {len(val_data)} validation samples.")


Train and validation sets created: 11239 train samples, 1249 validation samples.


In [14]:
# mid way try
def process_paragraph_image(img, canvas_size=(512, 512), target_size=350):
    # Determine the scaling factor so that the largest dimension is 350px
    if img.width > img.height:
        scale_factor = target_size / img.width
    else:
        scale_factor = target_size / img.height
    
    # Calculate the new size
    new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
    
    # Resize the image while maintaining aspect ratio
    img = img.resize(new_size, Image.Resampling.LANCZOS)
    
    # Apply random rotation between -20 and +20 degrees
    rotation_angle = random.uniform(-20, 20)
    img = img.rotate(rotation_angle, expand=True, fillcolor=255)
    
    # Create a blank canvas of 512x512 with white background
    canvas = Image.new('L', canvas_size, color=255)
    
    # Generate random position for the image on the canvas
    max_x_offset = canvas_size[0] - img.width
    max_y_offset = canvas_size[1] - img.height
    x_offset = random.randint(0, max_x_offset)
    y_offset = random.randint(0, max_y_offset)
    
    # Paste the image onto the canvas
    canvas.paste(img, (x_offset, y_offset), img if img.mode == 'RGBA' else None)
    
    return canvas

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples, transformations_per_image=5):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    print(f"Loaded {len(line_info)} valid line entries.")

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        original_image, transcription = create_paragraph_image(line_info, num_lines)
        
        image_paths = []
        for j in range(transformations_per_image):
            image_filename = f"good_paragraph_{i+1}_{j+1}.png"
            image_path = os.path.join(output_path, image_filename)
            
            # Process the original image
            processed_image = process_paragraph_image(original_image)
            processed_image.save(image_path)
            
            # Store only the relative path for Linux compatibility
            image_paths.append(f"good_paragraphs/{image_filename}")
        
        data_entry = {
            "query": "<image>what does this say?",
            "response": transcription,
            "images": image_paths
        }
        dataset.append(data_entry)

    return dataset

# Main execution
if __name__ == "__main__":
    # Use current working directory instead of hardcoded path
    base_dir = os.getcwd()
    line_info_path = os.path.join(base_dir, "lines.txt")
    output_path = os.path.join(base_dir, "good_paragraphs")
    num_samples = 1000  # Adjust as needed

    dataset = generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples)

    print(f"\nGenerated {len(dataset)} synthetic paragraph images, each with 5 transformations.")

    # Save dataset to JSON file
    json_path = os.path.join(base_dir, 'synthetic_notes_dataset.json')
    with open(json_path, 'w') as f:
        json.dump(dataset, f, indent=4)

    print(f"\nDataset information has been saved to {json_path}.")

Loaded 2844 valid line entries.

Generated 1000 synthetic paragraph images, each with 5 transformations.

Dataset information has been saved to c:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_notes_dataset.json.


In [7]:
import os
import random
from PIL import Image

def process_paragraph_images(input_dir, output_dir, canvas_size=(512, 512), target_size=350):
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through all images in the input directory
    for image_filename in os.listdir(input_dir):
        if image_filename.endswith('.png'):
            image_path = os.path.join(input_dir, image_filename)
            try:
                # Open image
                img = Image.open(image_path)
                
                # Determine the scaling factor so that the largest dimension is 350px
                if img.width > img.height:
                    scale_factor = target_size / img.width
                else:
                    scale_factor = target_size / img.height
                
                # Calculate the new size
                new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
                
                # Resize the image while maintaining aspect ratio
                img = img.resize(new_size, Image.Resampling.LANCZOS)
                
                # Apply random rotation between -20 and +20 degrees
                rotation_angle = random.uniform(-20, 20)
                img = img.rotate(rotation_angle, expand=True, fillcolor=255)
                
                # Create a blank canvas of 512x512 with white background
                canvas = Image.new('L', canvas_size, color=255)
                
                # Generate random position for the image on the canvas
                max_x_offset = canvas_size[0] - img.width
                max_y_offset = canvas_size[1] - img.height
                x_offset = random.randint(0, max_x_offset)
                y_offset = random.randint(0, max_y_offset)
                
                # Paste the image onto the canvas
                canvas.paste(img, (x_offset, y_offset), img if img.mode == 'RGBA' else None)
                
                # Save the processed image to the new folder
                output_image_path = os.path.join(output_dir, image_filename)
                canvas.save(output_image_path)
                
                print(f"Processed and saved: {output_image_path}")
            except Exception as e:
                print(f"Failed to process {image_filename}: {e}")

# Main execution
if __name__ == "__main__":
    input_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator\\synthetic_paragraphs"
    output_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator\\paragraph_data"
    
    process_paragraph_images(input_dir, output_dir)


Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_1.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_10.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_100.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_1000.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_101.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_102.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\paragraph_data\synthetic_paragraph_103.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Hand

In [9]:
import os
import random
import json
from PIL import Image
import numpy as np

def load_line_info(file_path, base_dir):
    line_info = {}
    for line in open(file_path, 'r'):
        if line.startswith('#') or line.strip() == '':
            continue
        parts = line.strip().split()
        line_id = parts[0]
        transcription = ' '.join(parts[8:]).replace('|', ' ')
        
        form_id = line_id[:3]
        line_group = line_id[:8]
        image_filename = f"{line_id}.png"
        
        possible_paths = [
            os.path.join(base_dir, "lines", form_id, line_group, image_filename),
            os.path.join(base_dir, "lines", form_id, f"{line_group}-", image_filename)
        ]
        
        image_path = next((path for path in possible_paths if os.path.exists(path)), None)
        
        if image_path:
            try:
                # Attempt to open the image to ensure it's valid
                with Image.open(image_path) as img:
                    line_info[line_id] = {
                        'transcription': transcription,
                        'bounding_box': list(map(int, parts[4:8])),
                        'image_path': image_path
                    }
            except Exception:
                # If the image can't be opened, we simply don't add it to line_info
                pass
    
    return line_info

def create_paragraph_image(line_info, num_lines):
    selected_lines = random.sample(list(line_info.keys()), num_lines)
    images = []
    transcriptions = []
    
    for line_id in selected_lines:
        img_path = line_info[line_id]['image_path']
        img = Image.open(img_path)
        images.append(img)
        transcriptions.append(line_info[line_id]['transcription'])

    max_width = max(img.width for img in images) + 20
    total_height = sum(img.height for img in images) + 20 * (len(images) + 1)

    paragraph_image = Image.new('L', (max_width, total_height), color=255)
    y_offset = 10
    
    for img in images:
        x_offset = random.randint(0, 20)
        paragraph_image.paste(img, (x_offset, y_offset))
        y_offset += img.height + random.randint(5, 15)

    return paragraph_image, '\n'.join(transcriptions)

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    print(f"Loaded {len(line_info)} valid line entries.")

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        image, transcription = create_paragraph_image(line_info, num_lines)
        
        image_filename = f"synthetic_paragraph_{i+1}.png"
        image_path = os.path.join(output_path, image_filename)
        image.save(image_path)
        
        data_entry = {
            "query": "<image>what does this say?",
            "response": transcription,
            "images": [image_path]
        }
        dataset.append(data_entry)

    return dataset

def process_paragraph_images(input_dir, output_dir, canvas_size=(512, 512), target_size=350):
    os.makedirs(output_dir, exist_ok=True)

    processed_dataset = []

    # Iterate through all images in the input directory
    for image_filename in os.listdir(input_dir):
        if image_filename.endswith('.png'):
            image_path = os.path.join(input_dir, image_filename)
            try:
                # Open image
                img = Image.open(image_path)
                
                # Determine the scaling factor so that the largest dimension is 350px
                if img.width > img.height:
                    scale_factor = target_size / img.width
                else:
                    scale_factor = target_size / img.height
                
                # Calculate the new size
                new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
                
                # Resize the image while maintaining aspect ratio
                img = img.resize(new_size, Image.Resampling.LANCZOS)
                
                # Apply random rotation between -20 and +20 degrees
                rotation_angle = random.uniform(-20, 20)
                img = img.rotate(rotation_angle, expand=True, fillcolor=255)
                
                # Create a blank canvas of 512x512 with white background
                canvas = Image.new('L', canvas_size, color=255)
                
                # Generate random position for the image on the canvas
                max_x_offset = canvas_size[0] - img.width
                max_y_offset = canvas_size[1] - img.height
                x_offset = random.randint(0, max_x_offset)
                y_offset = random.randint(0, max_y_offset)
                
                # Paste the image onto the canvas
                canvas.paste(img, (x_offset, y_offset), img if img.mode == 'RGBA' else None)
                
                # Save the processed image to the new folder
                output_image_path = os.path.join(output_dir, image_filename)
                canvas.save(output_image_path)
                
                print(f"Processed and saved: {output_image_path}")

                # Update the dataset with the new image path
                for entry in processed_dataset:
                    if entry['images'][0] == image_path:
                        entry['images'][0] = output_image_path
                        break

            except Exception as e:
                print(f"Failed to process {image_filename}: {e}")

    return processed_dataset

# Main execution
if __name__ == "__main__":
    base_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator"
    line_info_path = os.path.join(base_dir, "lines.txt")
    good_paragraphs_path = os.path.join(base_dir, "good_paragraphs")
    synthetic_paragraphs_path = os.path.join(base_dir, "synthetic_paragraphs")
    num_samples = 1000  # Adjust as needed

    # Generate high-quality images
    dataset = generate_synthetic_dataset(line_info_path, base_dir, good_paragraphs_path, num_samples)

    print(f"\nGenerated {len(dataset)} high-quality paragraph images.")

    # Save dataset to JSON file
    json_path = os.path.join(base_dir, 'good_paragraphs_dataset.json')
    with open(json_path, 'w') as f:
        json.dump(dataset, f, indent=4)

    print(f"\nHigh-quality dataset information has been saved to {json_path}.")

    # Process the high-quality images
    processed_dataset = process_paragraph_images(good_paragraphs_path, synthetic_paragraphs_path)

    print(f"\nProcessed {len(processed_dataset)} paragraph images.")

    # Save processed dataset to JSON file
    processed_json_path = os.path.join(base_dir, 'synthetic_paragraph_dataset.json')
    with open(processed_json_path, 'w') as f:
        json.dump(processed_dataset, f, indent=4)

    print(f"\nProcessed dataset information has been saved to {processed_json_path}.")

Loaded 2844 valid line entries.

Generated 1000 high-quality paragraph images.

High-quality dataset information has been saved to C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\good_paragraphs_dataset.json.


TypeError: process_paragraph_images() missing 1 required positional argument: 'original_dataset'

In [10]:
import os
import random
import json
from PIL import Image
import numpy as np

def load_line_info(file_path, base_dir):
    line_info = {}
    for line in open(file_path, 'r'):
        if line.startswith('#') or line.strip() == '':
            continue
        parts = line.strip().split()
        line_id = parts[0]
        transcription = ' '.join(parts[8:]).replace('|', ' ')
        form_id = line_id[:3]
        line_group = line_id[:8]
        image_filename = f"{line_id}.png"
        possible_paths = [
            os.path.join(base_dir, "lines", form_id, line_group, image_filename),
            os.path.join(base_dir, "lines", form_id, f"{line_group}-", image_filename)
        ]
        image_path = next((path for path in possible_paths if os.path.exists(path)), None)
        if image_path:
            try:
                with Image.open(image_path) as img:
                    line_info[line_id] = {
                        'transcription': transcription,
                        'bounding_box': list(map(int, parts[4:8])),
                        'image_path': image_path
                    }
            except Exception:
                pass
    return line_info

def create_paragraph_image(line_info, num_lines):
    selected_lines = random.sample(list(line_info.keys()), num_lines)
    images = []
    transcriptions = []

    for line_id in selected_lines:
        img_path = line_info[line_id]['image_path']
        img = Image.open(img_path)
        images.append(img)
        transcriptions.append(line_info[line_id]['transcription'])

    max_width = max(img.width for img in images) + 20
    total_height = sum(img.height for img in images) + 20 * (len(images) + 1)

    paragraph_image = Image.new('L', (max_width, total_height), color=255)
    y_offset = 10

    for img in images:
        x_offset = random.randint(0, 20)
        paragraph_image.paste(img, (x_offset, y_offset))
        y_offset += img.height + random.randint(5, 15)

    return paragraph_image, '\n'.join(transcriptions)

def generate_synthetic_dataset(line_info_path, base_dir, output_path, num_samples):
    line_info = load_line_info(line_info_path, base_dir)
    os.makedirs(output_path, exist_ok=True)
    dataset = []

    print(f"Loaded {len(line_info)} valid line entries.")

    for i in range(num_samples):
        num_lines = random.randint(1, 5)
        image, transcription = create_paragraph_image(line_info, num_lines)

        image_filename = f"goodparagraph{i+1}.png"
        image_path = os.path.join(output_path, image_filename)
        image.save(image_path)

        data_entry = {
            "query": "<image>what does this say?",
            "response": transcription,
            "images": [image_path]
        }
        dataset.append(data_entry)

    return dataset

def process_paragraph_images(input_dir, output_dir, canvas_size=(512, 512), target_size=350):
    os.makedirs(output_dir, exist_ok=True)

    for image_filename in os.listdir(input_dir):
        if image_filename.endswith('.png'):
            image_path = os.path.join(input_dir, image_filename)
            try:
                img = Image.open(image_path)

                if img.width > img.height:
                    scale_factor = target_size / img.width
                else:
                    scale_factor = target_size / img.height

                new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
                img = img.resize(new_size, Image.Resampling.LANCZOS)

                rotation_angle = random.uniform(-20, 20)
                img = img.rotate(rotation_angle, expand=True, fillcolor=255)

                canvas = Image.new('L', canvas_size, color=255)
                max_x_offset = canvas_size[0] - img.width
                max_y_offset = canvas_size[1] - img.height
                x_offset = random.randint(0, max_x_offset)
                y_offset = random.randint(0, max_y_offset)

                canvas.paste(img, (x_offset, y_offset))

                output_image_path = os.path.join(output_dir, image_filename)
                canvas.save(output_image_path)

                print(f"Processed and saved: {output_image_path}")
            except Exception as e:
                print(f"Failed to process {image_filename}: {e}")

# Main execution
if __name__ == "__main__":
    base_dir = "C:\\Users\\alexg\\Coding Projects\\Medical-Handwriting-Training-Generator"
    line_info_path = os.path.join(base_dir, "lines.txt")
    
    # First step: Generate high-quality images in 'good_paragraphs'
    good_paragraph_output = os.path.join(base_dir, "good_paragraphs")
    num_samples = 1000  # Adjust as needed
    good_dataset = generate_synthetic_dataset(line_info_path, base_dir, good_paragraph_output, num_samples)
    
    good_json_path = os.path.join(base_dir, 'good_paragraph_dataset.json')
    with open(good_json_path, 'w') as f:
        json.dump(good_dataset, f, indent=4)
    
    print(f"\nGenerated {len(good_dataset)} high-quality synthetic paragraph images in 'good_paragraphs'.")
    print(f"\nDataset information has been saved to {good_json_path}.")
    
    # Second step: Process images in 'good_paragraphs' to 'synthetic_paragraphs'
    synthetic_paragraph_output = os.path.join(base_dir, "synthetic_paragraphs")
    process_paragraph_images(good_paragraph_output, synthetic_paragraph_output)
    
    # Optional: Save the dataset again after processing
    synthetic_dataset = [{"images": [os.path.join(synthetic_paragraph_output, img)], "query": "<image>what does this say?"} for img in os.listdir(synthetic_paragraph_output) if img.endswith('.png')]
    synthetic_json_path = os.path.join(base_dir, 'synthetic_paragraph_dataset.json')
    with open(synthetic_json_path, 'w') as f:
        json.dump(synthetic_dataset, f, indent=4)
    
    print(f"\nProcessed images and saved final dataset to {synthetic_json_path}.")


Loaded 2844 valid line entries.

Generated 1000 high-quality synthetic paragraph images in 'good_paragraphs'.

Dataset information has been saved to C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\good_paragraph_dataset.json.
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs\goodparagraph1.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs\goodparagraph10.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs\goodparagraph100.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs\goodparagraph1000.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generator\synthetic_paragraphs\goodparagraph101.png
Processed and saved: C:\Users\alexg\Coding Projects\Medical-Handwriting-Training-Generato