In [1]:
import os
import shutil
import random
from pathlib import Path
from typing import List
import re

In [2]:
def combine_datasets(
	real_path: str,
	generated_path: str,
	output_path: str,
	selected_generated_classes: list,
	ratio: float,
	mode: int,
	max_images_per_class: int = None,
	clean_output: bool = False
):
	"""Copy real and generated images into one class-structured output folder.

	Every sub-directory of ``real_path`` is treated as a class. For each class a
	folder of the same name is created under ``output_path`` and files are copied
	into it with their origin as a name prefix (``real_<name>`` or
	``generated_<name>``).

	Classes that are not listed in ``selected_generated_classes`` receive all of
	their real files and no generated files, regardless of ``mode``.

	For the selected classes:

	* ``mode == 1``: all real files are kept and at most
	  ``int(num_real * (1 - ratio) / ratio)`` generated files are added, so that
	  ``ratio`` is the targeted share of real files.
	* ``mode == 2``: at most ``int(max_images_per_class * ratio)`` real files and
	  at most ``max_images_per_class`` minus that number of generated files are
	  kept. A shortage on one side is not compensated by the other side.

	Files are taken in sorted name order so that repeated runs pick the same
	subset. Only regular files are copied; nested directories are ignored.

	Args:
		real_path: Folder containing one sub-folder per class with real files.
		generated_path: Folder with the same class layout holding generated
			files. A missing class folder simply contributes no files.
		output_path: Destination folder (created if needed).
		selected_generated_classes: Class names that should receive generated
			files.
		ratio: Targeted share of real files; must be in ``(0, 1]`` for mode 1
			and in ``[0, 1]`` for mode 2.
		mode: ``1`` or ``2`` (see above).
		max_images_per_class: Per-class cap used by mode 2.
		clean_output: When true, an existing ``output_path`` is deleted first.

	Raises:
		ValueError: If at least one class is selected and ``mode``, ``ratio`` or
			``max_images_per_class`` is invalid. The check runs before
			``output_path`` is cleaned or written to.
	"""
	real_path = Path(real_path)
	generated_path = Path(generated_path)
	output_path = Path(output_path)

	# Sorted so the processing (and log) order does not depend on the filesystem.
	classes = sorted(d.name for d in real_path.iterdir() if d.is_dir())

	# Validate up front, and only when a selected class actually exists, so a
	# bad argument can never leave a freshly wiped or half-written output tree.
	if any(cls in selected_generated_classes for cls in classes):
		if mode == 1:
			if not 0 < ratio <= 1:
				raise ValueError("ratio must be in (0, 1] for mode 1.")
		elif mode == 2:
			if max_images_per_class is None:
				raise ValueError("max_images_per_class must be specified for mode 2.")
			if max_images_per_class < 0:
				raise ValueError("max_images_per_class must be non-negative.")
			if not 0 <= ratio <= 1:
				raise ValueError("ratio must be in [0, 1] for mode 2.")
		else:
			raise ValueError(f"Unsupported mode: {mode!r} (expected 1 or 2).")

	if clean_output and output_path.exists():
		shutil.rmtree(output_path)

	output_path.mkdir(parents=True, exist_ok=True)

	for cls in classes:
		real_cls_path = real_path / cls
		generated_cls_path = generated_path / cls
		out_cls_path = output_path / cls
		out_cls_path.mkdir(parents=True, exist_ok=True)

		# Regular files only: shutil.copy cannot copy a directory.
		real_images = sorted(p for p in real_cls_path.glob('*') if p.is_file())
		generated_images = []

		if cls in selected_generated_classes:
			if generated_cls_path.exists():
				generated_images = sorted(p for p in generated_cls_path.glob('*') if p.is_file())

			if mode == 1:
				num_generated = int(len(real_images) * (1 - ratio) / ratio)
			else:
				# mode == 2 (anything else was rejected above)
				num_real = int(max_images_per_class * ratio)
				num_generated = max_images_per_class - num_real
				real_images = real_images[:num_real]
			generated_images = generated_images[:num_generated]

		selected_images = [(p, 'real') for p in real_images] + [(p, 'generated') for p in generated_images]

		for img_path, origin in selected_images:
			# The origin prefix keeps same-named real/generated files apart.
			target_path = out_cls_path / f"{origin}_{img_path.name}"
			shutil.copy(img_path, target_path)

		print(f"[{cls}] -> real: {len(real_images)}, generated: {len(generated_images)}")


# Example usage:
# combine_datasets(
# 	real_path='path/to/real',
# 	generated_path='path/to/generated',
# 	output_path='path/to/output',
# 	selected_generated_classes=['cat', 'dog'],
# 	ratio=0.75,
# 	mode=1,
# 	clean_output=True
# )


In [3]:
def process_all_classes(real_root, generated_root, output_root, selected_generated_classes, ratio, mode, max_images_per_class=None, clean_output=False):
	"""Run ``combine_datasets`` for every ``<resolution>/<method>`` folder pair.

	``real_root`` is expected to contain resolution folders, each containing
	method folders. For every such pair, ``combine_datasets`` is called with
	``real_root/<resolution>/<method>`` as the real dataset, the same relative
	path under ``generated_root`` as the generated dataset and the same relative
	path under ``output_root`` as the destination.

	Resolution folders are visited in order of the first integer in their name
	(so ``4px`` comes before ``16px``); folders without a digit come last, in
	name order. Method folders are visited in name order. Entries that are not
	directories are ignored at both levels.

	Args:
		real_root: Root folder of the real datasets.
		generated_root: Root folder of the generated datasets.
		output_root: Root folder for the combined datasets.
		selected_generated_classes: Forwarded to ``combine_datasets``.
		ratio: Forwarded to ``combine_datasets``.
		mode: Forwarded to ``combine_datasets``.
		max_images_per_class: Forwarded to ``combine_datasets``.
		clean_output: Forwarded to ``combine_datasets``; when true, each
			per-method output folder is deleted before it is rebuilt.
	"""
	real_root = Path(real_root)
	generated_root = Path(generated_root)
	output_root = Path(output_root)

	def _resolution_key(folder):
		# Numeric order on the first integer in the name; names without any
		# digit sort after the numeric ones instead of raising.
		match = re.search(r'\d+', folder.name)
		return (match is None, int(match.group()) if match else 0, folder.name)

	# Filter before sorting so stray files never reach the sort key.
	resolution_folders = sorted(
		(entry for entry in real_root.iterdir() if entry.is_dir()),
		key=_resolution_key,
	)

	for resolution_folder in resolution_folders:
		generated_resolution_path = generated_root / resolution_folder.name

		method_folders = sorted(
			(entry for entry in resolution_folder.iterdir() if entry.is_dir()),
			key=lambda entry: entry.name,
		)

		for method_folder in method_folders:
			real_path = method_folder
			generated_path = generated_resolution_path / method_folder.name
			output_path = output_root / resolution_folder.name / method_folder.name

			# print the current folder being processed
			print(f"Processing: {real_path} -> {generated_path} -> {output_path}")

			combine_datasets(
				real_path=real_path,
				generated_path=generated_path,
				output_path=output_path,
				selected_generated_classes=selected_generated_classes,
				ratio=ratio,
				mode=mode,
				max_images_per_class=max_images_per_class,
				# Honour the caller's choice (a hard-coded False used to be passed).
				clean_output=clean_output
			)

In [4]:
def process_all_classes2(real_root, generated_root, output_root, selected_generated_classes, ratio, mode, max_images_per_class=None, clean_output=False):
    """Run ``combine_datasets`` once per resolution folder under ``real_root``.

    Each sub-directory of ``real_root`` is used directly as a real dataset,
    the folder of the same name under ``generated_root`` as the generated
    dataset, and the folder of the same name under ``output_root`` as the
    destination.

    Resolution folders are visited in order of the first integer in their
    name (so ``4px`` comes before ``16px``); folders without a digit come
    last, in name order. Entries that are not directories are ignored.

    Args:
        real_root: Root folder of the real datasets.
        generated_root: Root folder of the generated datasets.
        output_root: Root folder for the combined datasets.
        selected_generated_classes: Forwarded to ``combine_datasets``.
        ratio: Forwarded to ``combine_datasets``.
        mode: Forwarded to ``combine_datasets``.
        max_images_per_class: Forwarded to ``combine_datasets``.
        clean_output: Forwarded to ``combine_datasets``; when true, each
            per-resolution output folder is deleted before it is rebuilt.
    """
    real_root = Path(real_root)
    generated_root = Path(generated_root)
    output_root = Path(output_root)

    def _resolution_key(folder):
        # Numeric order on the first integer in the name; names without any
        # digit sort after the numeric ones instead of raising.
        match = re.search(r'\d+', folder.name)
        return (match is None, int(match.group()) if match else 0, folder.name)

    # Filter before sorting so stray files never reach the sort key.
    resolution_folders = sorted(
        (entry for entry in real_root.iterdir() if entry.is_dir()),
        key=_resolution_key,
    )

    for resolution_folder in resolution_folders:
        real_path = resolution_folder
        generated_path = generated_root / resolution_folder.name
        output_path = output_root / resolution_folder.name

        # print the current folder being processed
        print(f"Processing: {real_path} -> {generated_path} -> {output_path}")

        combine_datasets(
            real_path=real_path,
            generated_path=generated_path,
            output_path=output_path,
            selected_generated_classes=selected_generated_classes,
            ratio=ratio,
            mode=mode,
            max_images_per_class=max_images_per_class,
            # Honour the caller's choice (a hard-coded False used to be passed).
            clean_output=clean_output
        )


In [10]:
# Relative folder names handed to process_all_classes below as the real,
# generated and output roots, in that order.
real_path, generated_path, output_path = (
    'train-raw/arrays',
    'test-raw/arrays',
    'arrays',
)

In [12]:
# Combine the datasets for every resolution/method folder, with all five
# classes selected to receive generated images.
process_all_classes(
    real_root=real_path,
    generated_root=generated_path,
    output_root=output_path,
    selected_generated_classes=['_M_', 'East-African Indian', 'Indo-Oceanic', 'East Asian', 'Euro-American'],
    ratio=0.5,
    mode=2,
    max_images_per_class=100000,
    clean_output=True,
)

Processing: train-raw\arrays\4px\Chargaff-Composante-Diversite -> test-raw\arrays\4px\Chargaff-Composante-Diversite -> arrays\4px\Chargaff-Composante-Diversite
[East Asian] -> real: 1485, generated: 166
[East-African Indian] -> real: 240, generated: 27
[Euro-American] -> real: 3967, generated: 440
[Indo-Oceanic] -> real: 233, generated: 26
[_M_] -> real: 167, generated: 20
Processing: train-raw\arrays\4px\Chargaff-Composante-NucleScore -> test-raw\arrays\4px\Chargaff-Composante-NucleScore -> arrays\4px\Chargaff-Composante-NucleScore
[East Asian] -> real: 1485, generated: 166
[East-African Indian] -> real: 240, generated: 27
[Euro-American] -> real: 3967, generated: 440
[Indo-Oceanic] -> real: 233, generated: 26
[_M_] -> real: 167, generated: 20
Processing: train-raw\arrays\4px\Chargaff-Diversite-NucleScore -> test-raw\arrays\4px\Chargaff-Diversite-NucleScore -> arrays\4px\Chargaff-Diversite-NucleScore
[East Asian] -> real: 1485, generated: 166
[East-African Indian] -> real: 240, genera