In [7]:
import os
import csv
import random
from PIL import Image

# === Config ===
input_dir = "raw_data"  # Directory holding the source images

# Output destinations; every processed image is written to each of them
OUTPUT_ROOTS = ["input_data", "private_test"]

# Puzzle grid dimensions (rows x columns)
ROWS, COLS = 3, 5

# Upper bound on the number of source images that get processed
MAX_IMAGES = 100

# Pixel size every image is resized to before it is sliced
RESIZE_WIDTH, RESIZE_HEIGHT = 600, 360

def ensure_dirs():
    """Create the output directory tree under every root in OUTPUT_ROOTS."""
    # X_test receives the shuffled puzzles, Y_test_images the resized originals.
    subdirectories = ("X_test", "Y_test_images")
    for out_root in OUTPUT_ROOTS:
        for sub in subdirectories:
            os.makedirs(os.path.join(out_root, sub), exist_ok=True)

def slice_image(image_path, rows, cols):
    """Resize an image and cut it into a rows x cols grid of pieces.

    The image is converted to RGB and resized to
    (RESIZE_WIDTH, RESIZE_HEIGHT) before slicing. Piece dimensions use
    integer division, so any remainder pixels on the right/bottom edge
    are not part of any piece.

    Args:
        image_path: Path of the image file to load.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A tuple (pieces, img): pieces is a list of cropped images in
        row-major order (left to right, top to bottom), and img is the
        resized RGB image the pieces were cut from.
    """
    # Image.open is lazy and holds the file open; convert() yields an
    # independent in-memory image, so the handle can be released right away
    # instead of lingering for every processed file.
    with Image.open(image_path) as source:
        img = source.convert("RGB")
    img = img.resize((RESIZE_WIDTH, RESIZE_HEIGHT))

    width, height = img.size
    piece_width = width // cols
    piece_height = height // rows

    pieces = [
        img.crop((
            c * piece_width,
            r * piece_height,
            (c + 1) * piece_width,
            (r + 1) * piece_height,
        ))
        for r in range(rows)
        for c in range(cols)
    ]
    return pieces, img

def create_image_from_pieces(pieces, rows, cols, output_filename):
    """Paste the pieces into a rows x cols grid and save it as a JPEG.

    Pieces are placed in list order, filling the grid row by row. The size
    of the first piece is used as the cell size for the whole grid.
    """
    tile_w, tile_h = pieces[0].size
    canvas = Image.new("RGB", (tile_w * cols, tile_h * rows))
    for position, tile in enumerate(pieces):
        # Row-major placement: position -> (row, column) in the grid.
        grid_row, grid_col = divmod(position, cols)
        canvas.paste(tile, (grid_col * tile_w, grid_row * tile_h))
    canvas.save(output_filename, "JPEG", quality=95)

def process_single_image(image_path, writers_by_root):
    """Build one shuffled puzzle and write it to every output root.

    For each root in writers_by_root this saves the resized original under
    Y_test_images, the shuffled puzzle under X_test, and appends one CSV row
    holding the shuffled file name followed by the permutation.

    Args:
        image_path: Path of the source image.
        writers_by_root: dict mapping an output root to its csv writer.
    """
    stem, _ = os.path.splitext(os.path.basename(image_path))
    original_name = f"{stem}.jpg"
    shuffled_image_name = f"{stem}_shuffled.jpg"

    # Resize the image and slice it into the puzzle grid.
    pieces, resized_img = slice_image(image_path, ROWS, COLS)

    # Draw random permutations until one differs from the identity.
    identity = list(range(len(pieces)))
    permutation = list(identity)
    random.shuffle(permutation)
    while permutation == identity:
        random.shuffle(permutation)

    # Grid position i of the puzzle shows original piece permutation[i].
    shuffled_pieces = [pieces[source_index] for source_index in permutation]

    # Emit the same artefacts to every destination.
    for root in writers_by_root:
        csv_writer = writers_by_root[root]

        # Resized original image.
        resized_img.save(
            os.path.join(root, "Y_test_images", original_name),
            "JPEG",
            quality=95,
        )

        # Shuffled puzzle image.
        create_image_from_pieces(
            shuffled_pieces,
            ROWS,
            COLS,
            os.path.join(root, "X_test", shuffled_image_name),
        )

        # One CSV row: shuffled file name followed by the permutation.
        csv_writer.writerow([shuffled_image_name, *permutation])

def generate_dataset():
    """Generate the shuffled-puzzle dataset under every root in OUTPUT_ROOTS.

    Lists the .png/.jpg/.jpeg files in input_dir, keeps a random sample of at
    most MAX_IMAGES of them, and for each one writes the resized original,
    the shuffled puzzle and a row in <root>/Y_test.csv for every root.
    Prints progress to stdout and returns None; returns early when no image
    files are found.
    """
    from contextlib import ExitStack

    ensure_dirs()

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # processing order (and CSV row order) stable across runs and platforms.
    image_files = sorted(
        name for name in os.listdir(input_dir)
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    if not image_files:
        print("No images found in raw_data directory!")
        return

    # Keep a random subset when there are more images than MAX_IMAGES.
    if len(image_files) > MAX_IMAGES:
        image_files = random.sample(image_files, MAX_IMAGES)

    print(f"Selected {len(image_files)} images randomly. Processing...")

    header = ["image_filename"] + [
        f"piece_at_{r}_{c}" for r in range(ROWS) for c in range(COLS)
    ]

    # ExitStack closes every CSV file that was opened, even if opening a
    # later one or processing an image raises.
    with ExitStack() as stack:
        writers_by_root = {}
        for root in OUTPUT_ROOTS:
            csv_path = os.path.join(root, "Y_test.csv")
            # Explicit UTF-8: image file names may contain non-ASCII
            # characters that the platform default encoding cannot represent.
            csv_file = stack.enter_context(
                open(csv_path, mode="w", newline="", encoding="utf-8")
            )
            writer = csv.writer(csv_file)
            writer.writerow(header)
            writers_by_root[root] = writer

        # Process the images one by one, writing to all roots each time.
        total = len(image_files)
        for idx, filename in enumerate(image_files, start=1):
            image_path_full = os.path.join(input_dir, filename)
            print(f"[{idx}/{total}] Processing {filename}...")
            process_single_image(image_path_full, writers_by_root)

    print("\nDataset generated successfully!")
    for root in OUTPUT_ROOTS:
        print(f"- {root}/Y_test_images: {os.path.join(root, 'Y_test_images')}")
        print(f"- {root}/X_test:        {os.path.join(root, 'X_test')}")
        print(f"- {root}/Y_test.csv:    {os.path.join(root, 'Y_test.csv')}")

# Generate the dataset only when this module is run as the main program,
# not when it is imported.
if __name__ == "__main__":
    generate_dataset()


Selected 100 images randomly. Processing...
[1/100] Processing Jan_van_Eyck_30.jpg...
[2/100] Processing Joan_Miro_58.jpg...
[3/100] Processing Andy_Warhol_88.jpg...
[4/100] Processing Alfred_Sisley_188.jpg...
[5/100] Processing Edgar_Degas_625.jpg...
[6/100] Processing El_Greco_2.jpg...
[7/100] Processing Francisco_Goya_141.jpg...
[8/100] Processing Giotto_di_Bondone_43.jpg...
[9/100] Processing Henri_de_Toulouse-Lautrec_81.jpg...
[10/100] Processing Frida_Kahlo_7.jpg...
[11/100] Processing Edgar_Degas_477.jpg...
[12/100] Processing Frida_Kahlo_43.jpg...
[13/100] Processing Gustav_Klimt_104.jpg...
[14/100] Processing Edgar_Degas_327.jpg...
[15/100] Processing Henri_Matisse_64.jpg...
[16/100] Processing El_Greco_25.jpg...
[17/100] Processing Joan_Miro_7.jpg...
[18/100] Processing Gustav_Klimt_51.jpg...
[19/100] Processing Hieronymus_Bosch_74.jpg...
[20/100] Processing Henri_de_Toulouse-Lautrec_26.jpg...
[21/100] Processing Diego_Velazquez_113.jpg...
[22/100] Processing Camille_Pissarro