# **Import Libraries**

In [None]:
import os
import json
import shutil
from tqdm import tqdm
import yaml

# **Define all the file paths**

In [None]:
# ---------------------------------------------------------------------------
# Source dataset layout
# ---------------------------------------------------------------------------
# The ZJU-Leaper folder is expected directly inside the current working
# directory.
current_dir = os.getcwd()
DATASET_ROOT = os.path.join(current_dir, "ZJU-Leaper")

# Folders holding the original images and their label files
SOURCE_IMAGES_DIR = os.path.join(DATASET_ROOT, "images")
SOURCE_LABELS_DIR = os.path.join(DATASET_ROOT, "Label")

# Name of the group JSON file that lists the file stems of each split
GROUP_JSON_FILE = "group1.json"

# ---------------------------------------------------------------------------
# Output dataset layout
# ---------------------------------------------------------------------------
# The re-organised dataset is written to a sub-folder of the source dataset.
NEW_DATASET_DIR = os.path.join(DATASET_ROOT, "fabric_dataset")

# Train split
TRAIN_IMG_DIR = os.path.join(NEW_DATASET_DIR, "images", "train")
TRAIN_LBL_DIR = os.path.join(NEW_DATASET_DIR, "labels", "train")

# Validation split
VAL_IMG_DIR = os.path.join(NEW_DATASET_DIR, "images", "val")
VAL_LBL_DIR = os.path.join(NEW_DATASET_DIR, "labels", "val")

# Dataset description file and the class-id -> class-name mapping stored in it
YAML_FILE_NAME = "fabric_data.yaml"
YAML_FILE_PATH = os.path.join(NEW_DATASET_DIR, YAML_FILE_NAME)
CLASS_NAMES = {0: "Defect"}

# Make sure every output folder exists (existing folders are left untouched).
for _new_dir in (TRAIN_IMG_DIR, TRAIN_LBL_DIR, VAL_IMG_DIR, VAL_LBL_DIR):
    os.makedirs(_new_dir, exist_ok=True)

print(f"Created directory structure at: {NEW_DATASET_DIR}")
print("Ready to proceed.")

# **Create the YAML File**

In [None]:
# Create the data dictionary for the YAML file: the absolute dataset root,
# the train/val image folder entries, and the class-id -> class-name mapping.
data_yaml = {
    'path': os.path.abspath(NEW_DATASET_DIR),
    'train': 'images/train',
    'val': 'images/val',
    'names': CLASS_NAMES,
}

# Write the dictionary
try:
    # Serialise once, before the file is opened: a serialisation failure can
    # then no longer leave a truncated YAML file behind, and the text echoed
    # below is exactly the text that was written to disk.
    yaml_text = yaml.dump(data_yaml, sort_keys=False)

    # Explicit encoding so the result does not depend on the platform default.
    with open(YAML_FILE_PATH, 'w', encoding='utf-8') as f:
        f.write(yaml_text)

except (OSError, yaml.YAMLError) as e:
    # Only the failures this step can actually produce are reported here;
    # anything else is a programming error and is allowed to surface.
    print(f"ERROR: Could not write YAML file. {e}")

else:
    print(f"Successfully created {YAML_FILE_NAME} at:")
    print(YAML_FILE_PATH)

    print("\n--- YAML File Content ---")
    print(yaml_text)

# **Data Splitting**

In [None]:
def copy_files(file_stems, dest_img_dir, dest_lbl_dir):
    """Copy image/label pairs for the given stems into the destination folders.

    For every stem, ``<stem>.jpg`` is looked up in ``SOURCE_IMAGES_DIR`` and
    ``<stem>.txt`` in ``SOURCE_LABELS_DIR``. The pair is copied only when both
    files exist; otherwise the stem is counted as skipped and nothing is
    copied for it.

    Args:
        file_stems: Iterable of file names without extension.
        dest_img_dir: Folder that receives the image files.
        dest_lbl_dir: Folder that receives the label files.

    Returns:
        A ``(copied, skipped)`` tuple with the number of pairs copied and the
        number of stems skipped.
    """
    copied = 0
    skipped = 0

    progress = tqdm(file_stems, desc=f"Copying to {dest_img_dir}")
    for stem in progress:
        image_file = f"{stem}.jpg"
        label_file = f"{stem}.txt"

        # Where the pair comes from ...
        image_src = os.path.join(SOURCE_IMAGES_DIR, image_file)
        label_src = os.path.join(SOURCE_LABELS_DIR, label_file)

        # ... and where it goes.
        image_dst = os.path.join(dest_img_dir, image_file)
        label_dst = os.path.join(dest_lbl_dir, label_file)

        # A pair is only copied when both of its files are present.
        pair_complete = os.path.exists(image_src) and os.path.exists(label_src)
        if not pair_complete:
            skipped += 1
            continue

        shutil.copy(image_src, image_dst)
        shutil.copy(label_src, label_dst)
        copied += 1

    return copied, skipped

# Splitting
print("Starting file splitting...")
json_path = os.path.join(DATASET_ROOT, 'ImageSets', 'Groups', GROUP_JSON_FILE)

# Step 1: read the split definition. Only failures that belong to reading or
# interpreting the JSON file are reported with the "could not read" message,
# so a later copy problem is never mis-attributed to the JSON file.
try:
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Get the 'train' files (normal + defect)
    train_stems = data['normal']['train'] + data['defect']['train']

    # Get the 'test' files; they are used as the validation split.
    # Build a new list so the lists held inside `data` are not modified.
    val_stems = list(data['normal']['test'])
    val_stems.extend(data['defect'].get('test', []))

except (OSError, ValueError, KeyError, TypeError) as e:
    # OSError: file missing/unreadable; ValueError: invalid JSON (includes
    # json.JSONDecodeError); KeyError/TypeError: unexpected JSON structure.
    print(f"ERROR: Could not read or process {GROUP_JSON_FILE}. {e}")

else:
    print(f"Loaded {len(train_stems)} files for training.")
    print(f"Loaded {len(val_stems)} files for validation.")

    # Step 2: copy the files. File-system failures are reported separately
    # from JSON problems.
    try:
        train_copied, train_skipped = copy_files(train_stems, TRAIN_IMG_DIR, TRAIN_LBL_DIR)
        val_copied, val_skipped = copy_files(val_stems, VAL_IMG_DIR, VAL_LBL_DIR)
    except OSError as e:
        print(f"ERROR: Could not copy dataset files. {e}")
    else:
        print("\n--- Splitting Complete ---")
        print(f"Training files: {train_copied} copied, {train_skipped} skipped.")
        print(f"Validation files: {val_copied} copied, {val_skipped} skipped.")