### Generate CSVs with File Paths and Labels

In [8]:
import os
import glob
import random
import pandas as pd

def collect_image_paths(data_dir: str, extensions=('.png', '.jpg', '.bmp')) -> list:
    """Collect ``(filepath, label)`` pairs from a one-folder-per-label tree.

    Every immediate sub-directory of ``data_dir`` is treated as a label, and
    every regular file directly inside it whose name ends with one of
    ``extensions`` (compared case-insensitively) is recorded under that label.
    Non-directory entries in ``data_dir`` are ignored, and sub-directories of a
    label folder are not descended into.

    Args:
        data_dir: Root directory whose sub-directories are the class labels.
        extensions: File suffixes (including the leading dot) to accept.
            Defaults to the original hard-coded set.

    Returns:
        A list of ``(filepath, label)`` tuples, ordered by label and then by
        file path, so the result is identical across runs and platforms.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``data_dir`` cannot be read.
    """
    # str.endswith needs a tuple; lower-case once so matching is case-insensitive.
    wanted = tuple(ext.lower() for ext in extensions)

    data = []
    for label in sorted(os.listdir(data_dir)):
        label_path = os.path.join(data_dir, label)
        if not os.path.isdir(label_path):
            continue
        # Escape the directory part so labels containing glob metacharacters
        # (e.g. "cat[1]") are matched literally instead of as patterns.
        pattern = os.path.join(glob.escape(label_path), "*.*")
        # glob returns matches in arbitrary order; sort so that a seeded
        # shuffle applied to the result is actually reproducible.
        for file in sorted(glob.glob(pattern)):
            # Skip directories / broken links that merely look like images.
            if file.lower().endswith(wanted) and os.path.isfile(file):
                data.append((file, label))
    return data

# Pin the RNG state so the shuffle below (and hence the val/test partition)
# is driven by a fixed seed.
random.seed(42)

# Gather (filepath, label) pairs from each source directory.
train_data = collect_image_paths("data/train")
val_data_full = collect_image_paths("data/val")

# Randomise the validation pool in place before carving it up.
random.shuffle(val_data_full)

# The first 40% of the shuffled pool becomes validation; the rest is test.
split_idx = int(0.4 * len(val_data_full))
val_data, test_data = val_data_full[:split_idx], val_data_full[split_idx:]

# Write one CSV per split, plus the complete shuffled validation pool.
for csv_name, rows in (
    ("train.csv", train_data),
    ("val.csv", val_data),
    ("test.csv", test_data),
    ("full_val.csv", val_data_full),
):
    frame = pd.DataFrame(rows, columns=["filepath", "label"])
    frame.to_csv(csv_name, index=False)

# Report how many files landed in each split.
print("CSV files generated:")
print(f"  • train.csv: {len(train_data)} files")
print(f"  • val.csv:   {len(val_data)} files")
print(f"  • test.csv:  {len(test_data)} files")


CSV files generated:
  • train.csv: 1500 files
  • val.csv:   600 files
  • test.csv:  900 files
