In [1]:
import os
import shutil
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
# Root folder; every sub-directory in it is treated as one identity.
SOURCE_DIR = 'vggface2_full'
# Output folder that receives the balanced subset.
DEST_DIR = 'vggface2_raw_300_balanced'
# Maximum number of identities (classes) to keep.
NUM_CLASSES = 300
# An identity qualifies only if it holds at least this many image files.
MIN_IMAGES_PER_IDENTITY = 150
# Number of images taken from each selected identity.
IMAGES_PER_CLASS = 150

os.makedirs(DEST_DIR, exist_ok=True)

# Collect (identity, image_count) for every identity directory that has
# enough images. Only .jpg/.jpeg/.png files (case-insensitive) are counted.
qualified_ids = []
for identity in os.listdir(SOURCE_DIR):
    identity_path = os.path.join(SOURCE_DIR, identity)
    if not os.path.isdir(identity_path):
        continue
    images = [
        img for img in os.listdir(identity_path)
        if img.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    if len(images) >= MIN_IMAGES_PER_IDENTITY:
        qualified_ids.append((identity, len(images)))

# Largest identities first. os.listdir() returns entries in arbitrary order,
# so ties on image count are broken by identity name; otherwise the set of
# identities that makes the NUM_CLASSES cut could change from run to run.
qualified_ids.sort(key=lambda x: (-x[1], x[0]))
selected_ids = qualified_ids[:NUM_CLASSES]

print(f"✅ Found {len(selected_ids)} identities with ≥{MIN_IMAGES_PER_IDENTITY} images.")
if len(selected_ids) < NUM_CLASSES:
    # Make a short dataset visible instead of silently producing fewer classes.
    print(f"WARNING: only {len(selected_ids)} of the requested {NUM_CLASSES} identities qualify.")

# Copy a fixed-size sample of images for every selected identity into
# DEST_DIR/<identity>/, keeping the original file names.
for identity, _ in tqdm(selected_ids, desc="Copying identities"):
    src = os.path.join(SOURCE_DIR, identity)
    dst = os.path.join(DEST_DIR, identity)
    os.makedirs(dst, exist_ok=True)

    # os.listdir() returns entries in arbitrary order, so sort the file names
    # before slicing; this makes the chosen subset reproducible across runs
    # and filesystems.
    images = sorted(
        img for img in os.listdir(src)
        if img.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    for img in images[:IMAGES_PER_CLASS]:
        shutil.copy2(os.path.join(src, img), os.path.join(dst, img))

# Optional: save identity info to file.
# One "identity,count" line per selected identity; `count` is the number of
# images found in the source folder, not the number copied.
with open("top_300_identities.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{identity},{count}\n" for identity, count in selected_ids)

# Report the number of identities actually selected rather than a hard-coded
# 300, which would be wrong whenever fewer identities qualified.
print(f"✅ Done: Copied {len(selected_ids)} balanced identities to {DEST_DIR}")


✅ Found 300 identities with ≥150 images.


Copying identities: 100%|████████████████████████████████████████████████████████████| 300/300 [00:51<00:00,  5.78it/s]

✅ Done: Copied 300 balanced identities to vggface2_raw_300_balanced



