In [4]:
import os
from PIL import Image
from torchvision import transforms

# Paths
dataset_path = "/kaggle/input/anime-character-classification-2/moeimouto-faces"
output_path = "dataset_balanced/train"
os.makedirs(output_path, exist_ok=True)

# Parameters
target_size = 100     # every class will have at least this many images
img_size = (224, 224)

# Augmentation pipeline (PIL in -> PIL out)
augment = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
    transforms.ToTensor(),          # make it a Tensor
    transforms.ToPILImage()         # back to PIL for saving
])

# Loop through classes
for cls in os.listdir(dataset_path):
    class_folder = os.path.join(dataset_path, cls)
    if not os.path.isdir(class_folder):
        continue

    images = [f for f in os.listdir(class_folder) if f.lower().endswith(".png")]
    count = len(images)

    # Create output class folder
    out_folder = os.path.join(output_path, cls)
    os.makedirs(out_folder, exist_ok=True)

    # Copy original images
    for f in images:
        img_path = os.path.join(class_folder, f)
        Image.open(img_path).save(os.path.join(out_folder, f))

    # Augment if needed
    if count < target_size:
        idx = 0
        while count < target_size:
            img_path = os.path.join(class_folder, images[idx % len(images)])
            img = Image.open(img_path).convert("RGB")
            aug_img = augment(img)  # ✅ now works (PIL -> Tensor -> PIL)

            out_name = f"{cls}_aug_{count}.png"
            aug_img.save(os.path.join(out_folder, out_name))

            count += 1
            idx += 1

        print(f"Class {cls}: {len(images)} → {count} images (augmented)")
    else:
        print(f"Class {cls}: {len(images)} images (no augmentation needed)")

Class 083_shirou_kamui: 63 → 100 images (augmented)
Class 161_ryougi_shiki: 49 → 100 images (augmented)
Class 131_belldandy: 79 → 100 images (augmented)
Class 087_suzumiya_haruka: 83 → 100 images (augmented)
Class 007_nagato_yuki: 105 images (no augmentation needed)
Class 094_fuyou_kaede: 107 images (no augmentation needed)
Class 011_kirisame_marisa: 80 → 100 images (augmented)
Class 091_komaki_manaka: 99 → 100 images (augmented)
Class 153_canal_volphied: 73 → 100 images (augmented)
Class 058_kochiya_sanae: 92 → 100 images (augmented)
Class 065_sanzenin_nagi: 112 images (no augmentation needed)
Class 179_siesta: 70 → 100 images (augmented)
Class 136_shirley_fenette: 55 → 100 images (augmented)
Class 175_saotome_alto: 40 → 100 images (augmented)
Class 103_reinforce_zwei: 61 → 100 images (augmented)
Class 152_maka_albarn: 50 → 100 images (augmented)
Class 020_remilia_scarlet: 88 → 100 images (augmented)
Class 074_daidouji_tomoyo: 118 images (no augmentation needed)
Class 003_fate_testaro

In [6]:
class_counts = {}
for cls in os.listdir(dataset_path):
    cls_path = os.path.join(dataset_path, cls)
    if os.path.isdir(cls_path):
        count = len([f for f in os.listdir(cls_path) if f.lower().endswith(('.jpg','.jpeg','.png'))])
        class_counts[cls] = count

sorted_counts = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)

print("Total classes:", len(sorted_counts))
print("Total images:", sum(c for _, c in sorted_counts))
print("Average images per class:", sum(c for _, c in sorted_counts)//len(sorted_counts))

print("\nTop 10 classes by image count:")
for cls, count in sorted_counts[:10]:
    print(f"{cls}: {count}")

print("\nBottom 10 classes by image count:")
for cls, count in sorted_counts[-10:]:
    print(f"{cls}: {count}")

Total classes: 173
Total images: 14397
Average images per class: 83

Top 10 classes by image count:
037_lala_satalin_deviluke: 161
053_kousaka_tamaki: 161
042_tsukimura_mayu: 151
059_sairenji_haruna: 149
073_subaru_nakajima: 146
001_kinomoto_sakura: 141
064_amami_haruka: 135
068_miyamura_miyako: 135
112_hinamori_amu: 134
072_melon-chan: 130

Bottom 10 classes by image count:
997_ana_coppola: 44
171_ikari_shinji: 43
186_nanael: 41
175_saotome_alto: 40
078_black_rock_shooter: 40
181_allen_walker: 39
998_ito_nobue: 38
188_aika_granzchesta: 37
134_nunnally_lamperouge: 37
084_okazaki_tomoya: 32


In [7]:
print("After augmentation ")
class_counts = {}
for cls in os.listdir("/kaggle/working/dataset_balanced/train"):
    cls_path = os.path.join("/kaggle/working/dataset_balanced/train", cls)
    if os.path.isdir(cls_path):
        count = len([f for f in os.listdir(cls_path) if f.lower().endswith(('.jpg','.jpeg','.png'))])
        class_counts[cls] = count

sorted_counts = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)

print("Total classes:", len(sorted_counts))
print("Total images:", sum(c for _, c in sorted_counts))
print("Average images per class:", sum(c for _, c in sorted_counts)//len(sorted_counts))

print("\nTop 10 classes by image count:")
for cls, count in sorted_counts[:10]:
    print(f"{cls}: {count}")

print("\nBottom 10 classes by image count:")
for cls, count in sorted_counts[-10:]:
    print(f"{cls}: {count}")

After augmentation 
Total classes: 173
Total images: 18154
Average images per class: 104

Top 10 classes by image count:
037_lala_satalin_deviluke: 161
053_kousaka_tamaki: 161
042_tsukimura_mayu: 151
059_sairenji_haruna: 149
073_subaru_nakajima: 146
001_kinomoto_sakura: 141
064_amami_haruka: 135
068_miyamura_miyako: 135
112_hinamori_amu: 134
072_melon-chan: 130

Bottom 10 classes by image count:
121_arcueid_brunestud: 100
041_saigyouji_yuyuko: 100
090_minase_iori: 100
143_miura_azusa: 100
146_shinku: 100
184_suzumiya_akane: 100
012_asahina_mikuru: 100
176_sendou_erika: 100
069_hayase_mitsuki: 100
191_shidou_hikaru: 100
