# Libraries

In [15]:
# Standard Library Imports
import os
import sys
import json
import ssl
import shutil
from pathlib import Path
from collections import defaultdict
from random import sample
from typing import Any

ssl._create_default_https_context = ssl._create_unverified_context  # Ignore SSL certificate verification

# Data Science Libraries
import numpy as np
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots, show
import seaborn as sns

# Deep Learning Libraries - TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras import Sequential, Input, layers, Model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dropout, Dense,
    BatchNormalization, RandomFlip, RandomRotation,
    RandomZoom, Rescaling
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras import models
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.applications import EfficientNetB3

# Alternative: Standalone Keras Imports
from keras import Model, Sequential, Input
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, LeakyReLU
from keras.optimizers import SGD
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy, AUC, F1Score
from keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler
from keras.utils import to_categorical
from keras.datasets.cifar10 import load_data as load_cifar10
from keras_tqdm import TQDMCallback


# Deep Learning Libraries - PyTorch and Torchvision
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim



# Import file csv (from Data No Noise)

In [16]:
df = pd.read_csv(r'/Users/ricardokayseller/Desktop/filtered_dataset.csv', sep= ',')

In [17]:
df.head(5)

Unnamed: 0,rare_species_id,eol_content_id,eol_page_id,kingdom,phylum,family,file_path,full_path,label
0,75fd91cb-2881-41cd-88e6-de451e8b60e2,12853737,449393,animalia,mollusca,unionidae,mollusca_unionidae/12853737_449393_eol-full-si...,/Users/ricardokayseller/Desktop/DATA2/mollusca...,unionidae
1,28c508bc-63ff-4e60-9c8f-1934367e1528,20969394,793083,animalia,chordata,geoemydidae,chordata_geoemydidae/20969394_793083_eol-full-...,/Users/ricardokayseller/Desktop/DATA2/chordata...,geoemydidae
2,00372441-588c-4af8-9665-29bee20822c0,28895411,319982,animalia,chordata,cryptobranchidae,chordata_cryptobranchidae/28895411_319982_eol-...,/Users/ricardokayseller/Desktop/DATA2/chordata...,cryptobranchidae
3,94004bff-3a33-4758-8125-bf72e6e57eab,21252576,7250886,animalia,chordata,indriidae,chordata_indriidae/21252576_7250886_eol-full-s...,/Users/ricardokayseller/Desktop/DATA2/chordata...,indriidae
4,dc48f2ce-4feb-4ef7-b2a2-c3c3f42bf19b,28657539,491832,animalia,arthropoda,formicidae,arthropoda_formicidae/28657539_491832_eol-full...,/Users/ricardokayseller/Desktop/DATA2/arthropo...,formicidae


# 1. Update Path

In [18]:
# Update 'full_path' to point to the new directory
df['full_path'] = df['full_path'].str.replace(
    '/Users/ricardokayseller/Desktop/DATA2', 
    '/Users/ricardokayseller/Desktop/DATA_NO_NOISE_RESIZED_AUGMENTATION'
)

In [19]:
df.head(3)

Unnamed: 0,rare_species_id,eol_content_id,eol_page_id,kingdom,phylum,family,file_path,full_path,label
0,75fd91cb-2881-41cd-88e6-de451e8b60e2,12853737,449393,animalia,mollusca,unionidae,mollusca_unionidae/12853737_449393_eol-full-si...,/Users/ricardokayseller/Desktop/DATA_NO_NOISE_...,unionidae
1,28c508bc-63ff-4e60-9c8f-1934367e1528,20969394,793083,animalia,chordata,geoemydidae,chordata_geoemydidae/20969394_793083_eol-full-...,/Users/ricardokayseller/Desktop/DATA_NO_NOISE_...,geoemydidae
2,00372441-588c-4af8-9665-29bee20822c0,28895411,319982,animalia,chordata,cryptobranchidae,chordata_cryptobranchidae/28895411_319982_eol-...,/Users/ricardokayseller/Desktop/DATA_NO_NOISE_...,cryptobranchidae


# 2. Check images in the path

In [20]:
# Path with images
DATA_DIR = "/Users/ricardokayseller/Desktop/DATA_NO_NOISE_RESIZED_AUGMENTATION"

def check_images_existence(df, data_dir):
    missing_files = []

    for path in df['full_path']:
        full_path = os.path.join(data_dir, path)
        if not os.path.exists(full_path):
            missing_files.append(full_path)

    if missing_files:
        print(f"\n {len(missing_files)} no files:\n")
        for file in missing_files:
            print(file)
    else:
        print("\n All images exist in path!")

check_images_existence(df, DATA_DIR)



 All images exist in path!


# 3. Verifying the Target Distribution

In [21]:
# Check the distribution of classes (families) in the dataset
family_counts = df['family'].value_counts()

print(family_counts)


family
dactyloidae        282
salamandridae      256
formicidae         249
cercopithecidae    242
carcharhinidae     242
                  ... 
glareolidae         20
pardalotidae        19
paradisaeidae       19
pristidae           19
scincidae           14
Name: count, Length: 202, dtype: int64


# 4. Data Augmentation for Minor Families

In [12]:
# CONFIG 
DATA_DIR = "/Users/ricardokayseller/Desktop/DATA_NO_NOISE_RESIZED"

# Fill with Mean
TARGET_MEAN = 52

#  Define Augmentations 
augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=25),
    transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02),
])

# Augment Each Minor Family 
print(" Starting augmentation to complete families...")

for family_folder in tqdm(os.listdir(DATA_DIR)):
    family_path = os.path.join(DATA_DIR, family_folder)
    
    if not os.path.isdir(family_path):
        continue

    images = [f for f in os.listdir(family_path) if f.lower().endswith(('jpg', 'jpeg', 'png'))]
    current_count = len(images)

    if current_count >= TARGET_MEAN:
        continue  # Skip major families

    missing = TARGET_MEAN - current_count

    print(f" Family {family_folder}: {current_count} images -> augmenting {missing} new images.")

    for i in range(missing):
        img_name = random.choice(images)
        img_path = os.path.join(family_path, img_name)

        try:
            img = Image.open(img_path).convert('RGB')
            aug_img = augmentation(img)

            new_filename = f"aug_{i}.jpg"
            aug_img.save(os.path.join(family_path, new_filename))

        except Exception as e:
            print(f" Error processing {img_path}: {e}")

print(" Augmentation completed for all minor families!")


 Starting augmentation to complete families...


  0%|▏                                          | 1/203 [00:00<00:25,  7.92it/s]

 Family chordata_balaenidae: 29 images -> augmenting 23 new images.
 Family chordata_pleuronectidae: 27 images -> augmenting 25 new images.


  2%|▊                                          | 4/203 [00:00<00:13, 14.77it/s]

 Family chordata_goodeidae: 30 images -> augmenting 22 new images.
 Family chordata_cervidae: 30 images -> augmenting 22 new images.
 Family arthropoda_papilionidae: 23 images -> augmenting 29 new images.


  6%|██▋                                       | 13/203 [00:00<00:08, 21.21it/s]

 Family chordata_dasypodidae: 29 images -> augmenting 23 new images.
 Family chordata_turdidae: 45 images -> augmenting 7 new images.
 Family chordata_recurvirostridae: 30 images -> augmenting 22 new images.
 Family chordata_tropiduridae: 30 images -> augmenting 22 new images.


  8%|███▎                                      | 16/203 [00:00<00:10, 17.45it/s]

 Family chordata_vombatidae: 28 images -> augmenting 24 new images.
 Family cnidaria_dendrophylliidae: 30 images -> augmenting 22 new images.
 Family chordata_carettochelyidae: 29 images -> augmenting 23 new images.


 10%|████▎                                     | 21/203 [00:01<00:10, 17.70it/s]

 Family chordata_soricidae: 30 images -> augmenting 22 new images.
 Family cnidaria_faviidae: 51 images -> augmenting 1 new images.
 Family chordata_balaenicipitidae: 24 images -> augmenting 28 new images.
 Family chordata_strigopidae: 24 images -> augmenting 28 new images.


 11%|████▊                                     | 23/203 [00:01<00:11, 15.63it/s]

 Family chordata_gliridae: 30 images -> augmenting 22 new images.
 Family chordata_daubentoniidae: 28 images -> augmenting 24 new images.
 Family chordata_paradisaeidae: 19 images -> augmenting 33 new images.


 14%|██████                                    | 29/203 [00:01<00:10, 16.35it/s]

 Family arthropoda_pseudophasmatidae: 28 images -> augmenting 24 new images.
 Family chordata_parulidae: 28 images -> augmenting 24 new images.
 Family chordata_vireonidae: 29 images -> augmenting 23 new images.


 20%|████████▍                                 | 41/203 [00:01<00:05, 27.64it/s]

 Family cnidaria_fungiidae: 30 images -> augmenting 22 new images.
 Family chordata_scolopacidae: 42 images -> augmenting 10 new images.
 Family chordata_equidae: 29 images -> augmenting 23 new images.
 Family chordata_phasianidae: 28 images -> augmenting 24 new images.


 22%|█████████                                 | 44/203 [00:02<00:07, 20.53it/s]

 Family chordata_alcedinidae: 21 images -> augmenting 31 new images.
 Family arthropoda_cerambycidae: 25 images -> augmenting 27 new images.
 Family cnidaria_meandrinidae: 24 images -> augmenting 28 new images.


 23%|█████████▋                                | 47/203 [00:02<00:09, 16.75it/s]

 Family chordata_mimidae: 24 images -> augmenting 28 new images.
 Family chordata_lamnidae: 23 images -> augmenting 29 new images.
 Family chordata_otididae: 30 images -> augmenting 22 new images.


 25%|██████████▌                               | 51/203 [00:02<00:08, 18.35it/s]

 Family chordata_vangidae: 22 images -> augmenting 30 new images.
 Family chordata_cryptobranchidae: 48 images -> augmenting 4 new images.
 Family chordata_salmonidae: 28 images -> augmenting 24 new images.
 Family chordata_rallidae: 27 images -> augmenting 25 new images.


 28%|███████████▌                              | 56/203 [00:03<00:08, 16.46it/s]

 Family chordata_columbidae: 25 images -> augmenting 27 new images.
 Family chordata_gekkonidae: 29 images -> augmenting 23 new images.
 Family arthropoda_coenagrionidae: 30 images -> augmenting 22 new images.


 29%|████████████                              | 58/203 [00:03<00:09, 15.88it/s]

 Family chordata_latimeriidae: 30 images -> augmenting 22 new images.
 Family chordata_pontoporiidae: 25 images -> augmenting 27 new images.
 Family chordata_polyprionidae: 28 images -> augmenting 24 new images.


 31%|█████████████                             | 63/203 [00:03<00:09, 15.29it/s]

 Family chordata_squalidae: 26 images -> augmenting 26 new images.
 Family chordata_caprimulgidae: 27 images -> augmenting 25 new images.
 Family chordata_hominidae: 25 images -> augmenting 27 new images.


 33%|█████████████▊                            | 67/203 [00:03<00:07, 17.70it/s]

 Family chordata_gymnuridae: 27 images -> augmenting 25 new images.
 Family chordata_balaenopteridae: 24 images -> augmenting 28 new images.
 Family chordata_hynobiidae: 30 images -> augmenting 22 new images.


 35%|██████████████▉                           | 72/203 [00:04<00:08, 15.51it/s]

 Family chordata_pristidae: 19 images -> augmenting 33 new images.
 Family chordata_cuculidae: 27 images -> augmenting 25 new images.
 Family cnidaria_siderastreidae: 30 images -> augmenting 22 new images.


 36%|███████████████▎                          | 74/203 [00:04<00:08, 14.88it/s]

 Family mollusca_conidae: 30 images -> augmenting 22 new images.
 Family mollusca_zonitidae: 29 images -> augmenting 23 new images.
 Family chordata_merlucciidae: 23 images -> augmenting 29 new images.


 42%|█████████████████▌                        | 85/203 [00:04<00:05, 23.55it/s]

 Family chordata_squatinidae: 25 images -> augmenting 27 new images.
 Family chordata_bombycillidae: 21 images -> augmenting 31 new images.
 Family chordata_phyllostomidae: 27 images -> augmenting 25 new images.
 Family chordata_scincidae: 14 images -> augmenting 38 new images.
 Family chordata_pittidae: 27 images -> augmenting 25 new images.


 45%|██████████████████▊                       | 91/203 [00:05<00:06, 17.98it/s]

 Family chordata_muscicapidae: 29 images -> augmenting 23 new images.
 Family chordata_hemiscylliidae: 26 images -> augmenting 26 new images.
 Family chordata_aotidae: 23 images -> augmenting 29 new images.


 46%|███████████████████▏                      | 93/203 [00:05<00:06, 16.02it/s]

 Family arthropoda_tettigoniidae: 29 images -> augmenting 23 new images.
 Family chordata_cheirogaleidae: 30 images -> augmenting 22 new images.
 Family chordata_scombridae: 27 images -> augmenting 25 new images.


 48%|████████████████████                      | 97/203 [00:05<00:06, 15.60it/s]

 Family chordata_pardalotidae: 19 images -> augmenting 33 new images.
 Family arthropoda_apidae: 28 images -> augmenting 24 new images.
 Family arthropoda_lucanidae: 25 images -> augmenting 27 new images.


 50%|████████████████████▌                    | 102/203 [00:05<00:06, 15.36it/s]

 Family chordata_sciuridae: 30 images -> augmenting 22 new images.
 Family chordata_hyaenidae: 28 images -> augmenting 24 new images.
 Family arthropoda_nymphalidae: 27 images -> augmenting 25 new images.


 53%|█████████████████████▌                   | 107/203 [00:06<00:05, 16.87it/s]

 Family chordata_brachypteraciidae: 29 images -> augmenting 23 new images.
 Family chordata_somniosidae: 21 images -> augmenting 31 new images.
 Family chordata_strigidae: 26 images -> augmenting 26 new images.


 57%|███████████████████████▍                 | 116/203 [00:06<00:03, 23.54it/s]

 Family chordata_cetorhinidae: 24 images -> augmenting 28 new images.
 Family chordata_falconidae: 44 images -> augmenting 8 new images.
 Family cnidaria_lobophylliidae: 30 images -> augmenting 22 new images.
 Family chordata_dasyuridae: 27 images -> augmenting 25 new images.


 59%|████████████████████████                 | 119/203 [00:06<00:04, 17.71it/s]

 Family chordata_viperidae: 29 images -> augmenting 23 new images.
 Family chordata_indriidae: 30 images -> augmenting 22 new images.
 Family chordata_motacillidae: 28 images -> augmenting 24 new images.


 60%|████████████████████████▋                | 122/203 [00:06<00:04, 17.19it/s]

 Family arthropoda_palinuridae: 27 images -> augmenting 25 new images.
 Family arthropoda_theraphosidae: 29 images -> augmenting 23 new images.
 Family chordata_giraffidae: 28 images -> augmenting 24 new images.


 65%|██████████████████████████▋              | 132/203 [00:07<00:02, 24.28it/s]

 Family chordata_ctenomyidae: 30 images -> augmenting 22 new images.
 Family arthropoda_platystictidae: 30 images -> augmenting 22 new images.
 Family chordata_callorhinchidae: 25 images -> augmenting 27 new images.


 69%|████████████████████████████▎            | 140/203 [00:07<00:02, 27.09it/s]

 Family chordata_pangasiidae: 28 images -> augmenting 24 new images.
 Family chordata_odontophoridae: 30 images -> augmenting 22 new images.
 Family chordata_gavialidae: 29 images -> augmenting 23 new images.


 70%|████████████████████████████▉            | 143/203 [00:07<00:02, 20.65it/s]

 Family chordata_siluridae: 22 images -> augmenting 30 new images.
 Family chordata_psittaculidae: 41 images -> augmenting 11 new images.
 Family chordata_estrildidae: 27 images -> augmenting 25 new images.


 72%|█████████████████████████████▋           | 147/203 [00:07<00:02, 21.27it/s]

 Family cnidaria_diploastraeidae: 30 images -> augmenting 22 new images.
 Family chordata_mesitornithidae: 26 images -> augmenting 26 new images.
 Family cnidaria_helioporidae: 30 images -> augmenting 22 new images.


 76%|███████████████████████████████          | 154/203 [00:08<00:02, 19.94it/s]

 Family chordata_elapidae: 28 images -> augmenting 24 new images.
 Family chordata_cricetidae: 28 images -> augmenting 24 new images.
 Family arthropoda_attelabidae: 29 images -> augmenting 23 new images.


 77%|███████████████████████████████▋         | 157/203 [00:08<00:02, 18.81it/s]

 Family chordata_glareolidae: 20 images -> augmenting 32 new images.
 Family chordata_balistidae: 48 images -> augmenting 4 new images.
 Family chordata_alopiidae: 47 images -> augmenting 5 new images.
 Family arthropoda_triopsidae: 30 images -> augmenting 22 new images.


 80%|████████████████████████████████▋        | 162/203 [00:08<00:02, 16.38it/s]

 Family chordata_rhinodermatidae: 28 images -> augmenting 24 new images.
 Family chordata_cacatuidae: 24 images -> augmenting 28 new images.
 Family cnidaria_pocilloporidae: 51 images -> augmenting 1 new images.
 Family chordata_dendrobatidae: 29 images -> augmenting 23 new images.


 82%|█████████████████████████████████▌       | 166/203 [00:08<00:01, 18.77it/s]

 Family chordata_cyprinodontidae: 30 images -> augmenting 22 new images.
 Family chordata_phyllomedusidae: 28 images -> augmenting 24 new images.
 Family chordata_trionychidae: 27 images -> augmenting 25 new images.


 85%|██████████████████████████████████▉      | 173/203 [00:09<00:01, 19.82it/s]

 Family chordata_arthroleptidae: 25 images -> augmenting 27 new images.
 Family mollusca_haliotidae: 28 images -> augmenting 24 new images.
 Family chordata_emydidae: 51 images -> augmenting 1 new images.
 Family chordata_ramphastidae: 29 images -> augmenting 23 new images.


 88%|███████████████████████████████████▉     | 178/203 [00:09<00:01, 22.08it/s]

 Family chordata_dalatiidae: 25 images -> augmenting 27 new images.
 Family chordata_mantellidae: 28 images -> augmenting 24 new images.
 Family chordata_percidae: 28 images -> augmenting 24 new images.


 91%|█████████████████████████████████████▏   | 184/203 [00:09<00:01, 18.79it/s]

 Family chordata_nesospingidae: 25 images -> augmenting 27 new images.
 Family chordata_chelydridae: 26 images -> augmenting 26 new images.


 93%|█████████████████████████████████████▉   | 188/203 [00:09<00:00, 19.96it/s]

 Family chordata_potoroidae: 29 images -> augmenting 23 new images.
 Family chordata_alligatoridae: 28 images -> augmenting 24 new images.
 Family chordata_podocnemididae: 30 images -> augmenting 22 new images.


 94%|██████████████████████████████████████▌  | 191/203 [00:10<00:00, 18.71it/s]

 Family chordata_sparidae: 47 images -> augmenting 5 new images.
 Family arthropoda_pisauridae: 28 images -> augmenting 24 new images.
 Family chordata_lutjanidae: 29 images -> augmenting 23 new images.
 Family chordata_phrynosomatidae: 28 images -> augmenting 24 new images.


 97%|███████████████████████████████████████▊ | 197/203 [00:10<00:00, 18.71it/s]

 Family chordata_urolophidae: 30 images -> augmenting 22 new images.
 Family chordata_megapodiidae: 24 images -> augmenting 28 new images.
 Family chordata_cheloniidae: 27 images -> augmenting 25 new images.


100%|█████████████████████████████████████████| 203/203 [00:10<00:00, 18.77it/s]

 Family chordata_albulidae: 25 images -> augmenting 27 new images.
 Family chordata_trogonidae: 27 images -> augmenting 25 new images.
 Augmentation completed for all minor families!





## 4.1 Integrating Augmented Images into the Dataset

In [22]:
# Directory path
augmentation_dir = "/Users/ricardokayseller/Desktop/DATA_NO_NOISE_RESIZED_AUGMENTATION"

# Count total number of images
total_images = 0

for family_folder in os.listdir(augmentation_dir):
    family_path = os.path.join(augmentation_dir, family_folder)
    if os.path.isdir(family_path):
        total_images += len([
            img for img in os.listdir(family_path)
            if img.lower().endswith(('.jpg', '.jpeg', '.png'))
        ])

print(f" Total number of all images (real + augmented): {total_images}")


 Total number of all images (real + augmented): 13546


## 4.2 Merge dataset with new Augmented Images

In [9]:
#  Paths 
FINAL_DATA_DIR = "/Users/ricardokayseller/Desktop/DATA_NO_NOISE_RESIZED_AUGMENTATION"

# Initialize list 
augmented_entries = []

#  Process all families 
for family_folder in tqdm(os.listdir(FINAL_DATA_DIR), desc="Processing families"):
    family_path = os.path.join(FINAL_DATA_DIR, family_folder)
    
    if os.path.isdir(family_path):
        try:
            phylum, family = family_folder.split("_", 1)
        except ValueError:
            print(f" Skipping invalid folder: {family_folder}")
            continue
        
        for img_name in os.listdir(family_path):
            img_path = os.path.join(family_path, img_name)
            if img_name.lower().endswith(('.jpg', '.jpeg', '.png')):
                
                file_path = f"{family_folder}/{img_name}"
                full_path = img_path
                label = family  # label = family name
                
                # Identify source
                if img_name.startswith("aug_"):
                    source = "augmentation"
                else:
                    source = "real"

                augmented_entries.append({
                    "rare_species_id": None,
                    "eol_content_id": None,
                    "eol_page_id": None,
                    "kingdom": "animalia",
                    "phylum": phylum,
                    "family": family,
                    "file_path": file_path,
                    "full_path": full_path,
                    "source": source,
                    "label": label
                })

# Create final DataFrame 
df_full_aug = pd.DataFrame(augmented_entries)

print(f" Final df_full_aug created with {len(df_full_aug)} samples.")


Processing families: 100%|██████████████████| 203/203 [00:00<00:00, 5371.11it/s]

 Final df_full_aug created with 13546 samples.





## 4.3 Save new csv

In [23]:
# Save df_full_aug to Desktop 
output_path = "/Users/ricardokayseller/Desktop/df_full_aug.csv"

df_full_aug.to_csv(output_path, index=False)

print(f" df_full_aug saved successfully at: {output_path}")


 df_full_aug saved successfully at: /Users/ricardokayseller/Desktop/df_full_aug.csv
