# Project Assignment 4 : Image Classification System for Indian Bird Species

**Name/Group ID:** Berke Yusuf Uğurlu - Yusuf İpek (Group 6)

**Date:** 01.06.2025

**Course:** BBM 409 - Machine Learning Laboratory

## 1. Dataset Setup

The dataset for this project contains twenty-five species of birds found in India, with 1,500 samples for each species. In total, there are 37,500 images, each approximately 1 MP. The original samples are split into 1,200 images for training and 300 for validation per species. For this assignment, we further split each species' original validation set in half at random to create a new validation set and a test set, each comprising 150 samples per species. This yields an 80% training, 10% validation, and 10% test split (1,200 / 150 / 150 images per species).

**Dataset Source:** [Kaggle Indian Birds Species Image Classification Dataset](https://www.kaggle.com/datasets/ichhadhari/indian-birds/data) 

In [2]:
# General Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import cv2 # OpenCV for image processing
from PIL import Image # Pillow for image processing
import random
import shutil # For file operations

# Deep Learning Libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.applications import VGG16, ResNet50, EfficientNetB0
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Seed NumPy, TensorFlow and the stdlib RNG with the same value (42)
# so that repeated runs draw the same random numbers.
for _seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_fn(42)

# Print the versions of the main libraries in use.
for _banner, _module in (
    ("TensorFlow Version:", tf),
    ("NumPy Version:", np),
    ("OpenCV Version:", cv2),
):
    print(_banner, _module.__version__)

TensorFlow Version: 2.19.0
NumPy Version: 1.26.4
OpenCV Version: 4.11.0


In [None]:
# --- Dataset Path Configuration ---
BASE_DATA_DIR = "Birds_25"


def find_species_folders(base_dir):
    """Return the sorted names of the species folders of the dataset.

    When ``base_dir`` contains a ``train`` folder, the species are the
    sub-folders of ``base_dir/train``; listing ``base_dir`` itself would
    report the split folders (``train``, ``valid``) as if they were species.
    Otherwise the immediate sub-folders of ``base_dir`` are returned.

    Args:
        base_dir: Root directory of the downloaded dataset.

    Returns:
        A sorted list of folder names (possibly empty).

    Raises:
        OSError: If the directory cannot be listed.
    """
    train_dir = os.path.join(base_dir, "train")
    root = train_dir if os.path.isdir(train_dir) else base_dir
    return sorted(
        entry for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )


# Always defined, so later code can test it even if the dataset is missing.
species_list = []

# Check if the dataset path exists
if not os.path.exists(BASE_DATA_DIR):
    print(f"ERROR: Dataset directory not found at {BASE_DATA_DIR}")
else:
    print(f"Dataset directory found at: {BASE_DATA_DIR}")
    # Example: List species to confirm
    try:
        species_list = find_species_folders(BASE_DATA_DIR)
    except OSError as e:
        print(f"Error accessing dataset subdirectories: {e}")
    else:
        print(f"Found {len(species_list)} species (folders). First few: {species_list[:5]}")
        if not species_list:
            print("Could not automatically determine species list. Please check dataset structure.")

# Define output directories for our custom splits
OUTPUT_TRAIN_DIR = "./dataset_split/train"
OUTPUT_VAL_DIR = "./dataset_split/validation"
OUTPUT_TEST_DIR = "./dataset_split/test"

# Create these directories if they don't exist
os.makedirs(OUTPUT_TRAIN_DIR, exist_ok=True)
os.makedirs(OUTPUT_VAL_DIR, exist_ok=True)
os.makedirs(OUTPUT_TEST_DIR, exist_ok=True)

ERROR: Dataset directory not found at ./indian-birds/birds


In [None]:
# --- Data Splitting Function ---
def create_splits(base_dir, output_train_dir, output_val_dir, output_test_dir):
    """
    Splits the dataset according to the assignment's 80-10-10 ratio.
    Original structure: base_dir/train/<species>/ (expected 1200 images)
                        base_dir/valid/<species>/ (expected 300 images)
    New structure:  output_train_dir/<species>/ (all training images)
                    output_val_dir/<species>/ (first half of the shuffled valid images)
                    output_test_dir/<species>/ (remaining valid images)

    Training images are copied unchanged. For each species, the files of the
    original validation folder are sorted, shuffled with the module-level
    ``random`` generator and cut in half, so the split is 150/150 for the
    expected 300 files and still uses every file for any other count.

    Args:
        base_dir: Dataset root containing the ``train`` and ``valid`` folders.
        output_train_dir: Destination root for the training images.
        output_val_dir: Destination root for the new validation images.
        output_test_dir: Destination root for the new test images.

    Returns:
        None. Progress and problems are reported with ``print``; the function
        returns early when ``train``/``valid`` or the species folders are missing.
    """
    print("Starting dataset splitting...")
    original_train_path = os.path.join(base_dir, "train")
    original_valid_path = os.path.join(base_dir, "valid")

    if not os.path.exists(original_train_path) or not os.path.exists(original_valid_path):
        print(f"Error: Could not find 'train' or 'valid' subdirectories in {base_dir}.")
        print("Please ensure your BASE_DATA_DIR is set correctly and the dataset has the expected structure.")
        return

    all_species = sorted(
        d for d in os.listdir(original_train_path)
        if os.path.isdir(os.path.join(original_train_path, d))
    )
    if not all_species:
        print(f"No species subdirectories found in {original_train_path}. Splitting cannot proceed.")
        return

    for species_name in all_species:
        print(f"Processing species: {species_name}")

        # Create species directories in output
        species_train_out = os.path.join(output_train_dir, species_name)
        species_val_out = os.path.join(output_val_dir, species_name)
        species_test_out = os.path.join(output_test_dir, species_name)
        for out_dir in (species_train_out, species_val_out, species_test_out):
            os.makedirs(out_dir, exist_ok=True)

        # 1. Training data: copy every regular file as-is.
        species_train_path_original = os.path.join(original_train_path, species_name)
        for img_name in sorted(os.listdir(species_train_path_original)):
            src_file = os.path.join(species_train_path_original, img_name)
            if os.path.isfile(src_file):
                shutil.copy(src_file, os.path.join(species_train_out, img_name))

        # 2. Original validation data: split into new validation and test halves.
        species_valid_path_original = os.path.join(original_valid_path, species_name)
        if not os.path.isdir(species_valid_path_original):
            # A species present only in train must not abort the whole run.
            print(f"Warning: no 'valid' folder for species {species_name}; "
                  "its validation and test sets are left empty.")
            continue

        # os.listdir returns entries in arbitrary order; sorting first makes the
        # seeded shuffle produce the same split on every machine.
        original_val_images = sorted(
            f for f in os.listdir(species_valid_path_original)
            if os.path.isfile(os.path.join(species_valid_path_original, f))
        )
        random.shuffle(original_val_images)

        # Halve by the actual file count instead of assuming exactly 300 files.
        half = len(original_val_images) // 2
        new_val_set = original_val_images[:half]
        new_test_set = original_val_images[half:]

        for img_name in new_val_set:
            shutil.copy(os.path.join(species_valid_path_original, img_name),
                        os.path.join(species_val_out, img_name))

        for img_name in new_test_set:
            shutil.copy(os.path.join(species_valid_path_original, img_name),
                        os.path.join(species_test_out, img_name))

    print("Dataset splitting completed.")
    # Report the directories actually written to (the arguments, not globals).
    print(f"Training data: {output_train_dir}")
    print(f"Validation data: {output_val_dir}")
    print(f"Test data: {output_test_dir}")

# --- Run the split only when it has not been done yet ---
# Heuristic: the split counts as done when the first species folder already
# exists under OUTPUT_TRAIN_DIR. File counts are not verified, so clear the
# dataset_split folders by hand if a re-run is needed.

_train_root = os.path.join(BASE_DATA_DIR, "train")
_valid_root = os.path.join(BASE_DATA_DIR, "valid")

# Species names come from the original train folder, when it is available.
species_list_for_check = []
if os.path.exists(_train_root):
    try:
        species_list_for_check = sorted(
            [d for d in os.listdir(_train_root) if os.path.isdir(os.path.join(_train_root, d))]
        )
    except FileNotFoundError:
        print(f"Warning: Could not list species in {_train_root}. Skipping split check.")

if not species_list_for_check:
    print("Cannot determine if dataset is split as original species list could not be obtained. Please check paths.")
elif os.path.exists(os.path.join(OUTPUT_TRAIN_DIR, species_list_for_check[0])):
    print("Split dataset directories seem to exist. Skipping splitting process.")
elif all(os.path.exists(p) for p in (BASE_DATA_DIR, _train_root, _valid_root)):
    create_splits(BASE_DATA_DIR, OUTPUT_TRAIN_DIR, OUTPUT_VAL_DIR, OUTPUT_TEST_DIR)
else:
    print("Original dataset structure ('train', 'valid' folders inside BASE_DATA_DIR) not found. Skipping automatic split.")
    print(f"Please ensure your data is correctly placed in {BASE_DATA_DIR} or manually split into:")
    for _out_dir in (OUTPUT_TRAIN_DIR, OUTPUT_VAL_DIR, OUTPUT_TEST_DIR):
        print(f"  {_out_dir}")

### 1.1. Data Loading and Preprocessing Utilities
We will define functions to load image paths and labels, and a basic image preprocessing function.

In [None]:
# Default (width, height) that preprocess_image resizes to, plus channel count.
IMG_WIDTH = IMG_HEIGHT = 224
IMG_CHANNELS = 3  # color (RGB) images

def load_image_paths_and_labels(data_dir):
    """Collect image file paths and integer labels from a class-per-folder tree.

    Only sub-directories of ``data_dir`` are treated as classes, so stray
    files in the root (e.g. ``.DS_Store``) cannot become extra classes and
    shift the numeric labels. Entries are visited in sorted order because
    ``os.listdir`` gives no ordering guarantee.

    Args:
        data_dir: Directory whose sub-folders are named after the classes.

    Returns:
        A tuple ``(image_paths, numeric_labels, label_encoder, class_names)``:
        the file paths, the labels produced by ``label_encoder.transform``,
        the ``LabelEncoder`` fitted on ``class_names``, and the sorted list
        of class folder names.
    """
    class_names = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )
    label_encoder = LabelEncoder()
    label_encoder.fit(class_names)  # Fit on all class names

    image_paths = []
    labels = []
    for class_name in class_names:
        class_dir = os.path.join(data_dir, class_name)
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            # Skip nested directories; only regular files can be images.
            if os.path.isfile(img_path):
                image_paths.append(img_path)
                labels.append(class_name)

    numeric_labels = label_encoder.transform(labels)
    return image_paths, numeric_labels, label_encoder, class_names

def preprocess_image(image_path, target_size=(IMG_WIDTH, IMG_HEIGHT), normalize=True):
    """Load an image as an RGB array resized to ``target_size``.

    Args:
        image_path: Path of the image file to open.
        target_size: Size passed to ``Image.resize``; defaults to
            ``(IMG_WIDTH, IMG_HEIGHT)``.
        normalize: When true, divide the pixel values by 255.0.

    Returns:
        The image as a NumPy array, or ``None`` when the file cannot be
        opened or processed (the error is printed, not raised).
    """
    try:
        # The context manager closes the file handle even if decoding fails.
        with Image.open(image_path) as source_img:
            img = source_img.convert('RGB').resize(target_size)  # Ensure 3 channels
        img_array = np.array(img)
        if normalize:
            img_array = img_array / 255.0  # Normalize to [0, 1]
        return img_array
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop a batch.
        print(f"Error processing image {image_path}: {e}")
        return None  # Return None if an image is corrupted or cannot be processed

# Load data for traditional ML parts

print("Loading image paths and labels for train, validation, and test sets...")
try:
    X_train_paths, y_train_raw, label_encoder, class_names = load_image_paths_and_labels(OUTPUT_TRAIN_DIR)
    X_val_paths, y_val_raw, _, val_class_names = load_image_paths_and_labels(OUTPUT_VAL_DIR)
    X_test_paths, y_test_raw, _, test_class_names = load_image_paths_and_labels(OUTPUT_TEST_DIR)

    # Each split is encoded by its own encoder, so the numeric labels only
    # agree across splits when every split has the same class folders.
    for split_name, split_classes in (("validation", val_class_names), ("test", test_class_names)):
        if list(split_classes) != list(class_names):
            print(f"Warning: {split_name} classes differ from the training classes; "
                  "numeric labels may not line up across splits.")

    print(f"Training samples: {len(X_train_paths)}, Validation samples: {len(X_val_paths)}, Test samples: {len(X_test_paths)}")
    print(f"Number of classes: {len(class_names)}")
    print(f"Example class names: {class_names[:5]}")

    # Save the label encoder and class names for later use
    np.save('label_encoder_classes.npy', label_encoder.classes_)

except Exception as e:
    print(f"An error occurred during loading image paths: {e}")
    print("Please ensure the dataset has been split correctly into train, validation, and test directories.")
    X_train_paths, y_train_raw, X_val_paths, y_val_raw, X_test_paths, y_test_raw, class_names = [], [], [], [], [], [], []
    # Define the encoder too, so later cells see None instead of a NameError
    # (or a stale encoder left over from an earlier run).
    label_encoder = None


# Function to load a subset of data
def load_subset_paths(image_paths, labels, subset_fraction=0.1, random_state=42):
    """Return a stratified random subset of the paths and their labels.

    A ``subset_fraction`` of 1.0 or more returns the inputs untouched.
    Otherwise ``train_test_split`` is called with ``stratify=labels`` and
    its "test" portion (of size ``subset_fraction``) is returned as the subset.
    """
    wants_everything = subset_fraction >= 1.0
    if not wants_everything:
        # The split yields (rest_paths, subset_paths, rest_labels, subset_labels).
        split_parts = train_test_split(
            image_paths, labels,
            test_size=subset_fraction,
            random_state=random_state,
            stratify=labels,
        )
        chosen_paths, chosen_labels = split_parts[1], split_parts[3]
        print(f"Using a subset of {len(chosen_paths)} samples ({subset_fraction*100:.1f}% of original).")
        return chosen_paths, chosen_labels

    return image_paths, labels


# Development switch: when enabled, every split is shrunk to SUBSET_FRACTION.
USE_SUBSET = False  # Set to True to use a subset
SUBSET_FRACTION = 0.1  # 10% of the data

if USE_SUBSET:
    # Nothing to subsample when the training paths were not loaded.
    if X_train_paths:
        print("Loading a subset of the data for development...")
        X_train_paths, y_train_raw = load_subset_paths(
            X_train_paths, y_train_raw, SUBSET_FRACTION
        )
        X_val_paths, y_val_raw = load_subset_paths(
            X_val_paths, y_val_raw, SUBSET_FRACTION
        )
        X_test_paths, y_test_raw = load_subset_paths(
            X_test_paths, y_test_raw, SUBSET_FRACTION
        )
        print(f"Subset - Training samples: {len(X_train_paths)}, Validation samples: {len(X_val_paths)}, Test samples: {len(X_test_paths)}")