Imports & Setup

In [4]:
import os
import pandas as pd
import cv2
import shutil
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split

# Project layout, resolved relative to the notebook's working directory
BASE_DIR = Path("..")  # one level above 'notebooks'
RAW_DATA_DIR = BASE_DIR.joinpath("data", "raw")
PROCESSED_DATA_DIR = BASE_DIR.joinpath("data", "processed")

# Make sure both data folders exist (no error if they are already there)
os.makedirs(RAW_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

print(f"Project Root: {BASE_DIR.resolve()}")

Project Root: C:\Users\yashk\Documents\Projects\RealWorldProblems\EcoVision_AI


Download Dataset

In [5]:
import os
import getpass

# 1. Setup Kaggle Credentials
# A prompt appears only for values that are not already present in the environment,
# so re-running the cell (or running on a machine with the variables exported) does not ask again.
# If you don't have them: Go to Kaggle -> Settings -> Create New Token -> Open the downloaded .json file
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = getpass.getpass("Enter Kaggle Username: ")
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass.getpass("Enter Kaggle API Key: ")

# 2. Download Dataset using official Kaggle API
# NOTE(review): this import is deliberately kept below the credential setup; the kaggle
# client appears to look for credentials as soon as it is loaded - confirm before moving it.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

dataset_name = 'viratkothari/animal10'
print(f"Downloading {dataset_name}...")

# Download and unzip directly to our raw folder
api.dataset_download_files(dataset_name, path=RAW_DATA_DIR, unzip=True)

print("Download complete!")

# 3. Locate the extracted dataset folder
# The archive unpacks into a folder such as 'Animals-10' inside raw. Nothing is renamed:
# we only record where it is. Candidates are sorted so the choice is deterministic, and
# downloaded_path is always (re)assigned here so a stale value from an earlier run is never reused.
downloaded_path = next(
    (
        item
        for item in sorted(RAW_DATA_DIR.iterdir())
        if item.is_dir() and "animal" in item.name.lower()
    ),
    None,
)
if downloaded_path is None:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise FileNotFoundError(
        f"No dataset folder containing 'animal' was found in {RAW_DATA_DIR}"
    )
print(f"Found dataset at: {downloaded_path}")

Downloading viratkothari/animal10...
Dataset URL: https://www.kaggle.com/datasets/viratkothari/animal10
Download complete!
Found dataset at: ..\data\raw\Animals-10


Cleaning & Integrity Check

In [6]:
def verify_images(data_dir, delete_corrupt=True):
    """Recursively scan ``data_dir`` and weed out unreadable image files.

    Every regular file whose extension is .jpg, .jpeg, .png or .bmp
    (case-insensitive) is checked twice: first with Pillow's ``verify()``,
    then with a full decode through ``cv2.imread``. A file failing either
    check is reported and, by default, deleted from disk.

    Args:
        data_dir: Root folder to scan (str or Path).
        delete_corrupt: When True (the default, matching the original
            behaviour) corrupt files are removed. Pass False for a dry run
            that only reports them.

    Prints one line per corrupt file and a final summary; returns None.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp'}
    corrupt_count = 0
    valid_count = 0

    # Snapshot the candidates first so deleting files cannot disturb the
    # directory walk; is_file() keeps directories with image-like names out.
    candidates = [
        path for path in Path(data_dir).rglob("*")
        if path.is_file() and path.suffix.lower() in image_suffixes
    ]

    for filepath in candidates:
        try:
            # 1. Pillow check. The context manager releases the file handle
            # even when verify() raises; a handle left open would make the
            # os.remove() below fail on Windows.
            with Image.open(filepath) as img:
                img.verify()

            # 2. OpenCV check: decodes the pixel data from a fresh read of the file.
            if cv2.imread(str(filepath)) is None:
                raise ValueError("OpenCV could not read")

            valid_count += 1

        except (IOError, SyntaxError, ValueError) as e:
            print(f"Corrupt file found: {filepath} - {e}")
            if delete_corrupt:
                os.remove(filepath)  # DELETE corrupt file
            corrupt_count += 1

    corrupt_label = "Deleted Corrupt" if delete_corrupt else "Corrupt (kept)"
    print(f"Scan Complete. Valid: {valid_count}, {corrupt_label}: {corrupt_count}")

# Run the cleaning on the downloaded dataset folder.
# NOTE: verify_images() removes every file it cannot read, so this modifies the raw download in place.
verify_images(downloaded_path)

Scan Complete. Valid: 26179, Deleted Corrupt: 0


Splitting Data (Train/Val/Test)

In [7]:
def _three_way_split(items, split_ratio, random_state=42):
    """Partition ``items`` into (train, val, test) lists according to ``split_ratio``."""
    train_frac, val_frac, test_frac = split_ratio
    holdout_frac = val_frac + test_frac

    # Nothing to hold out, or too few items for train_test_split to work with.
    if train_frac >= 1 or holdout_frac <= 0 or len(items) < 2:
        return list(items), [], []

    train_items, holdout_items = train_test_split(
        items, train_size=train_frac, random_state=random_state
    )

    if test_frac <= 0:
        return train_items, holdout_items, []
    if val_frac <= 0:
        return train_items, [], holdout_items
    if len(holdout_items) < 2:
        # A single held-out item cannot be divided further; keep it for validation.
        return train_items, holdout_items, []

    # Share of the held-out part that goes to test, e.g. 0.1 / (0.1 + 0.1) = 0.5.
    val_items, test_items = train_test_split(
        holdout_items, test_size=test_frac / holdout_frac, random_state=random_state
    )
    return train_items, val_items, test_items


def split_data(source_dir, output_dir, split_ratio=(0.8, 0.1, 0.1), random_state=42):
    """Copy a class-per-folder image dataset into train/val/test folders.

    ``source_dir`` must contain one sub-folder per class. For every class the
    images are split and copied to ``output_dir/<subset>/<class>/`` where
    subset is 'train', 'val' or 'test'. Files are copied, never moved, so the
    source stays intact. Files already present in ``output_dir`` are left in
    place (same-named files are overwritten).

    Args:
        source_dir: Folder holding the class sub-folders (str or Path).
        output_dir: Destination root (str or Path); created as needed.
        split_ratio: (train, val, test). ``train`` is the fraction used for
            training; the remainder is divided between validation and test
            in the proportion ``val : test``.
        random_state: Seed forwarded to ``train_test_split``.

    Raises:
        ValueError: If ``split_ratio`` is not three non-negative numbers with
            ``0 < train <= 1``.
    """
    if len(split_ratio) != 3 or min(split_ratio) < 0 or not 0 < split_ratio[0] <= 1:
        raise ValueError(
            "split_ratio must be (train, val, test) with 0 < train <= 1 "
            "and non-negative val/test"
        )

    source_dir = Path(source_dir)
    output_dir = Path(output_dir)
    # Same extensions the integrity check accepts, so verified files are not silently dropped.
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp'}

    classes = sorted(d.name for d in source_dir.iterdir() if d.is_dir())

    for cls in classes:
        # Get all images for this class. Sorting makes the seeded shuffle
        # reproducible: directory listing order is filesystem-dependent.
        cls_path = source_dir / cls
        images = sorted(
            f for f in cls_path.glob('*')
            if f.is_file() and f.suffix.lower() in image_suffixes
        )
        if not images:
            print(f"Skipping {cls}: no images found")
            continue

        train_imgs, val_imgs, test_imgs = _three_way_split(images, split_ratio, random_state)

        # Copy files to the new structure
        for subset, img_list in zip(['train', 'val', 'test'], [train_imgs, val_imgs, test_imgs]):
            target_dir = output_dir / subset / cls
            target_dir.mkdir(parents=True, exist_ok=True)

            for img in img_list:
                # We COPY instead of move, to keep raw data safe as backup
                shutil.copy(img, target_dir / img.name)

        print(f"Processed {cls}: {len(train_imgs)} train, {len(val_imgs)} val, {len(test_imgs)} test")

# Execute Split: copy each class folder of the downloaded dataset into
# train/val/test sub-folders under PROCESSED_DATA_DIR (data/processed)
split_data(downloaded_path, PROCESSED_DATA_DIR)

Processed butterfly: 1689 train, 211 val, 212 test
Processed cat: 1334 train, 167 val, 167 test
Processed chicken: 2478 train, 310 val, 310 test
Processed cow: 1492 train, 187 val, 187 test
Processed dog: 3890 train, 486 val, 487 test
Processed elephant: 1156 train, 145 val, 145 test
Processed horse: 2098 train, 262 val, 263 test
Processed sheep: 1456 train, 182 val, 182 test
Processed spider: 3856 train, 482 val, 483 test
Processed squirrel: 1489 train, 186 val, 187 test
