# **Build a Dataset Class for Horse Breeds**

https://www.kaggle.com/datasets/olgabelitskaya/horse-breeds

In [1]:
import kagglehub

# Fetch the latest version of the Kaggle dataset and remember where the
# downloaded files were placed; later cells search this directory for images.
path = kagglehub.dataset_download("olgabelitskaya/horse-breeds")
print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/olgabelitskaya/horse-breeds/versions/27


### Split the data into train, val, and test sets (stratified)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, glob
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader


In [3]:
# Collect every image file below the dataset root. Each pattern is searched
# recursively, so images may sit at any depth under `path`.
image_patterns = ('/**/*.jpg', '/**/*.jpeg', '/**/*.JPG', '/**/*.png')
found_images = set()
for ext in image_patterns:
    found_images.update(glob.glob(path + ext, recursive=True))

# The set drops files matched by more than one pattern (e.g. '*.jpg' and
# '*.JPG' on a case-insensitive filesystem), which would otherwise put the
# same image into several splits. Sorting gives a stable row order: glob
# returns paths in arbitrary order, and the seeded train/val/test split picks
# rows by position, so an unstable order means an unreproducible split.
all_images = sorted(found_images)

# The breed label is the name of the directory that directly contains the image.
data = [
    (img_path, os.path.basename(os.path.dirname(img_path)))
    for img_path in all_images
]

df = pd.DataFrame(data, columns=['filepath', 'label'])

In [4]:
# Map each breed name to an integer class index. The names are sorted first so
# the mapping does not depend on the order in which rows happen to appear in
# `df`; the same breed therefore gets the same index on every run, which is
# required to decode the outputs of a model that was saved and reloaded later.
label_to_idx = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
df['label_idx'] = df['label'].map(label_to_idx)

In [5]:
# Stage 1: hold out 20% of all rows, keeping the class proportions of
# `label_idx` the same in both parts.
train_df, temp_df = train_test_split(
    df,
    stratify=df['label_idx'],
    test_size=0.20,
    random_state=42,
)

# Stage 2: cut the held-out rows in half (again stratified) to obtain the
# validation and test sets, i.e. roughly an 80/10/10 split overall.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df['label_idx'],
    test_size=0.50,
    random_state=42,
)

# Report the resulting sizes as a quick sanity check.
print(f"Total images: {len(df)}")
print(f"Train size: {len(train_df)} | Val size: {len(val_df)} | Test size: {len(test_df)}")

Total images: 670
Train size: 536 | Val size: 67 | Test size: 67


### Dataset Class

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class HorseBreedDataset(Dataset):
    """Map-style dataset that serves (image, label) pairs described by a DataFrame.

    Args:
        dataframe: table with a 'filepath' column (path of the image file) and
            a 'label_idx' column (integer class index). Rows are addressed by
            position, so the DataFrame index itself does not matter.
        transform: optional callable applied to the loaded RGB PIL image; its
            return value is what the dataset yields as the image.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A tuple ``(image, label)`` where ``image`` is the RGB image after
            the optional transform and ``label`` is a 0-dim ``torch.long``
            tensor holding the class index.
        """
        # Fetch the row once instead of indexing the DataFrame per column.
        row = self.dataframe.iloc[idx]
        img_path = row['filepath']
        label = int(row['label_idx'])

        # convert() returns a new, fully loaded image, so the source file can
        # be closed right away instead of waiting for garbage collection.
        with Image.open(img_path) as source:
            image = source.convert("RGB")

        # Compare against None explicitly: a valid transform object may be
        # falsy (e.g. an empty container-style pipeline) and must still run.
        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)

### Transforms

In [None]:
import torch
from torch.utils.data import DataLoader

def calculate_stats(dataset, batch_size=64):
    """Compute the per-channel mean and standard deviation of a dataset.

    The statistics are taken over every pixel of every image (population
    standard deviation), accumulated as running sums so the whole dataset
    never has to be held in memory at once.

    Args:
        dataset: map-style dataset yielding ``(image, label)`` pairs whose
            images are tensors of identical shape with the channel as the
            first dimension (e.g. the output of ``ToTensor``).
        batch_size: number of images processed per step; only affects speed
            and memory use, not the result.

    Returns:
        ``(mean, std)``: two float32 tensors of shape ``(channels,)``.

    Raises:
        ValueError: if the dataset yields no images.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    channel_sum = None
    channel_sq_sum = None
    pixels_per_channel = 0

    for images, _ in loader:
        # (batch, channel, ...) -> (batch, channel, pixels). reshape also
        # works for non-contiguous input, and float64 keeps the running sums
        # accurate over many pixels.
        pixels = images.reshape(images.size(0), images.size(1), -1).to(torch.float64)

        if channel_sum is None:
            channel_sum = torch.zeros(pixels.size(1), dtype=torch.float64)
            channel_sq_sum = torch.zeros(pixels.size(1), dtype=torch.float64)

        channel_sum += pixels.sum(dim=(0, 2))
        channel_sq_sum += (pixels ** 2).sum(dim=(0, 2))
        pixels_per_channel += pixels.size(0) * pixels.size(2)

    if pixels_per_channel == 0:
        raise ValueError("calculate_stats() received an empty dataset")

    mean = channel_sum / pixels_per_channel
    # Var[x] = E[x^2] - E[x]^2. Averaging per-image standard deviations would
    # ignore brightness differences between images and underestimate the
    # spread. Clamp guards against tiny negative values from rounding.
    variance = (channel_sq_sum / pixels_per_channel - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()

    return mean.float(), std.float()


# Minimal pipeline used only for measuring channel statistics: bring every
# image to the same size and convert it to a tensor, without normalization.
temp_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Measure the statistics on the training split only. Including validation or
# test images would leak information about held-out data into preprocessing.
temp_dataset = HorseBreedDataset(train_df, transform=temp_transforms)
mean, std = calculate_stats(temp_dataset)

print(f"Calculated Mean: {mean}")
print(f"Calculated Std: {std}")

In [None]:
# Per-channel normalization constants shared by both pipelines (these are the
# widely used ImageNet channel statistics).
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training pipeline: fixed-size resize, random horizontal flip as the only
# augmentation, then tensor conversion and normalization.
train_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ]
)

# Evaluation pipeline: the same steps without the random flip, so validation
# and test images are processed deterministically.
val_test_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ]
)

### Create Dataloader objects

In [None]:
from torch.utils.data import DataLoader

# Wrap each split in a dataset; only the training split gets the augmenting
# transform pipeline.
train_set = HorseBreedDataset(dataframe=train_df, transform=train_transforms)
val_set = HorseBreedDataset(dataframe=val_df, transform=val_test_transforms)
test_set = HorseBreedDataset(dataframe=test_df, transform=val_test_transforms)

# Batch the datasets. Training batches are reshuffled every epoch, while the
# evaluation loaders keep a fixed order.
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_set, batch_size=32, shuffle=False)

#### Display some images

### Define Model 

### Define Loss and Optimizer

#### Build one_epoch_training function loop 

#### Build one_epoch_validation function loop 

### Combine all to train the model
It should save the best model and track the training and validation loss and accuracy.


### Test the model on the test set

### Show some predictions with the images

### Analyze the results
Is the model overfitting/underfitting?
Plot the training and validation loss/accuracy curves

### Load the model