## Data Pipeline
- Download dataset
- Clean data
- Annotations (if need be)
- Data split

#### 1. Load in data

In [1]:
# load libraries
import os
import shutil
import tarfile
import urllib.request
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf



In [2]:
# Local folder that receives the downloaded archives (created on first run)
DATA_DIR = "../data"
os.makedirs(DATA_DIR, exist_ok=True)

# Remote archives (hosted on vision.stanford.edu): images and annotations
IMAGES_URL = "http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar"
ANNOT_URL = "http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar"

# Where each archive is stored locally
IMAGE_TAR_PATH, ANNOT_TAR_PATH = (
    os.path.join(DATA_DIR, archive_name)
    for archive_name in ("images.tar", "annotation.tar")
)

def download(url, path):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The payload is written to ``path + ".part"`` first and only moved to
    ``path`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file at ``path`` that the
    existence check would mistake for a complete download on the next run.

    Args:
        url: Address of the file to fetch.
        path: Destination file path.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or
        ``os.replace``; the temporary ``.part`` file is removed before the
        exception propagates.
    """
    if os.path.exists(path):
        print(f"{path} already exists, skipping download.")
        return
    print(f"Downloading {url} ...")
    partial_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Only a fully transferred file is ever visible under the final name.
        os.replace(partial_path, path)
    except BaseException:
        # BaseException so a kernel interrupt (KeyboardInterrupt) also cleans up.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Done!")

# Fetch the image archive first, then the annotation archive
for archive_url, archive_path in (
    (IMAGES_URL, IMAGE_TAR_PATH),
    (ANNOT_URL, ANNOT_TAR_PATH),
):
    download(archive_url, archive_path)

Downloading http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar ...
Done!
Downloading http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar ...
Done!


In [3]:
# extract
def extract(tar_path, extract_to):
    """Unpack the tar archive at ``tar_path`` into ``extract_to``.

    When the running Python provides tarfile extraction filters, the
    ``"data"`` filter is used so that archive members cannot write outside
    ``extract_to`` (absolute paths, ``..`` components, escaping links). On
    interpreters without filter support the archive is extracted as before.

    Args:
        tar_path: Path of the tar archive to read.
        extract_to: Directory that receives the archive contents.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (with filter
            support) if a member is rejected by the ``"data"`` filter.
        OSError: If the archive or the destination cannot be accessed.
    """
    print(f"Extracting {tar_path} ...")
    with tarfile.open(tar_path) as tar:
        if hasattr(tarfile, "data_filter"):
            # Filter support exists: refuse members that escape extract_to.
            tar.extractall(extract_to, filter="data")
        else:
            tar.extractall(extract_to)
    print("Done!")

# Unpack both archives into the data folder: images first, then annotations
for archive_path in (IMAGE_TAR_PATH, ANNOT_TAR_PATH):
    extract(archive_path, DATA_DIR)

Extracting ../data\images.tar ...
Done!
Extracting ../data\annotation.tar ...
Done!


In [4]:
# Folder expected to hold one sub-folder per breed
# (presumably created by extracting images.tar -- verify after extraction)
IMAGES_ROOT = os.path.join(DATA_DIR, "Images")

# Sanity check: every entry directly under IMAGES_ROOT is counted as a breed
breed_count = len(os.listdir(IMAGES_ROOT))
print("Number of breeds:", breed_count)

Number of breeds: 120


#### 2. Conduct Data Splits 

In [5]:
# Root folder for the train/val/test copies of the images
OUTPUT_DIR = os.path.join(DATA_DIR, "data_splits")
os.makedirs(OUTPUT_DIR, exist_ok=True)

train_dir = os.path.join(OUTPUT_DIR, "train")
val_dir = os.path.join(OUTPUT_DIR, "val")
test_dir = os.path.join(OUTPUT_DIR, "test")

# Start every run from empty split folders. Files left over from an earlier
# run with a different split would otherwise stay in place, and the same image
# could end up in more than one split (train/val/test leakage).
for d in [train_dir, val_dir, test_dir]:
    if os.path.isdir(d):
        shutil.rmtree(d)
    os.makedirs(d, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort to make the seeded
# split identical on every machine. Only directories are treated as breeds;
# a stray file here would make os.listdir(full_path) fail below.
breeds = sorted(
    entry
    for entry in os.listdir(IMAGES_ROOT)
    if os.path.isdir(os.path.join(IMAGES_ROOT, entry))
)

for breed in tqdm(breeds, desc="Splitting breeds"):
    full_path = os.path.join(IMAGES_ROOT, breed)
    # Sorted, files only: deterministic input for train_test_split, and
    # shutil.copy below cannot copy a directory.
    images = sorted(
        os.path.join(full_path, f)
        for f in os.listdir(full_path)
        if os.path.isfile(os.path.join(full_path, f))
    )

    # Split per breed: 70% train, then the remaining 30% halved into val/test
    train_imgs, temp_imgs = train_test_split(images, test_size=0.3, random_state=42)
    val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)

    # Create output folders
    for split_name, split_imgs in zip(["train", "val", "test"], [train_imgs, val_imgs, test_imgs]):
        split_folder = os.path.join(OUTPUT_DIR, split_name, breed)
        os.makedirs(split_folder, exist_ok=True)

        # Copy images
        for img in split_imgs:
            shutil.copy(img, split_folder)


Splitting breeds:   0%|          | 0/120 [00:00<?, ?it/s]

Splitting breeds: 100%|██████████| 120/120 [00:18<00:00,  6.36it/s]


#### 3. Convert to TF Dataset for efficiency

In [6]:
IMG_SIZE = 224
BATCH_SIZE = 32


def _load_split(directory):
    """Build a batched image dataset from ``directory`` with the shared settings."""
    return tf.keras.preprocessing.image_dataset_from_directory(
        directory,
        seed=42,
        image_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
    )


# Same loader settings for all three splits; loaded in train, val, test order
train_ds, val_ds, test_ds = (
    _load_split(split_dir) for split_dir in (train_dir, val_dir, test_dir)
)


Found 14357 files belonging to 120 classes.
Found 3082 files belonging to 120 classes.
Found 3141 files belonging to 120 classes.


#### 4. Cleaning/Augmentation with Keras

In [7]:
# Training-time transform: multiply pixel values by 1/255, then apply a random
# horizontal flip, rotation and zoom.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255),
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1)
])

# Validation & test only get normalization
norm = tf.keras.layers.Rescaling(1./255)
val_ds = val_ds.map(lambda x, y: (norm(x), y))
test_ds = test_ds.map(lambda x, y: (norm(x), y))

# NOTE: this cell reassigns train_ds/val_ds/test_ds from themselves, so running
# it twice without re-running the loading cell applies the rescaling twice.

# Order matters for the training pipeline: cache() replays exactly what it saw
# on the first pass, so it must come BEFORE the random augmentation. Caching
# after the map would freeze the first epoch's augmented images and every later
# epoch would see those same images again. shuffle() also follows cache() so the
# order is re-randomized on each pass. The loader already batches the data, so
# shuffle here reorders whole batches.
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .map(lambda x, y: (data_augmentation(x, training=True), y))
    .prefetch(tf.data.AUTOTUNE)
)

# Validation/test transforms are deterministic, so caching their output is safe.
val_ds = val_ds.cache().prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().prefetch(tf.data.AUTOTUNE)