In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


### 1. The "On-the-Fly" Augmentation Workflow (Summary)

**The Concept:**
Instead of creating a giant dataset of modified images *before* training, we modify the images *during* training, right before the model sees them.

**The Step-by-Step Flow:**

1.  **Storage:** Your memory (RAM) holds only the **original** clean images (e.g., 50,000 images).
2.  **The Trigger (Epoch Loop):** The training loop starts. The `DataLoader` requests a batch of 100 images.
3.  **The Fetch & Transform (CPU Work):**
    * The CPU grabs 100 original images from memory.
    * The CPU applies the `transform` pipeline to these 100 images.
    * *Crucial:* Since the transforms are **random**, the CPU might flip Image #5 this time, but rotate it next time.
4.  **The Handoff (GPU Work):** The CPU hands these 100 *newly modified* tensors to the GPU.
5.  **Training:** The GPU calculates predictions, loss, and updates weights. The modified images are then **discarded**.
6.  **The Repeat:** In the next Epoch, when the code loops back to Image #5, the CPU grabs the *original* again, applies *fresh* random transforms, and the model sees a version it has likely never seen before.

**The Math:**
* **Memory Used:** $1 \times \text{Dataset Size}$
* **Images Seen by Model:** $\text{Epochs} \times \text{Dataset Size}$

---

### 2. Why is this the Industry Standard?

You asked why "on-the-fly" is preferred over creating a static, larger dataset (e.g., saving 5 flipped versions of every image to your hard drive). There are three massive reasons:

#### A. Infinite Variety (Generalization)
If you pre-generate 5 versions of an image, your model only ever sees those 5 specific versions. It might memorize that "Rotated 10 degrees = Dog".

With on-the-fly augmentation, the model might see:
* **Epoch 1:** Rotated 2°
* **Epoch 2:** Rotated -5° + Flipped
* **Epoch 3:** Rotated 8°

**Result:** The model learns the *concept* of a "Dog" regardless of orientation, rather than memorizing specific pixel patterns. This drastically reduces **overfitting**.

#### B. Storage Constraints (The 1TB Problem)
Real-world datasets are huge.
* **ImageNet** is ~150 GB.
* If you wanted to pre-augment it 10 times, you would need **1.5 TB** of disk space.
* If you use on-the-fly augmentation, you need **0 extra bytes** of disk space.

#### C. Nearly Free Compute Time (CPU/GPU Pipelining)
You might think, *"Doesn't transforming images slow down training?"*

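**Mostly, no.** When the `DataLoader` is created with `num_workers > 0`, PyTorch loads and augments batches in background worker processes ("multiprocessing"). With the default `num_workers=0`, loading and augmentation run inside the training process itself, and the overlap described below does not happen.
1.  While the **GPU** is busy sweating over the heavy matrix multiplication for *Batch 1*...
2.  The **CPU** workers would otherwise sit idle. So, they use that "free time" to prepare and augment *Batch 2*.
3.  By the time the GPU finishes Batch 1, Batch 2 is already waiting. As long as the workers keep pace with the GPU, this makes augmentation nearly "computationally free."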

In [2]:
# import optuna
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [3]:
# Prefer the GPU when CUDA is available; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
device

device(type='cuda')

In [5]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# CSV file inside the Kaggle dataset that is loaded below.
file_path = "fashion-mnist_train.csv"

# `kagglehub.load_dataset` is the deprecated alias of `dataset_load`; the
# replacement takes the same (adapter, dataset handle, file path) arguments
# and, with the PANDAS adapter, returns a pandas DataFrame.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "zalando-research/fashionmnist",
  file_path
)

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'fashionmnist' dataset.


In [6]:
# The first column is used as the target; every remaining column is a feature.
y = df.iloc[:, 0]
x = df.iloc[:, 1:]

In [7]:
# Hold out 25% of the rows for evaluation (scikit-learn's default, made
# explicit). The fixed random_state pins the shuffle so the same rows land in
# the test set on every Restart & Run All, keeping reported metrics comparable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

In [8]:
# Scale both splits by 1/255 (assumes raw pixel values are 0-255 -- TODO confirm).
# NOTE: not idempotent -- re-running this cell divides by 255 again, so re-run
# the split cell first if this one has to be executed a second time.
xtrain, xtest = xtrain / 255, xtest / 255

In [9]:
# Random augmentations applied per sample at fetch time: a horizontal flip
# with probability 0.5, then a rotation drawn from [-10, +10] degrees.
augment_pipeline = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
])

In [10]:
class CustomDataset(Dataset):
  """In-memory image dataset with an optional per-sample transform.

  All images are converted to one float32 tensor up front; the transform (if
  any) is applied each time a sample is fetched, so random transforms produce
  a fresh variant on every access.

  Args:
    features: Array-like of pixel values accepted by `torch.tensor`, holding
      a whole number of images of `image_shape` (e.g. an `(N, 784)` array).
    labels: Array-like of integer class labels, one per image.
    transform: Optional callable applied to each image tensor in
      `__getitem__`. `None` (the default) returns images unchanged.
    image_shape: `(channels, height, width)` of one image. Defaults to
      `(1, 28, 28)`, the shape this class previously hard-coded.

  Raises:
    ValueError: If the number of images differs from the number of labels.
  """

  def __init__(self, features, labels, transform=None, image_shape=(1, 28, 28)):
    # The leading -1 asks PyTorch to infer the number of images from the total
    # element count instead of us counting them manually.
    self.features = torch.tensor(features, dtype=torch.float32).reshape(-1, *image_shape)
    self.labels = torch.tensor(labels, dtype=torch.long)

    # Fail fast: a mismatch would otherwise only surface as an IndexError deep
    # inside training, or silently leave trailing labels unused.
    if len(self.features) != len(self.labels):
      raise ValueError(
          f'features hold {len(self.features)} images but {len(self.labels)} labels were given'
      )

    self.transform = transform

  def __len__(self):
    """Return the number of images in the dataset."""
    return len(self.features)

  def __getitem__(self, idx):
    """Return `(image, label)` for `idx`, with the transform applied to the image."""
    img, label = self.features[idx], self.labels[idx]

    # Compare against None rather than relying on truthiness: a callable that
    # defines __len__ (e.g. an empty nn.Sequential) is falsy and would be skipped.
    if self.transform is not None:
      img = self.transform(img)

    return img, label

In [11]:
# Only the training set gets the random augmentation pipeline; the test set is
# served unmodified so evaluation always sees the same images.
traindataset = CustomDataset(
    features=xtrain.values,
    labels=ytrain.values,
    transform=augment_pipeline,
)
testdataset = CustomDataset(features=xtest.values, labels=ytest.values)

In [12]:
# Worker processes fetch and augment upcoming batches in the background while
# the main process trains on the current one. With the default num_workers=0
# all of that work runs synchronously inside the training loop instead.
# NOTE(review): on platforms that spawn workers (Windows/macOS) a dataset class
# defined inside the notebook may not be importable by them -- set this to 0 there.
NUM_WORKERS = 2

# Pinned (page-locked) host memory only speeds up host-to-GPU copies; asking
# for it on a CPU-only machine just produces a warning.
PIN_MEMORY = torch.cuda.is_available()

traindataloader = DataLoader(
    traindataset,
    batch_size=100,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    # Keep workers alive between epochs instead of re-creating them each time;
    # the flag is only valid when there is at least one worker.
    persistent_workers=NUM_WORKERS > 0,
)
# The test set has no transform, so it is cheap to serve from the main process.
# shuffle=False keeps batches in dataset order.
testdataloader = DataLoader(testdataset, batch_size=100, shuffle=False, pin_memory=PIN_MEMORY)

In [13]:
class myCNN(nn.Module):
  """Two-block CNN followed by a three-layer MLP classifier.

  Each convolutional block is Conv(3x3, 'same' padding) -> BatchNorm -> ReLU
  -> 2x2 max-pool, so the spatial size halves twice. The first linear layer
  is sized for 64 * 7 * 7 inputs, i.e. the network expects 28x28 images
  (28 -> 14 -> 7).

  Args:
    input_features: Number of channels in the input images.
    num_classes: Size of the output layer. Defaults to 10, the value that was
      previously hard-coded.
    dropout: Drop probability used by both dropout layers. Defaults to 0.4,
      the value that was previously hard-coded.
  """

  def __init__(self, input_features, num_classes=10, dropout=0.4):
    super().__init__()
    self.features = nn.Sequential(
        nn.Conv2d(input_features, 32, 3, padding='same'),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),

        nn.Conv2d(32, 64, kernel_size=3, padding='same'),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
    )

    self.classifier = nn.Sequential(
        nn.Flatten(),
        # 64 channels x 7 x 7 spatial positions left after the two pools.
        nn.Linear(64*7*7, 128),
        nn.ReLU(),
        nn.Dropout(dropout),

        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Dropout(dropout),

        # Raw logits; no softmax here.
        nn.Linear(64, num_classes)
    )

  def forward(self, x):
    """Return class logits of shape `(batch, num_classes)` for a batch of images."""
    x = self.features(x)
    x = self.classifier(x)
    return x

In [None]:
learning_rate = 0.1
epochs = 100


def train_one_epoch(model, dataloader, loss_function, optimizer, device):
  """Run one optimisation pass over `dataloader` and return the mean batch loss.

  Args:
    model: Network to train; it is switched to training mode here.
    dataloader: Iterable yielding `(features, labels)` batches.
    loss_function: Callable mapping `(predictions, labels)` to a scalar loss.
    optimizer: Optimizer that updates `model`'s parameters.
    device: Device each batch is moved to before the forward pass.

  Returns:
    The unweighted mean of the per-batch loss values.
  """
  # Set training mode explicitly so Dropout/BatchNorm behave correctly even if
  # the model was left in eval mode by an earlier evaluation.
  model.train()

  batch_losses = []
  for batch_features, batch_labels in dataloader:
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    prediction = model(batch_features)
    loss = loss_function(prediction, batch_labels)
    batch_losses.append(loss.item())

    # Clear gradients left over from the previous step before backpropagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  return np.mean(batch_losses)


model = myCNN(input_features=1)

model.to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
  epoch_loss = train_one_epoch(model, traindataloader, loss_function, optimizer, device)

  print(f'epoch {epoch} ---> loss {epoch_loss}')

epoch 0 ---> loss 0.7823931688070297
epoch 1 ---> loss 0.540128882461124
epoch 2 ---> loss 0.47762037800418006
epoch 3 ---> loss 0.43995882040924494
epoch 4 ---> loss 0.41460473120212554
epoch 5 ---> loss 0.3935819516248173
epoch 6 ---> loss 0.38472540855407716
epoch 7 ---> loss 0.3642635241150856
epoch 8 ---> loss 0.35435586922698553
epoch 9 ---> loss 0.3394855343302091
epoch 10 ---> loss 0.336194695631663
epoch 11 ---> loss 0.3266560067070855
epoch 12 ---> loss 0.3195132307211558
epoch 13 ---> loss 0.3144393637776375
epoch 14 ---> loss 0.3057967420419057
epoch 15 ---> loss 0.29839123795429867
epoch 16 ---> loss 0.2916803477538957
epoch 17 ---> loss 0.28814259697993594
epoch 18 ---> loss 0.285796532250113
epoch 19 ---> loss 0.28140907242894175
epoch 20 ---> loss 0.277144201911158
epoch 21 ---> loss 0.2710143254035049
epoch 22 ---> loss 0.26881766453385353
epoch 23 ---> loss 0.2612600562142001
epoch 24 ---> loss 0.26112108811736107
epoch 25 ---> loss 0.2589122547705968
epoch 26 ---> lo

In [None]:
# Inference mode: Dropout is disabled and BatchNorm uses its running statistics.
model.eval()

# Predict batch by batch through `testdataloader` instead of sending the whole
# test tensor through the network in one forward pass, whose activations can
# exhaust GPU memory. The loader is built with shuffle=False, so predictions
# come back in the same order as `testdataset.labels`.
batch_predictions = []
with torch.no_grad():
  for batch_features, _ in testdataloader:
    logits = model(batch_features.to(device))
    batch_predictions.append(torch.argmax(logits, dim=1).cpu())

pred = torch.cat(batch_predictions)
score = accuracy_score(testdataset.labels.cpu(), pred)

print(score)