In [1]:
!pip install optuna



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.optim as optim
import optuna

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
data = pd.read_csv("fmnist_small.csv")
data

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,9,0,0,0,0,0,0,0,0,0,...,0,7,0,50,205,196,213,165,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,142,142,142,21,0,3,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,213,203,174,151,188,10,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,1,0,0,0,0,0,0,0,0,0,...,69,12,0,0,0,0,0,0,0,0
5996,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5997,8,0,0,0,0,0,0,0,0,0,...,39,47,2,0,0,29,0,0,0,0
5998,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
x = data.drop('label', axis = 1).to_numpy()
y = data['label'].to_numpy()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

In [7]:
class CustomDataset(Dataset):
  def __init__(self, features, labels):
    self.features = torch.tensor(features, dtype = torch.float32).reshape(-1, 1,28, 28)
    self.labels = torch.tensor(labels, dtype = torch.long)

  def __len__(self):
    return len(self.features)

  def __getitem__(self, index):
          return self.features[index], self.labels[index]

In [8]:
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

In [9]:
train_dataloader  = DataLoader(train_dataset, batch_size = 128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle = False)

In [10]:
class MyNN(nn.Module):
    def __init__(self, input_features):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(input_features, 32, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*7*7, 128),
            nn.ReLU(),
            nn.Dropout(p=0.4),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.4),

            nn.Linear(64, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)

        return x

In [11]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    epochs = trial.suggest_int("epochs", 10, 100)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    model = MyNN(1).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(epochs):
        model.train()
        for batch_features, batch_labels in train_dataloader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_labels in test_dataloader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    accuracy = correct / total
    return accuracy

In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

best_params = study.best_trial.params
print("\nBest Hyperparameters Found:")
print(best_params)

[I 2025-06-05 11:02:19,816] A new study created in memory with name: no-name-3105a0c2-271a-48f3-9b01-35de7961ad3d
[I 2025-06-05 11:02:32,399] Trial 0 finished with value: 0.7433333333333333 and parameters: {'learning_rate': 0.0003491347994982412, 'epochs': 70, 'weight_decay': 5.65880454557466e-06}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-06-05 11:02:43,275] Trial 1 finished with value: 0.8608333333333333 and parameters: {'learning_rate': 0.057028576999005015, 'epochs': 71, 'weight_decay': 1.0894690937307742e-05}. Best is trial 1 with value: 0.8608333333333333.
[I 2025-06-05 11:02:56,080] Trial 2 finished with value: 0.8733333333333333 and parameters: {'learning_rate': 0.017615255223034358, 'epochs': 85, 'weight_decay': 3.5734258701592066e-06}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-06-05 11:03:00,102] Trial 3 finished with value: 0.7633333333333333 and parameters: {'learning_rate': 0.0012160624465077027, 'epochs': 26, 'weight_decay': 2.4088328997211767e


Best Hyperparameters Found:
{'learning_rate': 0.007222050131960295, 'epochs': 72, 'weight_decay': 1.3345998916176452e-06}


In [13]:
model = MyNN(1)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=best_params["learning_rate"], weight_decay=best_params["weight_decay"])
epochs = best_params["epochs"]

In [14]:
# training

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_features, batch_labels in train_dataloader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader):.4f}")

Epoch 1/72, Loss: 2.0400
Epoch 2/72, Loss: 1.5526
Epoch 3/72, Loss: 1.2761
Epoch 4/72, Loss: 1.0961
Epoch 5/72, Loss: 0.9838
Epoch 6/72, Loss: 0.9051
Epoch 7/72, Loss: 0.8584
Epoch 8/72, Loss: 0.7811
Epoch 9/72, Loss: 0.7606
Epoch 10/72, Loss: 0.7160
Epoch 11/72, Loss: 0.6866
Epoch 12/72, Loss: 0.6544
Epoch 13/72, Loss: 0.6344
Epoch 14/72, Loss: 0.6124
Epoch 15/72, Loss: 0.6055
Epoch 16/72, Loss: 0.5607
Epoch 17/72, Loss: 0.5625
Epoch 18/72, Loss: 0.5371
Epoch 19/72, Loss: 0.5188
Epoch 20/72, Loss: 0.5093
Epoch 21/72, Loss: 0.5012
Epoch 22/72, Loss: 0.4845
Epoch 23/72, Loss: 0.4701
Epoch 24/72, Loss: 0.4564
Epoch 25/72, Loss: 0.4470
Epoch 26/72, Loss: 0.4333
Epoch 27/72, Loss: 0.4327
Epoch 28/72, Loss: 0.4141
Epoch 29/72, Loss: 0.4128
Epoch 30/72, Loss: 0.3945
Epoch 31/72, Loss: 0.3687
Epoch 32/72, Loss: 0.3664
Epoch 33/72, Loss: 0.3647
Epoch 34/72, Loss: 0.3584
Epoch 35/72, Loss: 0.3391
Epoch 36/72, Loss: 0.3404
Epoch 37/72, Loss: 0.3262
Epoch 38/72, Loss: 0.3128
Epoch 39/72, Loss: 0.

In [15]:
model.eval()

MyNN(
  (features): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (5): ReLU()
    (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3136, out_features=128, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.4, inplace=False)
    (7): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [16]:
# evaluation on test data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_dataloader:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.8625


In [17]:
# evaluation on training data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in train_dataloader:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.9875
