<a href="https://colab.research.google.com/github/amgothhrithik/Conv-Neutral-Network/blob/main/Fashion_MNIST_Fine_Tuning_Hyperparameters_using_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device:{device}")

# Fetch the Fashion-MNIST archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured in this runtime — confirm.
!kaggle datasets download -d zalando-research/fashionmnist

using device:cuda
Dataset URL: https://www.kaggle.com/datasets/zalando-research/fashionmnist
License(s): other
Downloading fashionmnist.zip to /content
 92% 63.0M/68.8M [00:01<00:00, 74.5MB/s]
100% 68.8M/68.8M [00:01<00:00, 69.5MB/s]


In [None]:
!unzip fashionmnist.zip
import pandas as pd

# Load training data
train_df = pd.read_csv('fashion-mnist_train.csv')
# Load testing data
test_df = pd.read_csv('fashion-mnist_test.csv')


Archive:  fashionmnist.zip
  inflating: fashion-mnist_test.csv  
  inflating: fashion-mnist_train.csv  
  inflating: t10k-images-idx3-ubyte  
  inflating: t10k-labels-idx1-ubyte  
  inflating: train-images-idx3-ubyte  
  inflating: train-labels-idx1-ubyte  


In [None]:
# Sanity check: (rows, columns) of the training and test frames on one line.
print(*(frame.shape for frame in (train_df, test_df)))

(60000, 785) (10000, 785)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
torch.manual_seed(42)

# Column 0 of each frame is the label; the remaining columns are the features,
# divided by 255.0 here.
Y_train,X_train=train_df.iloc[:,0].values,train_df.iloc[:,1:].values/255.0
Y_test,X_test=test_df.iloc[:,0].values,test_df.iloc[:,1:].values/255.0

# One shape per line: features first, then labels.
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape,sep="\n")

(60000, 784)
(10000, 784)
(60000,)
(10000,)


In [None]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
  """Dataset that keeps every feature row and label in memory as tensors."""

  def __init__(self,features,labels):
    # Converted once up front: float32 features and int64 labels.
    self.features=torch.tensor(features,dtype=torch.float32)
    self.labels=torch.tensor(labels,dtype=torch.long)

  def __len__(self):
    """Number of samples, i.e. the size of the leading feature dimension."""
    return self.features.size(0)

  def __getitem__(self,index):
    """Return the ``(features, label)`` pair stored at ``index``; no transform is applied."""
    sample=self.features[index]
    target=self.labels[index]
    return sample,target

In [None]:
# Wrap the training and test arrays in the in-memory dataset class.
train_dataset,test_dataset=CustomDataset(X_train,Y_train),CustomDataset(X_test,Y_test)

In [None]:
batch_size=32
# Only the training split is shuffled. Evaluation walks the test set in a fixed
# order, which matches the loaders built inside the Optuna objective and avoids
# drawing extra random numbers during evaluation.
train_dataloader= torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,shuffle=True,pin_memory=True)
test_dataloader= torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,shuffle=False,pin_memory=True)

In [None]:
import torch.nn as nn
import torch.optim as optim
# Layer widths from input to output: the number of feature columns, two hidden widths, then 10 outputs.
layer=[X_train.shape[1],128,64,10]

# Neural Network Architecture

In [None]:
class ANN(nn.Module):
  """Fully connected network built from a list of layer widths.

  ``layer[0]`` is the input width and ``layer[-1]`` the output width. Every
  Linear layer except the last is followed by BatchNorm1d, ReLU and
  Dropout(p=0.3). Sub-modules live in a ModuleDict under the keys ``W{i}``,
  ``batch_Norm{i}`` and ``drop_out{i}`` (1-based).
  """

  def __init__(self,layer):
    super().__init__()
    self.layer_size=len(layer)
    self.network=nn.ModuleDict()

    output_idx=self.layer_size-1
    for idx,(fan_in,fan_out) in enumerate(zip(layer[:-1],layer[1:]),start=1):
      self.network[f"W{idx}"]=nn.Linear(fan_in,fan_out)
      if idx==output_idx:
        # The output layer gets no normalisation or dropout.
        continue
      self.network[f"batch_Norm{idx}"]=nn.BatchNorm1d(fan_out)
      self.network[f"drop_out{idx}"]=nn.Dropout(p=0.3)

  def forward(self,X):
    """Run ``X`` through every layer and return the raw output of the last Linear."""
    output_idx=self.layer_size-1
    for idx in range(1,self.layer_size):
      X=self.network[f"W{idx}"](X)
      if idx<output_idx:
        normalised=self.network[f"batch_Norm{idx}"](X)
        X=self.network[f"drop_out{idx}"](torch.relu(normalised))
    return X
# Build the network from the layer widths and move it to the selected device
# (Module.to returns the module itself, so the two steps can be chained).
model=ANN(layer).to(device)

In [None]:
model

ANN(
  (network): ModuleDict(
    (W1): Linear(in_features=784, out_features=128, bias=True)
    (batch_Norm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (drop_out1): Dropout(p=0.3, inplace=False)
    (W2): Linear(in_features=128, out_features=64, bias=True)
    (batch_Norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (drop_out2): Dropout(p=0.3, inplace=False)
    (W3): Linear(in_features=64, out_features=10, bias=True)
  )
)

# Prediction Function for a Dataset

In [None]:
def predict(dataloader):
  """Return the accuracy of the global ``model`` over every sample in ``dataloader``.

  Accuracy is ``correct / total`` counted per sample. Averaging per-batch
  means would over-weight a smaller final batch.

  The model is put in eval mode for the pass and then returned to whatever
  mode it was in, so calling this in the middle of training does not leave
  Dropout/BatchNorm in eval behaviour.

  Args:
    dataloader: iterable yielding ``(features, labels)`` batches.

  Returns:
    Fraction of correctly classified samples as a Python float.

  Raises:
    ZeroDivisionError: if ``dataloader`` yields no samples.
  """
  was_training=model.training
  model.eval()
  correct=0
  total=0
  try:
    with torch.no_grad():
      for batch_features,batch_labels in dataloader:
        batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)
        # Predicted class = index of the largest logit in each row.
        predicted=model(batch_features).argmax(dim=1)
        correct+=(predicted==batch_labels).sum().item()
        total+=batch_labels.size(0)
  finally:
    # Restore the caller's train/eval mode even if the pass fails.
    model.train(was_training)
  return correct/total

# Parameter Initialization

In [None]:

# Training configuration for the baseline network.
lr=0.005
epochs=100
loss_func=nn.CrossEntropyLoss()
optimizer=optim.Adam(
  model.parameters(),
  lr=lr,
  betas=(0.9,0.999),
  eps=1e-08,
  weight_decay=1e-4,
)
# Alternative kept for reference:
# optim.SGD(model.parameters(),lr,momentum=0.9,weight_decay=1e-2)

In [None]:
def training_model(epochs,dataloader,loss_func,optimizer,lr):
  """Train the global ``model`` for exactly ``epochs`` passes over ``dataloader``.

  Args:
    epochs: number of full passes over ``dataloader``.
    dataloader: iterable yielding ``(features, labels)`` batches.
    loss_func: callable mapping ``(predictions, labels)`` to a scalar loss tensor.
    optimizer: optimizer whose ``zero_grad``/``step`` are called once per batch;
      presumably constructed over ``model.parameters()`` — verify at the call site.
    lr: unused. It stays in the signature so existing calls keep working; the
      effective learning rate is the one ``optimizer`` was built with.

  Returns:
    The iterator returned by ``model.parameters()``.
  """
  model.train()

  # range(epochs), not range(epochs+1): the argument is the number of passes.
  for _ in range(epochs):
    for batch_features,batch_labels in dataloader:
      batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)

      y_pred=model(batch_features)
      loss_batch=loss_func(y_pred,batch_labels)

      optimizer.zero_grad()
      loss_batch.backward()
      optimizer.step()

  return model.parameters()



In [None]:
loss=[]
# max(1, ...) keeps the modulus non-zero when epochs < 5.
log_every=max(1,epochs//5)
# range(epochs+1) so that the last progress report lands on epoch == epochs.
for epoch in range(epochs+1):
  # predict() calls model.eval(); switch back to training mode every epoch so
  # Dropout and BatchNorm keep their training behaviour after a report.
  model.train()
  loss_epoch=0
  for batch_features,batch_labels in train_dataloader:
    batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)

    y_pred=model(batch_features)

    loss_batch=loss_func(y_pred,batch_labels)
    # Accumulates the mean of the per-batch losses for this epoch.
    loss_epoch+=loss_batch.item()/len(train_dataloader)
    optimizer.zero_grad()
    loss_batch.backward()
    optimizer.step()
  loss.append(loss_epoch)
  if epoch%log_every==0:
    print(f"Epoch {epoch} | Loss: {loss_epoch} |  Accuracy on training_data:{ predict(train_dataloader)} | Accuracy on test_data:{ predict(test_dataloader)}")
    print("="*130)

Epoch 0 | Loss: 0.6986503779570257 |  Accuracy on training_data:0.8165833353996277 | Accuracy on test_data:0.8152955174446106
Epoch 20 | Loss: 0.4115243405262628 |  Accuracy on training_data:0.8405333161354065 | Accuracy on test_data:0.8406549096107483
Epoch 40 | Loss: 0.41338149827718684 |  Accuracy on training_data:0.86326664686203 | Accuracy on test_data:0.8563298583030701
Epoch 60 | Loss: 0.41588735479116457 |  Accuracy on training_data:0.8577666878700256 | Accuracy on test_data:0.8512380123138428
Epoch 80 | Loss: 0.41552710200548204 |  Accuracy on training_data:0.8507166504859924 | Accuracy on test_data:0.8456469774246216
Epoch 100 | Loss: 0.4139802326798434 |  Accuracy on training_data:0.8555166721343994 | Accuracy on test_data:0.8513378500938416


# Fine Tuning Hyperparameters using Optuna

**Architecture**

In [None]:
class MY_ANN(nn.Module):
  """Multi-layer perceptron with ``num_hidden_layers`` equally wide hidden blocks.

  Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(dropout_rate);
  a final Linear maps the last hidden width (or ``input_dim`` when there are
  no hidden blocks) to ``output_dim``.
  """

  def __init__(self,input_dim,output_dim,num_hidden_layers,neurons_per_layer,dropout_rate):
    super().__init__()
    # widths[k] is the input width of block k; the last entry feeds the output layer.
    widths=[input_dim]+[neurons_per_layer]*num_hidden_layers
    modules=[]
    for fan_in in widths[:-1]:
      modules+=[
        nn.Linear(fan_in,neurons_per_layer),
        nn.BatchNorm1d(neurons_per_layer),
        nn.ReLU(),
        nn.Dropout(p=dropout_rate),
      ]
    modules.append(nn.Linear(widths[-1],output_dim))

    self.network=nn.Sequential(*modules)

  def forward(self,X):
    """Return the output of the final Linear layer for the batch ``X``."""
    return self.network(X)

# Objective Function

In [None]:
#Objective function
def objective(trial):
  """Optuna objective: train one MY_ANN configuration and return its test accuracy.

  Samples the architecture (depth, width, dropout), the optimisation settings
  (optimizer, learning rate, weight decay, batch size) and the number of
  epochs from ``trial``, trains on the global ``train_dataset`` and scores on
  the global ``test_dataset``.

  Args:
    trial: the ``optuna.trial.Trial`` supplying hyperparameter suggestions.

  Returns:
    Fraction of correctly classified test samples as a Python float. It is
    counted per sample so that a smaller final batch is not over-weighted.
  """
  #next hyperparameter values from search space
  num_hidden_layers=trial.suggest_int("num_hidden_layers",1,5)
  neurons_per_layer=trial.suggest_int("neurons_per_layer",16,128,step=8)
  epochs=trial.suggest_int("epochs",10,60,step=10)
  lr=trial.suggest_float("lr",1e-5,1e-1,log=True)
  dropout_rate=trial.suggest_float("dropout_rate",0.1,0.4,step=0.05)
  batch_size=trial.suggest_categorical("batch_size",[16,32,64,128])
  optimizer_name=trial.suggest_categorical('optimizer',["SGD","Adam","RMSprop"])
  wt_decay=trial.suggest_float("wt_decay",1e-5,1e-1,log=True)

  train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True,pin_memory=True)
  # NOTE(review): trials are scored on the test split; a held-out validation
  # split would keep the test set untouched by the search.
  test_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False,pin_memory=True)

  # input init
  input_dim=784
  output_dim=10

  #Model init
  model=MY_ANN(input_dim,output_dim,num_hidden_layers,neurons_per_layer,dropout_rate)
  model.to(device)

  #loss function
  loss_func=nn.CrossEntropyLoss()
  #optimizer selection (the sampled name is kept separate from the optimizer object)
  if optimizer_name=="Adam":
    optimizer=optim.Adam(model.parameters(),lr,weight_decay=wt_decay)
  elif optimizer_name=="SGD":
    optimizer=optim.SGD(model.parameters(),lr,momentum=0.9,weight_decay=wt_decay)
  else:
    optimizer=optim.RMSprop(model.parameters(),lr,weight_decay=wt_decay)

  #training loop
  model.train()
  for _ in range(epochs):
    for batch_features,batch_labels in train_dataloader:
      batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)
      y_pred=model(batch_features)

      loss_batch=loss_func(y_pred,batch_labels)
      optimizer.zero_grad()
      loss_batch.backward()
      optimizer.step()

  #evaluation: count correct predictions over all samples
  correct=0
  total=0
  model.eval()
  with torch.no_grad():
    for batch_features,batch_labels in test_dataloader:
      batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)
      predicted=model(batch_features).argmax(dim=1)
      correct+=(predicted==batch_labels).sum().item()
      total+=batch_labels.size(0)
  return correct/total

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import optuna
# TPE is Optuna's default single-objective sampler; seeding it makes the
# sequence of suggested hyperparameters repeatable across runs. The training
# inside each trial may still vary from run to run.
study=optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective,n_trials=10)

[I 2025-02-04 07:00:29,753] A new study created in memory with name: no-name-7bbd1358-f6cf-49dc-9828-535b84ef851e
[I 2025-02-04 07:04:21,594] Trial 0 finished with value: 0.8850517868995667 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 104, 'epochs': 40, 'lr': 0.001226281358718413, 'dropout_rate': 0.4, 'batch_size': 64, 'optimizer': 'Adam', 'wt_decay': 0.00011017284119743356}. Best is trial 0 with value: 0.8850517868995667.
[I 2025-02-04 07:07:40,466] Trial 1 finished with value: 0.877587616443634 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 64, 'epochs': 50, 'lr': 0.0029520968445556754, 'dropout_rate': 0.4, 'batch_size': 64, 'optimizer': 'Adam', 'wt_decay': 0.0001018597179025824}. Best is trial 0 with value: 0.8850517868995667.
[I 2025-02-04 07:13:31,798] Trial 2 finished with value: 0.8064000010490417 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 112, 'epochs': 30, 'lr': 0.019105849541904203, 'dropout_rate': 0.35, 'batch_size': 16, 'op

In [None]:
study.best_value

0.8664952516555786

# Training Using a CNN Architecture



In [None]:
from torchvision import transforms

# Training-time augmentation: a random rotation (degrees=10), a random horizontal
# flip and a random shift of up to 10% per axis, applied to a PIL image and then
# converted back to a tensor.
train_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.RandomRotation(degrees=10),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
    ]
)

# Evaluation pipeline: the same PIL round trip with no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.ToTensor(),
    ]
)

In [None]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
  """In-memory image dataset yielding ``(image, label)`` pairs.

  Features are stored as a float32 tensor reshaped to ``(N, 1, 28, 28)`` and
  labels as an int64 tensor. When a ``transform`` is given it is applied to
  each sample on access and its result is what gets returned.

  Args:
    features: array-like that can be reshaped to ``(-1, 1, 28, 28)``.
    labels: array-like of integer class labels, one per sample.
    transform: optional callable. It receives one sample as a 2-D ``(28, 28)``
      NumPy array and returns the sample to hand to the model.
  """

  def __init__(self,features,labels,transform=None):
    self.features=torch.tensor(features,dtype=torch.float32).reshape(-1,1,28,28)
    self.labels=torch.tensor(labels,dtype=torch.long)
    # Stored under the same name that __getitem__ reads.
    self.transformer=transform

  def __len__(self):
    """Number of samples."""
    return self.features.shape[0]

  def __getitem__(self,index):
    """Return ``(image, label)`` for ``index``, with the transform applied if one was given."""
    image=self.features[index]
    if self.transformer is not None:
      # Drop the channel axis so the transform sees a plain (28, 28) array.
      image=self.transformer(image.squeeze(0).numpy())
    return image,self.labels[index]

In [None]:
# Rebuild both datasets in image form: the augmenting pipeline for training,
# the plain conversion pipeline for testing.
train_dataset,test_dataset=(
  CustomDataset(X_train,Y_train,transform=train_transform),
  CustomDataset(X_test,Y_test,transform=test_transform),
)

In [None]:
class CNN(nn.Module):
  """Configurable CNN: a stack of conv blocks followed by a fully connected head.

  Each conv block is Conv2d(kernel_size=5, padding="same") -> ReLU ->
  BatchNorm2d -> MaxPool2d(2), so every block halves the spatial size. The
  head flattens the feature map and applies ``num_fc_layers - 1`` blocks of
  Linear -> BatchNorm1d -> ReLU -> Dropout before the final Linear.

  Args:
    num_cnn_layers: number of conv blocks.
    filter_size: output channels per conv block; needs at least
      ``num_cnn_layers`` entries.
    num_fc_layers: number of Linear layers in the head, including the output layer.
    fc_neurons: widths of the head. ``fc_neurons[0]`` must equal the flattened
      size of the conv output and ``fc_neurons[-1]`` feeds the output layer.
    dropout_rate: dropout probability used in the head.
    in_channels: channels of the input images. Defaults to 1 because the
      datasets in this notebook are reshaped to ``(N, 1, 28, 28)``.
    num_classes: number of output logits (10 by default).
  """

  def __init__(self,num_cnn_layers,filter_size,num_fc_layers,fc_neurons,dropout_rate,in_channels=1,num_classes=10):
    super().__init__()

    # Convolutional layers
    cnn_layer=[]
    low=in_channels
    for i in range(num_cnn_layers):
      cnn_layer.append(nn.Conv2d(low,filter_size[i],kernel_size=5,padding="same"))
      cnn_layer.append(nn.ReLU())
      cnn_layer.append(nn.BatchNorm2d(filter_size[i]))
      cnn_layer.append(nn.MaxPool2d(kernel_size=2,stride=2))
      # The next block consumes what this one produced.
      low=filter_size[i]

    # Fully connected layers
    ann_layer=[nn.Flatten()]
    for i in range(num_fc_layers-1):
      ann_layer.append(nn.Linear(fc_neurons[i],fc_neurons[i+1]))
      ann_layer.append(nn.BatchNorm1d(fc_neurons[i+1]))
      ann_layer.append(nn.ReLU())
      ann_layer.append(nn.Dropout(p=dropout_rate))
    # Output layer: one logit per class.
    ann_layer.append(nn.Linear(fc_neurons[-1],num_classes))

    self.cnn_network=nn.Sequential(*cnn_layer)
    self.fc_network=nn.Sequential(*ann_layer)

  def forward(self,X):
    """Return class logits for a batch ``X`` of shape ``(N, in_channels, H, W)``."""
    X=self.cnn_network(X)
    return self.fc_network(X)


In [None]:

def objective(trial):
  """Optuna objective: build, train and score one CNN configuration.

  Samples the conv depth and base filter count, the depth and width of the
  fully connected head, and the optimisation settings from ``trial``. Trains
  on the global ``train_dataset`` and scores on the global ``test_dataset``.

  Args:
    trial: the ``optuna.trial.Trial`` supplying hyperparameter suggestions.

  Returns:
    Fraction of correctly classified test samples as a Python float, counted
    per sample.
  """
  #hyperparameter to be tuned  from search space
  num_cnn_layers=trial.suggest_int("num_cnn_layers",1,3)

  # Filter counts double with depth: [f, 2f, 4f, ...].
  filter_val=trial.suggest_int("filter",5,16)
  filter_size=[filter_val*(2**depth) for depth in range(num_cnn_layers)]

  num_fc_layers=trial.suggest_int("num_fc_layers",2,5)

  # Every MaxPool2d(2) halves the 28x28 input (floor division), so this is the
  # spatial side length after the conv stack.
  output_size=28
  for _ in range(num_cnn_layers):
    output_size//=2
  fc_neurons=[filter_size[-1]*output_size*output_size]

  # Width of the first hidden FC layer: a multiple of `steps` between roughly
  # 1/40 and 1/4 of the flattened size. Bounds are integers and never drop
  # below `steps`, so a zero-width layer cannot be sampled.
  steps=32
  high_fc=max(steps,(fc_neurons[0]//4)//steps*steps)
  low_fc=max(steps,(high_fc//10)//steps*steps)
  neurons=trial.suggest_int("neurons_per_layer",low_fc,high_fc,step=steps)
  # Each further hidden layer is half as wide, but never narrower than `steps`.
  for _ in range(num_fc_layers-1):
    fc_neurons.append(neurons)
    neurons=max(steps,neurons//2)

  # Additional hyperparameters
  dropout_rate=trial.suggest_float("dropout_rate",0.1,0.6,step=0.1)
  batch_size=trial.suggest_categorical("batch_size",[32,64,128])
  optimizer_name=trial.suggest_categorical('optimizer',["SGD","Adam","RMSprop"])
  wt_decay=trial.suggest_float("wt_decay",1e-5,1e-1,log=True)
  epochs=trial.suggest_int("epochs",10,30,step=5)
  lr=trial.suggest_float("lr",1e-5,1e-1,log=True)

  # Build the model
  model=CNN(num_cnn_layers,filter_size,num_fc_layers,fc_neurons,dropout_rate)
  model.to(device)

  #loss function
  loss_func=nn.CrossEntropyLoss()
  #optimizer selection (the sampled name is kept separate from the optimizer object)
  if optimizer_name=="Adam":
    optimizer=optim.Adam(model.parameters(),lr,weight_decay=wt_decay)
  elif optimizer_name=="SGD":
    optimizer=optim.SGD(model.parameters(),lr,momentum=0.9,weight_decay=wt_decay)
  else:
    optimizer=optim.RMSprop(model.parameters(),lr,weight_decay=wt_decay)

  # Data Loaders
  trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,shuffle=True, num_workers=2)
  # NOTE(review): trials are scored on the test split; a held-out validation
  # split would keep the test set untouched by the search.
  testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,shuffle=False, num_workers=2)

  #training loop
  model.train()
  for _ in range(epochs):
    for batch_features,batch_labels in trainloader:
      batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)
      y_pred=model(batch_features)

      loss_batch=loss_func(y_pred,batch_labels)
      optimizer.zero_grad()
      loss_batch.backward()
      optimizer.step()

  #evaluation: count correct predictions over all samples
  correct=0
  total=0
  model.eval()
  with torch.no_grad():
    for batch_features,batch_labels in testloader:
      batch_features,batch_labels=batch_features.to(device),batch_labels.to(device)
      # argmax yields the class-index tensor directly; there is nothing to unpack.
      predicted=model(batch_features).argmax(dim=1)
      correct+=(predicted==batch_labels).sum().item()
      total+=batch_labels.size(0)
  return correct/total

In [None]:
filter_val=6

# Geometric growth by 1.5 (truncated to int): seed the list with the starting
# value, then derive the remaining three entries, updating filter_val as we go.
filter_size=[filter_val]+[filter_val:=int(filter_val*1.5) for _ in range(3)]

In [None]:
filter_size

[6, 9, 13, 19]