<a href="https://colab.research.google.com/github/arpitpatelsitapur/my-py-torch-journey/blob/main/Fashion_MNIST_pytorch_ANN_model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Checking in only 6000 rows of training and 1000 of testing**

In [19]:
# we load MNIST data from keras, there are other methods too.
from keras import datasets
(X_train, y_train), (X_test, y_test) =datasets.fashion_mnist.load_data()

In [20]:
import pandas as pd

# Reshape the arrays to be 2-dimensional
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

# convert into Dataframe
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [21]:
X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,119,114,130,76,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,22,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,33,96,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
print("X_train.shape = ", X_train.shape)
print("X_test.shape = ", X_test.shape)
print("-"*100)
print("Keeping only 6000 in training and 1000 for testing.")
X_train=X_train.head(6000)
X_test=X_test.head(1000)
y_train=y_train.head(6000)
y_test=y_test.head(1000)
print("-"*100)
print("X_train.shape = ", X_train.shape)
print("X_test.shape = ", X_test.shape)


X_train.shape =  (60000, 784)
X_test.shape =  (10000, 784)
----------------------------------------------------------------------------------------------------
Keeping only 6000 in training and 1000 for testing.
----------------------------------------------------------------------------------------------------
X_train.shape =  (6000, 784)
X_test.shape =  (1000, 784)


In [23]:
# if running first time, u need to install torchinfo
!pip install torchinfo



In [24]:
import pandas as pd
import torch
import torch.nn as nn
from torchinfo import summary
import torch.optim as optim
torch.manual_seed(42)

<torch._C.Generator at 0x78043b524450>

In [25]:
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.long)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.long)

In [26]:
print(f"X_train_t.shape = {X_train_t.shape}, y_train_t.shape = {y_train_t.shape}")
print(f"X_test_t.shape = {X_test_t.shape}, y_test_t.shape = {y_test_t.shape}")

X_train_t.shape = torch.Size([6000, 784]), y_train_t.shape = torch.Size([6000, 1])
X_test_t.shape = torch.Size([1000, 784]), y_test_t.shape = torch.Size([1000, 1])


In [27]:
# define datset and dataloader
from torch.utils.data import Dataset,DataLoader

class custom_dataset(Dataset):
  def __init__(self,X,y):
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    return self.n_samples

  def __getitem__(self,index):
    return self.X[index],self.y[index]

train_dataset=custom_dataset(X_train_t,y_train_t)
test_dataset=custom_dataset(X_test_t,y_test_t)

train_loader=DataLoader(dataset=train_dataset,batch_size=150,shuffle=True)
test_loader=DataLoader(dataset=test_dataset,batch_size=150,shuffle=True)

for batch_X,batch_y in test_loader:
  # print(batch_X)
  # print(batch_y)
  print(f"batch_X.shape = {batch_X.shape}, batch_y.shape = {batch_y.shape}")
  print("-"*50)

batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([150, 784]), batch_y.shape = torch.Size([150, 1])
--------------------------------------------------
batch_X.shape = torch.Size([100, 784]), batch_y.shape = torch.Size([100, 1])
--------------------------------------------------


In [28]:
## **ANN model structure**
# - input layer (784)
# - 2 hidden layer (each 128)
# - 1 output layer
# - relu in hidden layers
# - softmax in output layer

# Define model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))      # hidden layer1 with ReLU
        x = torch.relu(self.fc2(x))      # hidden layer1 with ReLU
        x = self.fc3(x)  # output layer (remove softmax, CrossEntropyLoss includes it)
        return x

In [29]:
# Hyperparameters
input_size = X_train_t.shape[1]
hidden_size = 128
output_size = 10  # 10 classes for Fashion MNIST
num_epochs = 50
lr = 0.001

model = SimpleNN(input_size, hidden_size, output_size)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

summary(model)

Layer (type:depth-idx)                   Param #
SimpleNN                                 --
├─Linear: 1-1                            100,480
├─Linear: 1-2                            16,512
├─Linear: 1-3                            1,290
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0

In [30]:
# Training (one batch of dataset per epoch)
for epoch in range(num_epochs):
    total_loss=0
    for batch_X, batch_y in train_loader:
        # forward pass
        outputs = model(batch_X)
        # loss calculation
        l = loss(outputs, batch_y.squeeze(1))
        # backward pass
        optimizer.zero_grad()
        l.backward()
        # updating grads
        optimizer.step()
        total_loss+=l.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {total_loss/len(train_loader):.4f}")
    print("-"*100)

Epoch [1/50], Avg Loss: 4.0186
----------------------------------------------------------------------------------------------------
Epoch [2/50], Avg Loss: 0.6457
----------------------------------------------------------------------------------------------------
Epoch [3/50], Avg Loss: 0.5392
----------------------------------------------------------------------------------------------------
Epoch [4/50], Avg Loss: 0.4846
----------------------------------------------------------------------------------------------------
Epoch [5/50], Avg Loss: 0.4359
----------------------------------------------------------------------------------------------------
Epoch [6/50], Avg Loss: 0.4257
----------------------------------------------------------------------------------------------------
Epoch [7/50], Avg Loss: 0.3871
----------------------------------------------------------------------------------------------------
Epoch [8/50], Avg Loss: 0.3452
---------------------------------------------

In [31]:
# set model to evaluation mode
model.eval()

SimpleNN(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)

In [32]:
# testing
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 1000 test images: {acc} %')

Accuracy of the network on the 1000 test images: 82.0 %


## **Applying this ANN in complete data**

In [33]:
import pandas as pd
import torch
import torch.nn as nn
from torchinfo import summary
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from keras import datasets
torch.manual_seed(16)


# load dataset
(X_train, y_train), (X_test, y_test) =datasets.fashion_mnist.load_data()


# Reshape the arrays to be 2-dimensional
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

# convert into Dataframe
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# convert into tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.long)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.long)

# dataset and dataloader
class custom_dataset(Dataset):
  def __init__(self,X,y):
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    return self.n_samples

  def __getitem__(self,index):
    return self.X[index],self.y[index]

train_dataset=custom_dataset(X_train_t,y_train_t)
test_dataset=custom_dataset(X_test_t,y_test_t)

train_loader=DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)
test_loader=DataLoader(dataset=test_dataset,batch_size=32,shuffle=False)

# Define model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.model=nn.Sequential(
            nn.Linear(input_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,output_size)
        )

    def forward(self, x):
        return self.model(x)


# Hyperparameters
input_size = X_train_t.shape[1]
hidden_size = 128
output_size = 10  # 10 classes for Fashion MNIST
num_epochs = 50
lr = 0.001

model = SimpleNN(input_size, hidden_size, output_size)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
print(summary(model))

# Training (one batch of dataset per epoch)
for epoch in range(num_epochs):
    total_loss=0
    for batch_X, batch_y in train_loader:
        # forward pass
        outputs = model(batch_X)
        # loss calculation
        l = loss(outputs, batch_y.squeeze(1))
        # backward pass
        optimizer.zero_grad()
        l.backward()
        # updating grads
        optimizer.step()
        total_loss+=l.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")
    print("="*65)



# set model to evaluation mode
model.eval()

# testing
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the test images: {acc} %')

Layer (type:depth-idx)                   Param #
SimpleNN                                 --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       100,480
│    └─ReLU: 2-2                         --
│    └─Linear: 2-3                       16,512
│    └─ReLU: 2-4                         --
│    └─Linear: 2-5                       1,290
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
Epoch [1/50], Loss: 0.6750
Epoch [2/50], Loss: 0.4707
Epoch [3/50], Loss: 0.4279
Epoch [4/50], Loss: 0.4060
Epoch [5/50], Loss: 0.3921
Epoch [6/50], Loss: 0.3831
Epoch [7/50], Loss: 0.3714
Epoch [8/50], Loss: 0.3606
Epoch [9/50], Loss: 0.3515
Epoch [10/50], Loss: 0.3521
Epoch [11/50], Loss: 0.3418
Epoch [12/50], Loss: 0.3400
Epoch [13/50], Loss: 0.3368
Epoch [14/50], Loss: 0.3325
Epoch [15/50], Loss: 0.3288
Epoch [16/50], Loss: 0.3250
Epoch [17/50], Loss: 0.3327
Epoch [18/50], Loss: 0.3198
Epoch [19/50], Loss: 0.3200
Epoch [20/50], Loss: 0.3241
Epoch [2

## **Using GPU to Speedup Process**
1. Check for gpu device availability.
2. move model to gpu.
3. use `pin_memory=True` parameter for faster data loading.
4. move batches to gpu in both training and testing phase.
5. use larger size batches.

In [34]:
import pandas as pd
import torch
import torch.nn as nn
from torchinfo import summary
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from keras import datasets
torch.manual_seed(7)

# check gpu availability
if torch.cuda.is_available():
  device=torch.device("cuda")
else:
  device=torch.device("cpu")

# load dataset
(X_train, y_train), (X_test, y_test) =datasets.fashion_mnist.load_data()


# Reshape the arrays to be 2-dimensional
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

# convert into Dataframe
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# convert into tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.long)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.long)

# dataset and dataloader
class custom_dataset(Dataset):
  def __init__(self,X,y):
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    return self.n_samples

  def __getitem__(self,index):
    return self.X[index],self.y[index]

train_dataset=custom_dataset(X_train_t,y_train_t)
test_dataset=custom_dataset(X_test_t,y_test_t)

train_loader=DataLoader(dataset=train_dataset,batch_size=256,shuffle=True,pin_memory=True)
test_loader=DataLoader(dataset=test_dataset,batch_size=256,shuffle=False,pin_memory=True)

# Define model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.model=nn.Sequential(
            nn.Linear(input_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size,output_size)
        )

    def forward(self, x):
        return self.model(x)


# Hyperparameters
input_size = X_train_t.shape[1]
hidden_size = 128
output_size = 10  # 10 classes for Fashion MNIST
num_epochs = 100
lr = 0.001

model = SimpleNN(input_size, hidden_size, output_size)
# move model to GPU
model = model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
print(summary(model))

# Training (one batch of dataset per epoch)
for epoch in range(num_epochs):
    total_loss=0
    for batch_X, batch_y in train_loader:

        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        # forward pass
        outputs = model(batch_X)
        # loss calculation
        l = loss(outputs, batch_y.squeeze(1))
        # backward pass
        optimizer.zero_grad()
        l.backward()
        # updating grads
        optimizer.step()
        total_loss+=l.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {total_loss/len(train_loader):.4f}")
    print("="*65)

# checking accuracy in training data
# checking if our model is overfitted
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in train_loader:
        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the training images: {acc} %')

# set model to evaluation mode
model.eval()

# testing
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in test_loader:
        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the test images: {acc} %')

Layer (type:depth-idx)                   Param #
SimpleNN                                 --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       100,480
│    └─ReLU: 2-2                         --
│    └─Linear: 2-3                       16,512
│    └─ReLU: 2-4                         --
│    └─Linear: 2-5                       1,290
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
Epoch [1/100], Avg Loss: 0.9130
Epoch [2/100], Avg Loss: 0.4426
Epoch [3/100], Avg Loss: 0.3908
Epoch [4/100], Avg Loss: 0.3624
Epoch [5/100], Avg Loss: 0.3423
Epoch [6/100], Avg Loss: 0.3339
Epoch [7/100], Avg Loss: 0.3168
Epoch [8/100], Avg Loss: 0.3188
Epoch [9/100], Avg Loss: 0.3109
Epoch [10/100], Avg Loss: 0.2953
Epoch [11/100], Avg Loss: 0.2920
Epoch [12/100], Avg Loss: 0.2829
Epoch [13/100], Avg Loss: 0.2831
Epoch [14/100], Avg Loss: 0.2787
Epoch [15/100], Avg Loss: 0.2679
Epoch [16/100], Avg Loss: 0.2728
Epoch [17/100], Avg Loss: 0.2606
Epoch [

# **Optimizing model performance**
- Dropout using `p=0.3`
- Batch Normalization using `BatchNorm1D`
- Early stopping
- Regularization using `weight_decay`
- Adding more data
- Reducing Network complexity
- Data augmentation

In [35]:
import pandas as pd
import torch
import torch.nn as nn
from torchinfo import summary
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from keras import datasets
torch.manual_seed(7)

# check gpu availability
if torch.cuda.is_available():
  device=torch.device("cuda")
else:
  device=torch.device("cpu")

# load dataset
(X_train, y_train), (X_test, y_test) =datasets.fashion_mnist.load_data()


# Reshape the arrays to be 2-dimensional
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

# convert into Dataframe
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# convert into tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.long)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.long)

# dataset and dataloader
class custom_dataset(Dataset):
  def __init__(self,X,y):
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    return self.n_samples

  def __getitem__(self,index):
    return self.X[index],self.y[index]

train_dataset=custom_dataset(X_train_t,y_train_t)
test_dataset=custom_dataset(X_test_t,y_test_t)

train_loader=DataLoader(dataset=train_dataset,batch_size=256,shuffle=True,pin_memory=True)
test_loader=DataLoader(dataset=test_dataset,batch_size=256,shuffle=False,pin_memory=True)

# Define model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.model=nn.Sequential(
            nn.Linear(input_size,hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size,hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size,output_size)
        )

    def forward(self, x):
        return self.model(x)


# Hyperparameters
input_size = X_train_t.shape[1]
hidden_size = 128
output_size = 10  # 10 classes for Fashion MNIST
num_epochs = 100
lr = 0.001

model = SimpleNN(input_size, hidden_size, output_size)
# move model to GPU
model = model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
print(summary(model))

# Training (one batch of dataset per epoch)
for epoch in range(num_epochs):
    total_loss=0
    for batch_X, batch_y in train_loader:

        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        # forward pass
        outputs = model(batch_X)
        # loss calculation
        l = loss(outputs, batch_y.squeeze(1))
        # backward pass
        optimizer.zero_grad()
        l.backward()
        # updating grads
        optimizer.step()
        total_loss+=l.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {total_loss/len(train_loader):.4f}")
    print("="*65)

# checking accuracy in training data
# checking if our model is overfitted
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in train_loader:
        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the training images: {acc} %')

# set model to evaluation mode
model.eval()

# testing accuracy
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for batch_X, batch_y in test_loader:
        # move batches to GPU
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_X)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += batch_y.size(0)
        n_correct += (predicted == batch_y.squeeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the testing images: {acc} %')

Layer (type:depth-idx)                   Param #
SimpleNN                                 --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       100,480
│    └─BatchNorm1d: 2-2                  256
│    └─ReLU: 2-3                         --
│    └─Dropout: 2-4                      --
│    └─Linear: 2-5                       16,512
│    └─BatchNorm1d: 2-6                  256
│    └─ReLU: 2-7                         --
│    └─Dropout: 2-8                      --
│    └─Linear: 2-9                       1,290
Total params: 118,794
Trainable params: 118,794
Non-trainable params: 0
Epoch [1/100], Avg Loss: 0.6251
Epoch [2/100], Avg Loss: 0.4244
Epoch [3/100], Avg Loss: 0.3863
Epoch [4/100], Avg Loss: 0.3614
Epoch [5/100], Avg Loss: 0.3419
Epoch [6/100], Avg Loss: 0.3316
Epoch [7/100], Avg Loss: 0.3208
Epoch [8/100], Avg Loss: 0.3135
Epoch [9/100], Avg Loss: 0.3056
Epoch [10/100], Avg Loss: 0.2982
Epoch [11/100], Avg Loss: 0.2918
Epoch [12/100], Avg Loss: 0