In [41]:
import torch
import numpy as np
from torchvision.datasets import MNIST
from torchvision import transforms

In [42]:
# Preprocessing applied to every image: convert it to a tensor, then
# standardise the single channel with mean 0.1307 / std 0.3081
# (presumably the MNIST training-set statistics - TODO confirm).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

In [43]:
# Training split of MNIST read from ./data with the preprocessing pipeline
# attached. download=False means the files must already be present locally.
train_data = MNIST(
    root='data',
    train=True,
    download=False,
    transform=transform,
)

In [44]:
# Number of samples in the training split.
len(train_data)

60000

In [45]:
# Number of samples per mini-batch handed out by the DataLoader.
batch_size = 50

# Fraction presumably reserved for a validation split; it is not used by any
# cell visible here - TODO confirm where (or whether) it is consumed.
val_size = 0.2

In [46]:
from torch.utils.data import DataLoader


# Mini-batch iterator over the training set; shuffle=True reorders the
# samples each time the loader is iterated.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)

In [47]:
# Display the DataLoader object's repr.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x14578086bd0>

In [48]:
# Peek at one mini-batch to confirm the input / target tensor shapes,
# then stop iterating.
for data, target in train_loader:
    print(data.shape, target.shape, sep="\n")
    break

torch.Size([50, 1, 28, 28])
torch.Size([50])


In [72]:
from torch import nn

In [73]:
# Three-layer fully connected classifier: 784 -> 512 -> 128 -> 10.
model = nn.Sequential(
    # (batch, 1, 28, 28) images -> (batch, 784) vectors
    nn.Flatten(),
    # hidden block 1: affine map, ReLU, dropout for regularization
    nn.Linear(in_features=784, out_features=512),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # hidden block 2: affine map, ReLU, dropout for regularization
    nn.Linear(in_features=512, out_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # output layer: one raw score (logit) per digit class
    nn.Linear(in_features=128, out_features=10),
)

In [74]:
# Loss function and optimizer
from torch import optim

# Cross-entropy between the model's 10 raw class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent over every parameter of the model.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [75]:
# Display the layer-by-layer summary of the network.
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.3, inplace=False)
  (4): Linear(in_features=512, out_features=128, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.3, inplace=False)
  (7): Linear(in_features=128, out_features=10, bias=True)
)

In [76]:
# Report whether PyTorch can use a CUDA device in this environment.
print(torch.cuda.is_available())

False


In [77]:
# Device string to train on: 'cuda' when a GPU is usable, otherwise 'cpu'.
# (The device type must be spelled 'cuda'; 'cude' is rejected by torch.device.)
'cuda' if torch.cuda.is_available() else 'cpu'

'cpu'

In [78]:
# Select the GPU when one is usable, otherwise fall back to the CPU.
# The device type must be spelled 'cuda'; the previous 'cude' typo would make
# torch.device raise a RuntimeError on any machine where CUDA is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [79]:
# Show which device was selected.
device

device(type='cpu')

In [80]:
# Move the model to the available device.
# As the last expression in the cell, the returned module is also displayed.
model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.3, inplace=False)
  (4): Linear(in_features=512, out_features=128, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.3, inplace=False)
  (7): Linear(in_features=128, out_features=10, bias=True)
)

In [81]:
%%time
for epoch in range(1, 11):  # 10 passes over the training set
    train_loss = []  # per-batch losses collected during this epoch

    model.train()  # training mode, so the dropout layers are active
    for features, target in train_loader:
        # Put the batch on the same device as the model.
        features = features.to(device)
        target = target.to(device)

        # Standard update: clear old gradients, forward pass, loss,
        # backward pass, parameter step.
        optimizer.zero_grad()
        output = model(features)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    # Report the mean batch loss for the epoch.
    print(f'Epoch: {epoch}, Loss: {np.mean(train_loss):.4f}')

Epoch: 1, Loss: 0.7831
Epoch: 2, Loss: 0.3409
Epoch: 3, Loss: 0.2662
Epoch: 4, Loss: 0.2215
Epoch: 5, Loss: 0.1927
Epoch: 6, Loss: 0.1707
Epoch: 7, Loss: 0.1522
Epoch: 8, Loss: 0.1400
Epoch: 9, Loss: 0.1272
Epoch: 10, Loss: 0.1170
CPU times: total: 6min 25s
Wall time: 4min 25s


In [82]:
# Print the architecture of the trained network.
print("printing our model: \n\n", model)

printing our model: 

 Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.3, inplace=False)
  (4): Linear(in_features=512, out_features=128, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.3, inplace=False)
  (7): Linear(in_features=128, out_features=10, bias=True)
)


In [83]:
# List the state-dict keys; only layers with parameters (the Linear layers
# at indices 1, 4 and 7) contribute entries.
print("Models layer keys: \n\n", model.state_dict().keys())

Models layer keys: 

 odict_keys(['1.weight', '1.bias', '4.weight', '4.bias', '7.weight', '7.bias'])


In [85]:
# Print every state-dict entry as "<key> : <tensor>".
for name, tensor in model.state_dict().items():
    print(name, ":", tensor)

1.weight : tensor([[ 2.5667e-02, -8.6721e-03,  2.2859e-02,  ..., -2.4975e-03,
         -6.5927e-05, -2.0415e-02],
        [-1.7991e-02, -1.1361e-02, -1.9493e-02,  ..., -7.1986e-03,
         -1.8471e-02,  2.0117e-02],
        [-7.5475e-03,  5.5972e-03,  3.5059e-02,  ...,  3.5615e-02,
         -2.1697e-02,  1.7564e-04],
        ...,
        [ 3.2161e-02,  7.3295e-03, -2.2093e-02,  ...,  2.6253e-02,
          6.7193e-03,  2.5626e-02],
        [-1.8333e-02,  3.2315e-02, -2.3867e-02,  ..., -2.1071e-02,
          3.2737e-02,  1.7964e-02],
        [-2.3316e-02, -1.7308e-02, -1.9036e-02,  ...,  3.5937e-02,
          9.7518e-03, -5.6643e-03]])
1.bias : tensor([-1.4592e-02,  1.1928e-02,  2.3046e-02, -1.6126e-02, -6.6139e-03,
         6.8304e-03, -2.9327e-02, -1.0291e-03, -1.7379e-02, -3.1357e-02,
         2.0865e-02,  3.2411e-04, -3.3225e-02, -1.4372e-02,  3.2821e-02,
         1.3659e-02, -2.0996e-02,  3.1739e-02, -2.8363e-02,  4.1561e-03,
        -3.9263e-03,  2.2866e-02,  3.2274e-02, -1.6454e-

In [86]:
# Persist only the learned parameters (the state dict) to disk.
torch.save(obj=model.state_dict(), f='model.pth')

In [88]:
# Read the saved parameters back from disk.
# weights_only=True limits unpickling to tensors and plain containers, which
# is all a state dict holds; it avoids executing arbitrary pickled code and
# silences the FutureWarning that torch.load emits when the flag is omitted.
state_dict = torch.load('model.pth', weights_only=True)
print(state_dict.keys())

odict_keys(['1.weight', '1.bias', '4.weight', '4.bias', '7.weight', '7.bias'])


  state_dict = torch.load('model.pth')


In [92]:
# Rebuild the same 784 -> 512 -> 128 -> 10 architecture with fresh weights so
# that the saved state dict (keys 1.*, 4.*, 7.*) can be loaded into it.
model = nn.Sequential(
    # (batch, 1, 28, 28) images -> (batch, 784) vectors
    nn.Flatten(),
    # hidden block 1: affine map, ReLU, dropout for regularization
    nn.Linear(in_features=784, out_features=512),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # hidden block 2: affine map, ReLU, dropout for regularization
    nn.Linear(in_features=512, out_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # output layer: one raw score (logit) per digit class
    nn.Linear(in_features=128, out_features=10),
)

In [93]:
# Copy the saved parameters into the freshly built network; the returned
# key-matching report is displayed as the cell output.
model.load_state_dict(state_dict)

<All keys matched successfully>

# Batch Normalization:
Batch normalization is added after each linear layer and before the non-linear activation function.

In [94]:
from torch import nn, optim
import torch.nn.functional as F

class Model(nn.Module):
    """Fully connected classifier with batch normalization and dropout.

    Four hidden blocks of ``Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.3)``
    with widths 784 -> 512 -> 256 -> 128 -> 56, followed by a plain linear
    layer that produces 10 raw class scores (no softmax is applied here).
    """

    def __init__(self):
        """Create the linear, batch-norm and dropout layers."""
        super().__init__()
        self.fc1 = nn.Linear(784, 512)
        self.bn1 = nn.BatchNorm1d(num_features=512)  # batch norm layer 1
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(num_features=256)  # batch norm layer 2
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(num_features=128)  # batch norm layer 3
        self.fc4 = nn.Linear(128, 56)
        self.bn4 = nn.BatchNorm1d(num_features=56)   # batch norm layer 4
        self.fc5 = nn.Linear(56, 10)

        # One dropout module (p=0.3) shared by all hidden blocks; it holds no
        # parameters, so sharing it is equivalent to having four copies.
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Compute class scores for a batch.

        Args:
            x: Tensor whose first dimension is the batch and whose remaining
                dimensions hold 784 values per sample, e.g. ``(N, 1, 28, 28)``
                or an already flattened ``(N, 784)``.

        Returns:
            Tensor of shape ``(N, 10)`` with one raw score per class.
        """
        # Flatten everything except the batch dimension. Unlike ``view``,
        # ``flatten`` also accepts non-contiguous inputs (it copies only when
        # it has to), and it leaves an already flat (N, 784) input unchanged.
        x = x.flatten(start_dim=1)

        # Hidden blocks: linear -> batch norm -> ReLU -> dropout.
        x = self.dropout(F.relu(self.bn1(self.fc1(x))))
        x = self.dropout(F.relu(self.bn2(self.fc2(x))))
        x = self.dropout(F.relu(self.bn3(self.fc3(x))))
        x = self.dropout(F.relu(self.bn4(self.fc4(x))))

        # No dropout or activation on the output layer.
        x = self.fc5(x)

        return x

In [None]:
# Instantiate the batch-norm network and place it on the same device the
# training loop moves each batch to; without this the model stays on the CPU
# and a GPU run fails with a device-mismatch error.
model = Model().to(device)

In [95]:
# Loss function and optimizer
from torch import optim

# Cross-entropy between the model's 10 raw class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent over every parameter of the model, with a
# larger initial learning rate that the LR scheduler decays later.
optimizer = optim.SGD(params=model.parameters(), lr=0.05)

In [96]:
# Step-decay LR schedule: the learning rate is multiplied by gamma=0.1
# once every 10 calls to scheduler.step().
scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=10,
    gamma=0.1,
)

In [None]:
%%time
for epoch in range(1, 16):  # run the model for 15 epochs
    train_loss = []  # per-batch losses collected during this epoch

    ## training part
    model.train()  # training mode: dropout active, batch norm uses batch statistics
    for data, target in train_loader:

        # Move input and label tensors to the available device
        data, target = data.to(device), target.to(device)

        # Reshape each image to a flat vector before sending it into the model
        data = data.view(data.shape[0], -1)

        optimizer.zero_grad()

        ## 1. forward propagation
        output = model(data)

        ## 2. loss calculation
        loss = criterion(output, target)

        ## 3. backward propagation
        loss.backward()

        ## 4. weight optimization
        optimizer.step()

        train_loss.append(loss.item())

    # Advance the LR schedule once per epoch, AFTER the optimizer updates.
    # Calling scheduler.step() before optimizer.step() (as was done at the top
    # of the loop) shifts the schedule by one epoch, so the decay would arrive
    # an epoch early, and makes PyTorch emit an ordering warning.
    scheduler.step()

    print ("Epoch:", epoch, "Training Loss: ", np.mean(train_loss))