# 1. Simple Dataset reading pipeline
The dataset is already downloaded, so reading it takes under 1 minute, compared with 20+ minutes when loading it through Hugging Face `datasets`.

In [1]:
import torch
import torch.nn as nn
import torchvision, torchinfo, torchmetrics
import torchvision
from sklearn.model_selection import train_test_split
import os, glob, zipfile
from tqdm import tqdm
from PIL import Image

def DOWNLOAD_DATASETS():
    """Unzip the Dogs-vs-Cats archives and build train/validation datasets and loaders.

    The archives ``test.zip`` and ``train.zip`` are read from
    ``../input/dogs-vs-cats-redux-kernels-edition/`` and extracted into the
    current directory. The training images are then split 80/20 into train and
    validation file lists (the split is random; no seed is fixed).

    Returns:
        tuple: ``(train_dataset, val_dataset, train_loader, val_loader)``.
        Each dataset item is ``(image_tensor, label)`` where the image has been
        resized to 224x224 and converted to a tensor, and the label is
        1 for "dog", 0 for "cat", or the raw file-name prefix otherwise.
    """
    zip_files = ['test.zip', 'train.zip']

    for zip_file in zip_files:
        archive_path = "../input/dogs-vs-cats-redux-kernels-edition/{}".format(zip_file)
        with zipfile.ZipFile(archive_path, "r") as z:
            z.extractall(".")
        print("{} unzipped".format(zip_file))

    # NOTE(review): archives are extracted to "." but images are read back from
    # "../working/train"; this presumably relies on the working directory being
    # named "working" (as on Kaggle) - confirm before running elsewhere.
    train_file_names_list = glob.glob(os.path.join("../working/train", '*.jpg'))

    train_list, val_list = train_test_split(train_file_names_list, test_size=0.2)

    transformation_list = torchvision.transforms.Compose([
        torchvision.transforms.Resize((224, 224)),
        torchvision.transforms.ToTensor(),
    ])

    # File-name prefix -> integer class id; unknown prefixes are passed through.
    label_ids = {'dog': 1, 'cat': 0}

    class Custom_Dataset(torch.utils.data.Dataset):
        """Map-style dataset over image paths named ``<label>.<id>.jpg``."""

        def __init__(self, file_list, transformation_list=None):
            self.file_list = file_list
            self.transform = transformation_list

        def __len__(self):
            # `filelength` is still recorded for any caller that reads it.
            self.filelength = len(self.file_list)
            return self.filelength

        def __getitem__(self, idx):
            img_path = self.file_list[idx]

            # Close the file handle promptly and force 3 channels so that
            # every sample collates into a batch of identical shape.
            with Image.open(img_path) as img:
                img_transformed = img.convert("RGB")
            if self.transform is not None:
                img_transformed = self.transform(img_transformed)

            # basename() handles the platform's path separator, unlike split('/').
            label = os.path.basename(img_path).split('.')[0]
            label = label_ids.get(label, label)

            return img_transformed, label

    train_dataset = Custom_Dataset(train_list, transformation_list)
    val_dataset = Custom_Dataset(val_list, transformation_list)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
    # Validation order does not affect the metrics, so no shuffling is needed.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

    return train_dataset, val_dataset, train_loader, val_loader


# Build everything once, then pull a single batch from each loader as a smoke test.
(
    training_dataset,
    validation_dataset,
    training_dataloader,
    validation_dataloader,
) = DOWNLOAD_DATASETS()
assert next(iter(training_dataloader)) is not None
assert next(iter(validation_dataloader)) is not None

test.zip unzipped
train.zip unzipped


```python
# NOTE: this takes at least 10 minutes to process; reading the image files directly with PyTorch (above) is much faster.
import datasets

dataset_from_hg = datasets.load_dataset("microsoft/cats_vs_dogs", split="train", ignore_verifications=True)


def transform_datasets(examples):
    """Add an "image_tensors" column holding the transformed version of each image in the batch."""
    # NOTE(review): `transformations_group` is not defined in the visible part
    # of this file - confirm where it comes from before running this snippet.
    examples["image_tensors"] = [transformations_group(image) for image in examples['image']]
    return examples


dataset_from_hg = dataset_from_hg.map(transform_datasets, batched=True)
```

# 2. Simple Model Training Pipeline

In [3]:
# Global training settings read by TRAIN_MODEL / EVALUATE_MODEL.
lr = 0.001    # SGD learning rate
epochs = 10   # number of passes over the training data
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def TRAIN_MODEL(model, training_dataloader, validation_dataloader):
    """Train `model` with plain SGD and report validation metrics after every epoch.

    Reads the module-level settings `lr`, `epochs` and `device`. Each batch is
    moved to `device`, the cross-entropy loss on the raw logits is minimised,
    and a running accuracy is shown on the progress bar. At the end of each
    epoch the sample-weighted mean training loss and the epoch training
    accuracy are printed next to the values returned by EVALUATE_MODEL.

    Args:
        model: network returning logits of shape (batch, 2).
        training_dataloader: iterable of (image_tensors, labels) batches.
        validation_dataloader: iterable passed to EVALUATE_MODEL.

    Raises:
        ValueError: if `training_dataloader` yields no batches.

    The model is left in evaluation mode (``model.train(mode=False)``).
    """
    OPTIMIZER = torch.optim.SGD(params=model.parameters(), lr=lr)
    metric = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(device)

    for epoch_no in range(epochs):
        # EVALUATE_MODEL switches the model to eval mode, so training mode must
        # be re-enabled at the start of every epoch (BatchNorm / Dropout).
        model.train(mode=True)
        metric.reset()
        loss_sum, samples_seen = 0.0, 0

        for batch_no, (image_tensors, labels) in enumerate(progress_bar := tqdm(training_dataloader)):
            x_actual = image_tensors.to(device)
            y_actual = labels.to(device).to(torch.int64)

            # Call the module itself (not .forward) so registered hooks run.
            y_predicted_LOGITS = model(x_actual)
            loss = nn.functional.cross_entropy(y_predicted_LOGITS, y_actual)

            OPTIMIZER.zero_grad()
            # backward() applies the chain rule through all nested layers and
            # stores d(loss)/d(parameter) on every parameter's .grad ...
            loss.backward()
            # ... and step() performs: parameter = parameter - lr * parameter.grad
            OPTIMIZER.step()

            loss_batch = loss.item()
            batch_size = y_actual.size(0)
            loss_sum += loss_batch * batch_size
            samples_seen += batch_size

            # Accumulate into the epoch-level accuracy, then read the running value.
            metric.update(y_predicted_LOGITS.detach(), y_actual)
            train_acc_epoch = metric.compute().item()

            progress_bar.set_description(f'batch_no = {batch_no},\t loss_batch = {loss_batch:0.4f},\t accuracy_avg = {train_acc_epoch:0.4f}')

        if samples_seen == 0:
            raise ValueError("training_dataloader yielded no batches")

        training_loss = loss_sum / samples_seen
        loss_validation, accuracy_validation = EVALUATE_MODEL(model, validation_dataloader)
        print(f'epoch_no = {epoch_no}, training_loss = {training_loss:0.4f}, validation_loss = {loss_validation:0.4f},\t training_accuracy = {train_acc_epoch:0.4f}, validation_accuracy = {accuracy_validation:0.4f}')

    model.train(mode=False)


def EVALUATE_MODEL(model, validation_dataloader):
    """Compute the mean cross-entropy loss and accuracy of `model` over a dataloader.

    Reads the module-level `device`. The model is switched to evaluation mode
    and is left in that mode when the function returns. Gradients are disabled
    while the batches are processed.

    Args:
        model: network returning logits of shape (batch, 2).
        validation_dataloader: iterable of (image_tensors, labels) batches.

    Returns:
        tuple[float, float]: (loss averaged over all samples, accuracy over all samples).

    Raises:
        ValueError: if `validation_dataloader` yields no batches.
    """
    model.eval()
    metric = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(device)
    loss_sum, samples_seen = 0.0, 0

    with torch.no_grad():
        for image_tensors, labels in validation_dataloader:
            x_actual = image_tensors.to(device)
            y_actual = labels.to(device).to(torch.int64)

            y_predicted_LOGITS = model(x_actual)
            # Sum (not mean) per batch so the final average is weighted by
            # sample count even when the last batch is smaller.
            loss_sum += nn.functional.cross_entropy(y_predicted_LOGITS, y_actual, reduction="sum").item()
            samples_seen += y_actual.size(0)
            metric.update(y_predicted_LOGITS, y_actual)

    if samples_seen == 0:
        raise ValueError("validation_dataloader yielded no batches")

    testing_accuracy_avg = metric.compute().item()
    return loss_sum / samples_seen, testing_accuracy_avg

# 3. Model Architecture Experiments

## Experiment 1

In [None]:
# (in_channels, out_channels) of each convolution stage. Every stage is
# conv3x3 ("same" padding) -> ReLU -> 2x2 max-pool with stride 2, so a
# 224x224 input shrinks 224 -> 112 -> 56 -> 28 -> 14 -> 7.
_stage_channels = [(3, 50), (50, 50), (50, 50), (50, 50), (50, 512)]

_stage_layers = []
for _c_in, _c_out in _stage_channels:
    _stage_layers += [
        nn.Conv2d(in_channels=_c_in, out_channels=_c_out, kernel_size=(3, 3), padding="same"),
        nn.ReLU(),
        nn.MaxPool2d((2, 2), 2),
    ]
feature_extractor = nn.Sequential(*_stage_layers)

# Classifier head: flatten the 512x7x7 feature map, then 50 hidden units -> 2 logits.
decision_maker = nn.Sequential(
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=512 * 7 * 7, out_features=50),
    nn.ReLU(),
    nn.Linear(in_features=50, out_features=2),
)

model = nn.Sequential(feature_extractor, decision_maker)
model = model.to(device)

# Shape check with one random image: prints the feature-map shape and the logits shape.
test_example = torch.randn((1, 3, 224, 224)).to(device)
print(feature_extractor(test_example).shape, model(test_example).shape)

In [None]:
# Train the Experiment 1 model on the loaders built in section 1.
TRAIN_MODEL(
    model,
    training_dataloader,
    validation_dataloader,
)

## Experiment 2

In [5]:
class Cnn(nn.Module):
    """Small CNN: three down-sampling conv stages followed by a two-layer classifier.

    The first linear layer is hard-wired to a 64x3x3 feature map, which is what
    the three stages produce for a 3x224x224 input. The output is a pair of
    raw logits per sample.
    """

    def __init__(self):
        super().__init__()

        # Each stage shrinks height/width to roughly a quarter
        # (stride-2 convolution followed by a 2x2 max-pool).
        self.layer1 = self._downsample_stage(3, 16)
        self.layer2 = self._downsample_stage(16, 32)
        self.layer3 = self._downsample_stage(32, 64)

        self.fc1 = nn.Linear(3 * 3 * 64, 10)
        # NOTE(review): this dropout layer is registered but never applied in
        # forward() - confirm whether that is intentional.
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(10, 2)
        self.relu = nn.ReLU()

    @staticmethod
    def _downsample_stage(in_channels, out_channels):
        """Return conv3x3(stride 2, no padding) -> BatchNorm -> ReLU -> MaxPool(2)."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0, stride=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def forward(self, x):
        features = x
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)

        # Collapse (channels, height, width) into one vector per sample.
        flat = features.flatten(start_dim=1)
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)

# Instantiate the Experiment 2 network and move its parameters to the selected device.
model = Cnn()
model = model.to(device)

In [None]:
# Train the Experiment 2 model on the loaders built in section 1.
TRAIN_MODEL(
    model,
    training_dataloader,
    validation_dataloader,
)

In [7]:
# Print a per-layer table of output shapes and parameter counts for a single 3x224x224 input.
# The trailing semicolon presumably keeps the notebook from echoing the returned
# summary object a second time (verbose=1 already prints it) - keep it.
torchinfo.summary(model, input_size= (1,3, 224, 224), verbose=1);

Layer (type:depth-idx)                   Output Shape              Param #
Cnn                                      [1, 2]                    --
├─Sequential: 1-1                        [1, 16, 55, 55]           --
│    └─Conv2d: 2-1                       [1, 16, 111, 111]         448
│    └─BatchNorm2d: 2-2                  [1, 16, 111, 111]         32
│    └─ReLU: 2-3                         [1, 16, 111, 111]         --
│    └─MaxPool2d: 2-4                    [1, 16, 55, 55]           --
├─Sequential: 1-2                        [1, 32, 13, 13]           --
│    └─Conv2d: 2-5                       [1, 32, 27, 27]           4,640
│    └─BatchNorm2d: 2-6                  [1, 32, 27, 27]           64
│    └─ReLU: 2-7                         [1, 32, 27, 27]           --
│    └─MaxPool2d: 2-8                    [1, 32, 13, 13]           --
├─Sequential: 1-3                        [1, 64, 3, 3]             --
│    └─Conv2d: 2-9                       [1, 64, 6, 6]             18,496
│    └─

## Image input & output size after Convolution
$$
\begin{align*}
img &= (H,W,C)\\
kernel &= (k,k,C)\\
output &= (reduced, reduced, 1)\\
\\
height_{output} &= \left\lfloor \frac{height_{input} + 2 \cdot padding - kernel}{stride} \right\rfloor + 1 \tag{1}\\
\end{align*}
$$

## Receptive Field
$$
\begin{align*}
rf^{global} &= rf_{input} + (kernel - 1) \cdot s_{accum}\\
\\
s_{accum} &\leftarrow s_{accum} \cdot stride \quad \text{(updated after each layer, starting from } s_{accum} = 1 \text{)}\\
\\
depth &= \frac {width} {(kernel -1) \cdot s \cdot s_{accum}}
\end{align*}
$$

### Padding required for same output size
Stride = 1, padding for same height
$$
\begin{align*}
s &= 1 \\
padding &= \frac{( kernel - 1 )}{2}\\
\Delta rf &= kernel - 1\\
depth &= \frac{width}{kernel - 1}\\
\end{align*}
$$


## Going back to 28*28 input size for architecture experimentation

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Plain 8-convolution classifier for 1-channel 28x28 images with 10 classes.

    Trailing comments give "spatial size in -> out | receptive field (RF)" for
    a 28x28 input. The RF grows by (kernel - 1) x the accumulated stride of the
    preceding layers.

    forward() returns log-probabilities of shape (N, 10).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv0 = nn.Conv2d(1, 32, 3, padding=1)      # 28 -> 28 | 3

        self.conv1 = nn.Conv2d(32, 64, 3, padding=1)     # 28 -> 28 | 5
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)    # 28 -> 28 | 7
        self.pool1 = nn.MaxPool2d(2, 2)                  # 28 -> 14 | 8

        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)   # 14 -> 14 | 12
        self.conv4 = nn.Conv2d(256, 512, 3, padding=1)   # 14 -> 14 | 16
        self.pool2 = nn.MaxPool2d(2, 2)                  # 14 -> 7  | 18

        self.conv5 = nn.Conv2d(512, 1024, 3)             # 7 -> 5   | 26
        self.conv6 = nn.Conv2d(1024, 1024, 3)            # 5 -> 3   | 34

        self.conv7 = nn.Conv2d(1024, 10, 3)              # 3 -> 1   | 42

    def forward(self, x):
        # conv0 must run first: it lifts the 1-channel input to the 32
        # channels that conv1 expects.
        x = F.relu(self.conv0(x))
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        # No ReLU on the last layer: its outputs are the class logits.
        x = self.conv7(x)
        x = x.view(-1, 10)  # (N, 10, 1, 1) -> (N, 10)
        return F.log_softmax(x, dim=-1)

In [None]:
class Model_1(nn.Module):
    """Plain 7-convolution classifier for 1-channel 28x28 images with 10 classes.

    Per-layer notes give kernel size (k), receptive field (RF), spatial size
    for a 28x28 input, channel change, and the number of convolution weights
    (each conv additionally has one bias per output channel). The RF grows by
    (k - 1) x the accumulated stride of the preceding layers.

    forward() returns log-probabilities of shape (N, 10).
    """

    def __init__(self):
        super().__init__()

        # k:3, RF: 3 | Output: 28 -> 28 | Channels: (1 -> 32)
        # Layer 1: Weights = 32 * (1 * 3*3)
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)

        # k:3, RF: 5 | Output: 28 -> 28 | Channels: (32 -> 64)
        # Layer 2: Weights = 64 * (32 * 3*3)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)

        # k:2, RF: 6 | Output: 28 -> 14 | Channels: (64 -> 64)
        # Layer 3: Weights = 0
        self.pool1 = nn.MaxPool2d(2, 2)

        # k:3, RF: 10 | Output: 14 -> 14 | Channels: (64 -> 128)
        # Layer 4: Weights = 128 * (64 * 3*3)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)

        # k:3, RF: 14 | Output: 14 -> 14 | Channels: (128 -> 256)
        # Layer 5: Weights = 256 * (128 * 3*3)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)

        # k:2, RF: 16 | Output: 14 -> 7 | Channels: (256 -> 256)
        # Layer 6: Weights = 0
        self.pool2 = nn.MaxPool2d(2, 2)

        # k:3, RF: 24 | Output: 7 -> 5 | Channels: (256 -> 512)
        # Layer 7: Weights = 512 * (256 * 3*3)
        self.conv5 = nn.Conv2d(256, 512, 3)

        # k:3, RF: 32 | Output: 5 -> 3 | Channels: (512 -> 1024)
        # Layer 8: Weights = 1024 * (512 * 3*3)
        self.conv6 = nn.Conv2d(512, 1024, 3)

        # k:3, RF: 40 | Output: 3 -> 1 | Channels: (1024 -> 10)
        # Layer 9: Weights = 10 * (1024 * 3*3)
        self.conv7 = nn.Conv2d(1024, 10, 3)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        # No ReLU on the last layer: its outputs are the class logits.
        x = self.conv7(x)

        # (N, 10, 1, 1) -> (N, 10); -1 lets view() infer the batch dimension.
        x = x.view(-1, 10)

        # dim=1 is the class dimension (same as dim=-1 for this 2-D tensor).
        # Plain probabilities, if ever needed, are output.exp().
        return F.log_softmax(x, dim=1)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Model_2(nn.Module):
    """Block/transition CNN for 1-channel 28x28 images with 10 classes.

    Layout: conv0 (1 -> 32 channels), then three repetitions of
    [block: 32 -> 64 -> 128 channels via two padded 3x3 convs with ReLU]
    followed by [transition: 2x2 max-pool, then a 1x1 conv squeezing
    128 -> 32 channels], and finally an unpadded 3x3 conv to 10 channels.
    For a 28x28 input the spatial size goes 28 -> 14 -> 7 -> 3 -> 1.

    forward() returns log-probabilities of shape (N, 10).
    """

    def __init__(self):
        super().__init__()

        self.conv0 = nn.Conv2d(1, 32, 3, padding=1)    # 28 -> 28 | 3

        # Sub-modules are created in forward order.
        self.block1 = self._expand_block()
        self.transition1 = self._squeeze_transition()

        self.block2 = self._expand_block()
        self.transition2 = self._squeeze_transition()

        self.block3 = self._expand_block()
        self.transition3 = self._squeeze_transition()

        # Block 4: classifier convolution, 3x3 feature map -> 1x1 with 10 channels.
        self.block4 = nn.ModuleDict({
            "conv7": nn.Conv2d(32, 10, 3),
        })

    @staticmethod
    def _expand_block():
        """Return two padded 3x3 convs (32 -> 64 -> 128 channels), each followed by ReLU."""
        return nn.ModuleDict({
            "conv1": nn.Conv2d(32, 64, 3, padding=1),
            "relu1": nn.ReLU(),
            "conv2": nn.Conv2d(64, 128, 3, padding=1),
            "relu2": nn.ReLU(),
        })

    @staticmethod
    def _squeeze_transition():
        """Return 2x2 max-pool followed by a 1x1 conv squeezing 128 -> 32 channels."""
        # Open question from the original notes: max-pool before or after the 1x1 conv?
        return nn.Sequential(
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 32, 1),  # Squeeze
        )

    def forward(self, x):
        x = self.conv0(x)

        stages = (
            (self.block1, self.transition1),
            (self.block2, self.transition2),
            (self.block3, self.transition3),
        )
        for block, transition in stages:
            x = block["relu1"](block["conv1"](x))
            x = block["relu2"](block["conv2"](x))
            x = transition(x)

        x = self.block4["conv7"](x)

        # (N, 10, 1, 1) -> (N, 10); -1 lets view() infer the batch dimension.
        x = x.view(-1, 10)

        # Plain probabilities, if ever needed, are output.exp().
        return F.log_softmax(x, dim=1)

if __name__ == "__main__":
    # from torchsummary import summary

    model_2 = Model_2()

    # List every entry of the state dict (parameter names such as "block1.conv1.weight").
    for parameter_key in model_2.state_dict().keys():
        print(parameter_key)

    # Print the model itself followed by each nested sub-module.
    for submodule in model_2.modules():
        print(submodule)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

import torch.nn as nn
import torch.nn.functional as F

class Model_3(nn.Module):
    """Narrow block/transition CNN for 1-channel 28x28 images with 10 classes.

    Layout: conv0 (1 -> 10 channels), then three repetitions of
    [block: 10 -> 10 -> 20 channels via two padded 3x3 convs with ReLU]
    followed by [transition: 2x2 max-pool, then a 1x1 conv squeezing
    20 -> 10 channels], and finally an unpadded 3x3 conv to 10 channels.
    For a 28x28 input the spatial size goes 28 -> 14 -> 7 -> 3 -> 1.

    forward() returns log-probabilities of shape (N, 10).
    """

    def __init__(self):
        super().__init__()

        self.conv0 = nn.Conv2d(1, 10, 3, padding=1)    # 28 -> 28 | 3

        # Sub-modules are created in forward order.
        self.block1 = self._expand_block()
        self.transition1 = self._squeeze_transition()

        self.block2 = self._expand_block()
        self.transition2 = self._squeeze_transition()

        self.block3 = self._expand_block()
        self.transition3 = self._squeeze_transition()

        # Block 4: classifier convolution, 3x3 feature map -> 1x1 with 10 channels.
        self.block4 = nn.ModuleDict({
            "conv7": nn.Conv2d(10, 10, 3),
        })

    @staticmethod
    def _expand_block():
        """Return two padded 3x3 convs (10 -> 10 -> 20 channels), each followed by ReLU."""
        return nn.ModuleDict({
            "conv1": nn.Conv2d(10, 10, 3, padding=1),
            "relu1": nn.ReLU(),
            "conv2": nn.Conv2d(10, 20, 3, padding=1),
            "relu2": nn.ReLU(),
        })

    @staticmethod
    def _squeeze_transition():
        """Return 2x2 max-pool followed by a 1x1 conv squeezing 20 -> 10 channels."""
        # Open question from the original notes: max-pool before or after the 1x1 conv?
        return nn.Sequential(
            nn.MaxPool2d(2, 2),
            nn.Conv2d(20, 10, 1),  # Squeeze
        )

    def forward(self, x):
        x = self.conv0(x)

        stages = (
            (self.block1, self.transition1),
            (self.block2, self.transition2),
            (self.block3, self.transition3),
        )
        for block, transition in stages:
            x = block["relu1"](block["conv1"](x))
            x = block["relu2"](block["conv2"](x))
            x = transition(x)

        x = self.block4["conv7"](x)

        # (N, 10, 1, 1) -> (N, 10); -1 lets view() infer the batch dimension.
        x = x.view(-1, 10)

        # Plain probabilities, if ever needed, are output.exp().
        return F.log_softmax(x, dim=1)

if __name__ == "__main__":
    from torchsummary import summary

    model_3 = Model_3()
    # Layer-by-layer summary for a single 1x28x28 input.
    summary(model_3, input_size=(1, 28, 28))

    # List every entry of the state dict (parameter names such as "block1.conv1.weight").
    for parameter_key in model_3.state_dict().keys():
        print(parameter_key)

    # for module in model_3.modules():
        # print(module)