# Part II: Investigating the Vanishing Gradient Problem [20 pts]
Experimentally demonstrate the vanishing gradient problem in deep CNNs and understand how ResNet's architecture mitigates it. You will also explore other key CNN concepts through additional experiments.

In [None]:
# Mount Google Drive at /content/gdrive so files stored on the Drive can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#!cp "/content/gdrive/MyDrive/A1_Dataset/cnn_dataset.zip" /content/
#!unzip -q /content/cnn_dataset.zip -d /content/

unzip:  cannot find or open /content/cnn_dataset.zip, /content/cnn_dataset.zip.zip or /content/cnn_dataset.zip.ZIP.


In [None]:
# List /content to see which files and folders exist in the runtime.
!ls /content

gdrive	sample_data


In [None]:
#!mkdir -p /content/cnn_dataset
#!mv /content/dogs /content/cnn_dataset/
#!mv /content/food /content/cnn_dataset/
#!mv /content/vehicles /content/cnn_dataset/

In [11]:
from torchvision.datasets import ImageFolder
from torchvision import transforms
from collections import Counter
import torch
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import matplotlib.pyplot as plt
import numpy as np

# Root folder of the image dataset on the mounted Drive.
dataPath = '/content/gdrive/MyDrive/Colab Notebooks/CSE 676 A1 Shivam Abhishek/cnn_dataset'

# Preprocessing applied to every image as it is loaded.
transform = transforms.Compose(
    [
        # Bring every image to a fixed 64x64 resolution.
        transforms.Resize(size=(64, 64)),
        # Convert to a float tensor with pixel values scaled from [0-255] to [0-1].
        transforms.ToTensor(),
        # Per-channel standardisation; VGG trains better on mean/std-normalised inputs.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

#dataset = ImageFolder("/content/cnn_dataset", transform=transform)
dataset = ImageFolder(root=dataPath, transform=transform)

# Quick summary: number of images, class names, and images per class index.
print("Total images:", len(dataset))
print("Classes:", dataset.classes)
print(Counter(dataset.targets))

KeyboardInterrupt: 

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Based on https://www.kaggle.com/code/subhajeetdas/1-pytorch-cuda-check
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Shuffled loader over the complete (unsplit) dataset, 64 images per batch.
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

print(len(dataset.classes))
print(device)


In [None]:
from torch.utils.data import DataLoader, random_split

# 70 / 15 / 15 split into training, validation and test subsets.
dataset_size = len(dataset)
train_size = int(0.70 * dataset_size)
val_size = int(0.15 * dataset_size)
test_size = dataset_size - train_size - val_size  # To ensure it sums exactly

# Seed the split explicitly: without a generator, random_split draws from the
# global RNG, so every kernel restart would put different images in the test
# set and results would not be comparable between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

train_set, val_set, test_set = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_set,batch_size=32,shuffle=True)
val_loader   = DataLoader(val_set,batch_size=32,shuffle=False)
test_loader  = DataLoader(test_set,batch_size=32,shuffle=False)

In [12]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


## Step 1: Create a deeper version of your VGG-16 network (VGG-Deep)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt

In [None]:
### ADD YOUR CODE HERE ###
#This is taken from Part 1's VGG:
import torch
import torch.nn as nn
import torch.optim as optim

class vggDeep(nn.Module):
    """VGG-style CNN with eleven 3x3 convolutions (the 'VGG-Deep' network).

    Layout: two convolutions + max-pool, two convolutions + max-pool, seven
    convolutions + max-pool, global average pooling, then a three-layer fully
    connected classifier. One shared ReLU module is used after every hidden
    layer; the final linear layer returns raw logits.

    Args:
        numClasses: number of output logits produced by the last linear layer.
    """

    def __init__(self, numClasses):
        super().__init__()

        # Stage 1: 3 -> 64 channels, then halve the spatial size.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 64 -> 128 channels, then halve the spatial size.
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 3: seven stacked convolutions (128 -> 256 -> 512) before one pool.
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        # conv8-conv11 are the four layers added on top of the Part 1 network.
        self.conv8 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.conv9 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.conv11 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head: collapse each feature map to 1x1, then 512 -> 512 -> 256 -> numClasses.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc1 = nn.Linear(in_features=512, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=numClasses)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape (batch, numClasses) for a batch of 3-channel images."""
        activation = self.relu
        stages = (
            ((self.conv1, self.conv2), self.pool1),
            ((self.conv3, self.conv4), self.pool2),
            (
                (
                    self.conv5, self.conv6, self.conv7,
                    self.conv8, self.conv9, self.conv10, self.conv11,
                ),
                self.pool3,
            ),
        )

        features = x
        for conv_layers, pool in stages:
            # Every convolution is followed by ReLU; pooling closes the stage.
            for conv in conv_layers:
                features = activation(conv(features))
            features = pool(features)

        # Global average pooling leaves a 512-dimensional vector per image.
        features = torch.flatten(self.avgpool(features), 1)
        for hidden in (self.fc1, self.fc2):
            features = activation(hidden(features))
        return self.fc3(features)


## Step 2: Training VGG-Deep

In [None]:
### ADD YOUR CODE HERE ###
import torch.optim as optim
#Based on Abhishek's code from Part 1
def weights_init_he(m):
    """He (Kaiming) normal initialisation for convolutional and linear layers.

    Intended for ``model.apply``: any module that is not an ``nn.Conv2d`` or
    ``nn.Linear`` is left untouched. Weights are drawn with
    ``kaiming_normal_`` for ReLU; a bias, when the layer has one, is zeroed.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0)

# Three-class VGG-Deep, moved to the selected device and then He-initialised.
model = vggDeep(numClasses=3).to(device).apply(weights_init_he)

# Cross-entropy objective, plain SGD, and a step schedule that halves the
# learning rate after every 5 scheduler steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

In [13]:
#This is for the gradient tracking:
# Hooks registered by an earlier run of this cell stay attached to the layers.
# They look up `gradient_norms` at call time, so without this cleanup every
# re-run would add another hook per layer and each norm would be logged twice.
for _stale_handle in globals().get('gradient_hook_handles', []):
    _stale_handle.remove()

# One (initially empty) list of recorded norms per convolutional layer,
# keyed by the layer's name inside the model.
gradient_norms = {
    name: []
    for name, layer in model.named_modules()
    if isinstance(layer, nn.Conv2d)
}

def gradient_hook(name):
    """Build a backward hook that logs the gradient norm for layer `name`.

    The returned hook appends the L2 norm of the gradient with respect to the
    layer's output (``grad_output[0]``) to ``gradient_norms[name]`` on every
    backward pass.
    """
    def hook(module, grad_input, grad_output):
        gradient_norms[name].append(grad_output[0].norm(p=2).item())
    return hook

# Keep the returned handles so the hooks can be removed again (see above).
gradient_hook_handles = [
    layer.register_full_backward_hook(gradient_hook(name))
    for name, layer in model.named_modules()
    if isinstance(layer, nn.Conv2d)
]

NameError: name 'model_deep' is not defined

In [None]:
def train_model(model,epochs=15):
    """Train `model` for `epochs` epochs and return the mean loss of each epoch.

    Uses the notebook-level `train_loader`, `device`, `optimizer`, `criterion`
    and `scheduler`; the optimizer must have been created from this model's
    parameters for the updates to reach it. The scheduler is stepped once per
    epoch.

    Args:
        model: network to train; switched to training mode at every epoch.
        epochs: number of passes over `train_loader`.

    Returns:
        list[float]: average batch loss per epoch, in epoch order, so the
        training curve can be plotted afterwards.
    """
    epoch_losses = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for x,y in train_loader:
            x,y = x.to(device),y.to(device)

            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out,y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        scheduler.step()

        # Mean of the per-batch losses seen in this epoch.
        epoch_loss = running_loss/len(train_loader)
        epoch_losses.append(epoch_loss)

        print("Epoch: ", (epoch+1))
        print("Loss: ", epoch_loss)
    return epoch_losses

# Keep the loss history for the training-curve comparison in Step 4.
vgg_deep_losses = train_model(model)

<span style='color:green'>### YOUR ANSWER ###</span>

## Step 3: Gradient analysis

- Track the average L2 norm of the gradients in each convolutional layer using PyTorch hooks.

In [None]:
### ADD YOUR CODE HERE ###
# One curve per convolutional layer. Each point is the norm logged by the
# backward hook during one backward pass, i.e. one training batch.
fig, ax = plt.subplots()

for name,norms in gradient_norms.items():
    ax.plot(norms,label=name)

# Label the axes so the figure can be read on its own.
ax.set_xlabel("Backward pass (training batch)")
ax.set_ylabel("L2 norm of gradient at layer output")
ax.set_title("Gradient Norms")
ax.legend()
plt.show()

- Create a plot showing the average gradient norm for each convolutional layer over time.

In [None]:
### ADD YOUR CODE HERE ###

- Create a separate plot showing the gradient norms for a subset of layers.

In [None]:
### ADD YOUR CODE HERE ###

<span style='color:green'>### YOUR ANSWER ###</span>

## Step 4: Comparison with VGG-16 and ResNet-18

- Compare the training curves (loss and accuracy vs. epoch) of VGG-Deep, VGG-16, and ResNet-18.

In [None]:
### ADD YOUR CODE HERE ###

- Discuss how ResNet's residual connections impact the gradient flow compared to VGG-Deep.

<span style='color:green'>### YOUR ANSWER ###</span>

## Step 5: Investigate and analyze more setups

- Select any THREE experiments to investigate and analyze.

### Experiment 1

In [None]:
### ADD YOUR CODE HERE ###

<span style='color:green'>### YOUR ANSWER ###</span>

### Experiment 2

In [None]:
### ADD YOUR CODE HERE ###

<span style='color:green'>### YOUR ANSWER ###</span>

### Experiment 3

In [None]:
### ADD YOUR CODE HERE ###

<span style='color:green'>### YOUR ANSWER ###</span>

## Step 6: Analysis and discussion

a. Analyze your gradient norm plots. Do they demonstrate the vanishing gradient problem? Explain how the gradient norm changes as you move deeper into VGG-Deep. Be specific and quantitative (e.g., "The gradient norm of layer 2 is X times larger than the gradient norm of layer 10").

<span style='color:green'>### YOUR ANSWER ###</span>

b. Explain why the vanishing gradient problem occurs in deep networks. Relate this to the backpropagation algorithm and the chain rule. Discuss how the repeated multiplication of small gradients can lead to extremely small values in earlier layers.

<span style='color:green'>### YOUR ANSWER ###</span>

c. Explain how ResNet's architecture (residual connections) helps alleviate the vanishing gradient problem. Explain how the identity mapping allows gradients to flow more easily through the network.

<span style='color:green'>### YOUR ANSWER ###</span>

d. Discuss the theoretical impact of batch normalization on the vanishing/exploding gradient problem. Explain how it helps stabilize and accelerate training.

<span style='color:green'>### YOUR ANSWER ###</span>

e. Summarize the key findings from your three chosen investigations.

<span style='color:green'>### YOUR ANSWER ###</span>

f. References. Include details on all the resources used to complete this part.

<span style='color:green'>### YOUR ANSWER ###</span>