<a href="https://colab.research.google.com/github/abeebyekeen/DLforBeginners/blob/main/overfitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We will delve into one of the most crucial aspects of training neural networks: understanding and addressing overfitting.

# **What is Overfitting?**
**Overfitting** is a common problem in training neural networks where a model performs well on the training data but poorly on unseen data, because it has fitted the noise and quirks of the training set rather than the underlying pattern.

In this section, we will see an example of the overfitting problem and how to address it.

In [None]:
# box 2.1
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import copy

In [None]:
# box 2.2
# 1. Generate Dataset
# Two interleaving half-moons. The fixed random_state values make both the
# generated samples and the train/validation split reproducible.
X, y = make_moons(n_samples=300, noise=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

# Convert every split to a tensor. The labels are cast to float because the
# later cells compare them against sigmoid outputs with nn.BCELoss.
X_train, X_val, y_train, y_val = map(torch.tensor, (X_train, X_val, y_train, y_val))
y_train, y_val = y_train.float(), y_val.float()

# Plot the scatter of the data (the full NumPy arrays, before splitting)
import numpy as np  # not needed in this cell any more; kept so `np` stays defined for later cells

# One scatter call per class so each class gets its own colour and legend entry.
for class_id, colour, legend_label in ((0, 'blue', 'Class 0'), (1, 'red', 'Class 1')):
    class_points = X[y == class_id]
    plt.scatter(class_points[:, 0], class_points[:, 1], color=colour, label=legend_label)

plt.title('Two-moons dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# box 2.3
# 2. Create a Deep Neural Network
class DeepNet(nn.Module):
    """Small fully connected binary classifier for 2-feature inputs.

    Layer stack: Linear(2, 64) -> ReLU -> Dropout -> Linear(64, 64) -> ReLU
    -> Dropout -> Linear(64, 1) -> Sigmoid. The final sigmoid means each input
    row is mapped to a single value between 0 and 1.

    Args:
        dropout_rate: Probability handed to both nn.Dropout layers. With the
            default of 0.0 the dropout layers have no effect.
    """

    def __init__(self, dropout_rate=0.0):
        super().__init__()
        layers = [
            nn.Linear(2, 64),
            nn.ReLU(),  # comment out ('ctrl'+'/') to remove this non-linearity
            nn.Dropout(dropout_rate),  # inactive when dropout_rate = 0
            nn.Linear(64, 64),
            nn.ReLU(),  # comment out ('ctrl'+'/') to remove this non-linearity
            nn.Dropout(dropout_rate),  # inactive when dropout_rate = 0
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        # Kept under the attribute name `network` so state-dict keys stay
        # `network.<index>.weight` / `network.<index>.bias`.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        return self.network(x)
# Summarize the model and its output
from torchsummary import summary

model = DeepNet()
# torchsummary defaults to device="cuda" and, when a GPU is available, builds
# its dummy input as a CUDA tensor. The model here is on the CPU, so request a
# CPU input explicitly to avoid a device-mismatch error on GPU runtimes.
summary(model, (1, 2), device="cpu")

In [None]:
# box 2.4
# Fix the seed so the initial weights (and therefore the curves below) are
# the same on every Restart & Run All.
torch.manual_seed(42)
model_without_reg = DeepNet()

# Training Settings
optimizer = optim.Adam(model_without_reg.parameters(), lr=0.01)
loss_fn = nn.BCELoss()

# 3. Train the Model (Without Regularization) and Observe Overfitting
num_epochs = 500
train_losses, val_losses = [], []
saved_models = []

# The inputs never change during training, so cast them to float32 once
# instead of on every epoch.
X_train_f32, X_val_f32 = X_train.float(), X_val.float()

for epoch in range(num_epochs):
    # Train: one full-batch gradient step per epoch
    model_without_reg.train()
    optimizer.zero_grad()
    predictions = model_without_reg(X_train_f32).squeeze()
    loss = loss_fn(predictions, y_train)
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    # Save the weights at epoch 0, 100, 200, 300, 400.
    # deepcopy is required: state_dict() tensors share storage with the live
    # parameters and would otherwise keep changing as training continues.
    if epoch % 100 == 0:
        model_state = copy.deepcopy(model_without_reg.state_dict())
        saved_models.append(model_state)

    # Validate (eval mode, no gradient tracking)
    model_without_reg.eval()
    with torch.no_grad():
        val_predictions = model_without_reg(X_val_f32).squeeze()
        val_loss = loss_fn(val_predictions, y_val)
        val_losses.append(val_loss.item())

plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('BCE loss')
plt.legend()
plt.title("Without Regularization")
plt.show()

Let's plot the decision boundary to understand the overfitting problem

In [None]:
# box 2.5
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons


# Function to plot the decision boundary
def plot_decision_boundary(model, X, y):
    """Draw the model's 0.5-threshold decision regions with the points on top.

    The model is evaluated on a regular grid (step 0.02) covering the range of
    the two feature columns of ``X`` plus a 0.5 margin on each side. The
    outputs are shown as two filled regions split at 0.5, and the samples are
    overlaid coloured by class. The figure is shown with ``plt.show()`` before
    returning.

    Args:
        model: Callable network; it is put in eval mode and called on a
            float32 tensor with two columns.
        X: Samples with two feature columns, indexable as ``X[:, 0]`` and
            by a boolean mask built from ``y``.
        y: Class labels (0 or 1) aligned with the rows of ``X``.
    """
    margin, step = 0.5, 0.02
    f1_low, f1_high = X[:, 0].min() - margin, X[:, 0].max() + margin
    f2_low, f2_high = X[:, 1].min() - margin, X[:, 1].max() + margin

    f1_axis = np.arange(f1_low, f1_high, step)
    f2_axis = np.arange(f2_low, f2_high, step)
    grid_f1, grid_f2 = np.meshgrid(f1_axis, f2_axis)

    # Flatten the grid into an (n_points, 2) array so it can be fed as a batch
    grid_points = np.column_stack((grid_f1.ravel(), grid_f2.ravel()))

    # Predict the function value for the entire grid
    model.eval()
    with torch.no_grad():
        grid_batch = torch.tensor(grid_points).float()
        grid_scores = model(grid_batch).squeeze().reshape(grid_f1.shape)

    # Plot the contour with only two regions (below / above 0.5)
    score_map = grid_scores.detach().numpy()
    plt.contourf(grid_f1, grid_f2, score_map, levels=[0, 0.5, 1], cmap=plt.cm.Spectral, alpha=0.8)

    for class_id, colour, legend_label in ((0, 'blue', 'Class 0'), (1, 'red', 'Class 1')):
        class_points = X[y == class_id]
        plt.scatter(class_points[:, 0], class_points[:, 1], color=colour, label=legend_label)

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.grid(True)
    plt.show()

# Draw the unregularized model's boundary over each split in turn. The title
# is set first because plot_decision_boundary() finishes with plt.show().
for panel_title, features, labels in (
    ('Decision Boundary - Training', X_train, y_train),
    ('Decision Boundary - Validation', X_val, y_val),
):
    plt.title(panel_title)
    plot_decision_boundary(model_without_reg, features, labels)

Let's observe how the decision boundary and the magnitude of the weights change over the epochs.

In [None]:
# box 2.6
for snapshot_idx, model_state in enumerate(saved_models):
    # Rebuild a fresh network and restore the weights captured during training
    model_to_load = DeepNet()
    model_to_load.load_state_dict(model_state)

    # Largest absolute value over all trainable parameters of this snapshot
    trainable_params = (p for p in model_to_load.parameters() if p.requires_grad)
    max_weight_magnitude = max(p.abs().max().item() for p in trainable_params)

    # Snapshots were taken every 100 epochs, hence the index * 100 label
    plt.title(f'Decision Boundary - Validation - epoch  {snapshot_idx*100} - max weight magnitude {max_weight_magnitude}')
    plot_decision_boundary(model_to_load, X_val, y_val)

# **Combat overfitting**
Here are a few methods used to combat overfitting:

* Data Augmentation: Increasing the size and diversity of the training dataset

* **Regularization**: Regularization techniques add a penalty term to the loss function to discourage complex models.

* **Dropout**: A technique used in neural networks where randomly selected neurons are ignored during training.

* Early Stopping: Involves stopping the training process before the model fully fits the training data.

* Cross-Validation: Involves dividing the dataset into subsets and using them in rotating combinations to train and validate the model.

Today, we will use regularization and dropout to address the overfitting problem.




In [None]:
# box 2.7
# 4. Apply Regularization (L2 + Dropout) and Observe Reduced Overfitting
# Fix the seed so the initial weights are the same on every Restart & Run All.
torch.manual_seed(42)

# NOTE: dropout_rate=0.0 leaves the Dropout layers inactive, so only weight
# decay regularizes this run. Raise it (e.g. to 0.2) to add dropout as well.
model_with_reg = DeepNet(dropout_rate=0.0)
optimizer = optim.Adam(model_with_reg.parameters(), lr=0.01, weight_decay=0.01)  # L2 regularization with weight decay
train_losses_reg, val_losses_reg = [], []

# The inputs never change during training, so cast them to float32 once
# instead of on every epoch.
X_train_f32, X_val_f32 = X_train.float(), X_val.float()

for epoch in range(num_epochs):
    # Train: one full-batch gradient step per epoch
    model_with_reg.train()
    optimizer.zero_grad()
    predictions = model_with_reg(X_train_f32).squeeze()
    loss = loss_fn(predictions, y_train)
    loss.backward()
    optimizer.step()

    train_losses_reg.append(loss.item())

    # Validate (eval mode, no gradient tracking)
    model_with_reg.eval()
    with torch.no_grad():
        val_predictions = model_with_reg(X_val_f32).squeeze()
        val_loss = loss_fn(val_predictions, y_val)
        val_losses_reg.append(val_loss.item())

plt.plot(train_losses_reg, label='Training Loss with Regularization')
plt.plot(val_losses_reg, label='Validation Loss with Regularization')
plt.xlabel('Epoch')
plt.ylabel('BCE loss')
plt.legend()
plt.title("With Regularization")
plt.show()


Now we will plot the decision boundaries to observe how regularization helps to reduce overfitting.

In [None]:

# box 2.8

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons


# Function to plot the decision boundary
def plot_decision_boundary(model, X, y):
    """Draw the model's 0.5-threshold decision regions with the points on top.

    The model is evaluated on a regular grid (step 0.02) covering the range of
    the two feature columns of ``X`` plus a 0.5 margin on each side. The
    outputs are shown as two filled regions split at 0.5, and the samples are
    overlaid coloured by class. The figure is shown with ``plt.show()`` before
    returning.

    Args:
        model: Callable network; it is put in eval mode and called on a
            float32 tensor with two columns.
        X: Samples with two feature columns, indexable as ``X[:, 0]`` and
            by a boolean mask built from ``y``.
        y: Class labels (0 or 1) aligned with the rows of ``X``.
    """
    margin, step = 0.5, 0.02
    f1_low, f1_high = X[:, 0].min() - margin, X[:, 0].max() + margin
    f2_low, f2_high = X[:, 1].min() - margin, X[:, 1].max() + margin

    f1_axis = np.arange(f1_low, f1_high, step)
    f2_axis = np.arange(f2_low, f2_high, step)
    grid_f1, grid_f2 = np.meshgrid(f1_axis, f2_axis)

    # Flatten the grid into an (n_points, 2) array so it can be fed as a batch
    grid_points = np.column_stack((grid_f1.ravel(), grid_f2.ravel()))

    # Predict the function value for the entire grid
    model.eval()
    with torch.no_grad():
        grid_batch = torch.tensor(grid_points).float()
        grid_scores = model(grid_batch).squeeze().reshape(grid_f1.shape)

    # Plot the contour with only two regions (below / above 0.5)
    score_map = grid_scores.detach().numpy()
    plt.contourf(grid_f1, grid_f2, score_map, levels=[0, 0.5, 1], cmap=plt.cm.Spectral, alpha=0.8)

    for class_id, colour, legend_label in ((0, 'blue', 'Class 0'), (1, 'red', 'Class 1')):
        class_points = X[y == class_id]
        plt.scatter(class_points[:, 0], class_points[:, 1], color=colour, label=legend_label)

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.grid(True)
    plt.show()

# One figure per (model, split) pair: the unregularized model first, then the
# regularized one, each on the training and validation data.
comparison_panels = (
    ('Decision Boundary Without Regularization - Training', model_without_reg, X_train, y_train),
    ('Decision Boundary Without Regularization - Validation', model_without_reg, X_val, y_val),
    ('Decision Boundary With Regularization - Training', model_with_reg, X_train, y_train),
    ('Decision Boundary With Regularization - Validation', model_with_reg, X_val, y_val),
)

for panel_title, net, features, labels in comparison_panels:
    # Title goes first because plot_decision_boundary() finishes with plt.show()
    plt.title(panel_title)
    plot_decision_boundary(net, features, labels)


In [None]:
# box 2.9
def _max_abs_weight(net):
    """Return the largest absolute value among ``net``'s trainable parameters."""
    return max(p.abs().max().item() for p in net.parameters() if p.requires_grad)


# Model trained without regularization: weight scale and final-epoch losses
max_weight_magnitude = _max_abs_weight(model_without_reg)
print('maximum magnitude of parameters (weights) in model without regularization is ', max_weight_magnitude)
print('training loss without regularization: ', train_losses[-1])
print('testing/validation loss without regularization: ', val_losses[-1])

# Model trained with regularization: same three figures for comparison
max_weight_magnitude = _max_abs_weight(model_with_reg)
print('maximum magnitude of parameters (weights) in model with regularization is ', max_weight_magnitude)
print('training loss with regularization: ', train_losses_reg[-1])
print('testing/validation loss with regularization: ', val_losses_reg[-1])
