In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import plotly.graph_objects as go

# Simple Neural Network for demonstration
class SimpleNN(nn.Module):
    """Two-layer perceptron: Linear(2, 10) -> ReLU -> Linear(10, 1)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 10)
        self.fc2 = nn.Linear(10, 1)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Loss function
criterion = nn.MSELoss()

# Generate some random data
torch.manual_seed(0)
X = torch.randn(100, 2)
y = torch.randn(100, 1)

# Initialize the model, optimizer
model = SimpleNN()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Print initial loss before training (reporting only, so no autograd graph is needed)
with torch.no_grad():
    outputs = model(X)
    initial_loss = criterion(outputs, y)
print(f"Initial Loss: {initial_loss.item()}")

# Train the model with full-batch gradient descent, printing the loss each epoch
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

    # `loss` was evaluated before optimizer.step(), so this is the loss of the
    # weights going INTO the step; that is why "Epoch 1" equals the initial loss.
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Snapshot the trained parameters. The detached clones serve two purposes:
# they form the origin of the 2-D slice, and they let us restore the model
# once the grid has been evaluated.
params = list(model.parameters())
trained_params = [p.detach().clone() for p in params]
param_vector = torch.cat([p.flatten() for p in trained_params])

# Define random directions in parameter space.
# The draws come from torch, so they are seeded with a torch generator
# (np.random.seed has no effect on torch's random number generator).
direction_rng = torch.Generator().manual_seed(0)
direction_1 = torch.randn(param_vector.shape, generator=direction_rng)
direction_2 = torch.randn(param_vector.shape, generator=direction_rng)

# Normalize directions to unit length so alpha/beta are distances in parameter space
direction_1 /= torch.norm(direction_1)
direction_2 /= torch.norm(direction_2)

# Grid range
alpha_range = np.linspace(-2, 2, 50)
beta_range = np.linspace(-2, 2, 50)

# Compute loss surface; loss_surface[i, j] is the loss at (alpha_range[i], beta_range[j])
loss_surface = np.zeros((len(alpha_range), len(beta_range)))

# Evaluation only: no_grad avoids building an autograd graph at every grid point
with torch.no_grad():
    for i, alpha in enumerate(alpha_range):
        for j, beta in enumerate(beta_range):
            # Perturb the model parameters along the two directions
            perturbed_params = param_vector + float(alpha) * direction_1 + float(beta) * direction_2

            # Assign the perturbed parameters back to the model, slicing the
            # flat vector in the same order it was concatenated
            index = 0
            for param in model.parameters():
                numel = param.numel()
                param.copy_(perturbed_params[index:index + numel].view(param.size()))
                index += numel

            # Calculate the loss with perturbed parameters
            loss_surface[i, j] = criterion(model(X), y).item()

    # Put the trained weights back; otherwise the model would be left at the
    # last grid point (alpha=2, beta=2).
    for param, trained in zip(params, trained_params):
        param.copy_(trained)

# Plotting the interactive loss surface using Plotly.
# indexing='ij' makes the grids alpha-major like loss_surface; the default 'xy'
# layout would transpose the two axes relative to z.
alpha_grid, beta_grid = np.meshgrid(alpha_range, beta_range, indexing='ij')

fig = go.Figure(data=[go.Surface(z=loss_surface, x=alpha_grid, y=beta_grid, colorscale='Viridis')])

# Add labels and title
fig.update_layout(
    title="Interactive Loss Surface Landscape",
    scene=dict(
        xaxis_title='Direction 1',
        yaxis_title='Direction 2',
        zaxis_title='Loss'
    ),
    autosize=False,
    width=800,
    height=800
)

# Show the interactive plot
fig.show()


Initial Loss: 0.9860835075378418
Epoch 1, Loss: 0.9860835075378418
Epoch 2, Loss: 0.9803861975669861
Epoch 3, Loss: 0.9753062725067139
Epoch 4, Loss: 0.9707737565040588
Epoch 5, Loss: 0.9667322635650635
Epoch 6, Loss: 0.9631323218345642
Epoch 7, Loss: 0.9599182605743408
Epoch 8, Loss: 0.9570472240447998
Epoch 9, Loss: 0.954487681388855
Epoch 10, Loss: 0.9521985054016113
Epoch 11, Loss: 0.9501493573188782
Epoch 12, Loss: 0.9483121633529663
Epoch 13, Loss: 0.9466648697853088
Epoch 14, Loss: 0.9451870918273926
Epoch 15, Loss: 0.9438632726669312
Epoch 16, Loss: 0.9426727890968323
Epoch 17, Loss: 0.9416009783744812
Epoch 18, Loss: 0.940634548664093
Epoch 19, Loss: 0.9397618770599365
Epoch 20, Loss: 0.9389727115631104
Epoch 21, Loss: 0.9382578134536743
Epoch 22, Loss: 0.937609076499939
Epoch 23, Loss: 0.9370189905166626
Epoch 24, Loss: 0.9364811182022095
Epoch 25, Loss: 0.9359900951385498
Epoch 26, Loss: 0.9355403780937195
Epoch 27, Loss: 0.9351277351379395
Epoch 28, Loss: 0.9347482323646545

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import plotly.graph_objects as go

# Define a neural network with 5 layers
class FiveLayerNN(nn.Module):
    """Fully connected network 2 -> 64 -> 128 -> 64 -> 32 -> 1 with ReLU between layers."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)

    def forward(self, x):
        """Run the four ReLU-activated hidden layers, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = torch.relu(hidden_layer(x))
        return self.fc5(x)

# Define a neural network with 10 layers
class TenLayerNN(nn.Module):
    """Ten-layer fully connected network from 2 inputs to 1 output, ReLU between layers."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 64)
        self.fc6 = nn.Linear(64, 64)
        self.fc7 = nn.Linear(64, 32)
        self.fc8 = nn.Linear(32, 32)
        self.fc9 = nn.Linear(32, 16)
        self.fc10 = nn.Linear(16, 1)

    def forward(self, x):
        """Run the nine ReLU-activated hidden layers, then the linear output layer."""
        hidden_layers = (
            self.fc1, self.fc2, self.fc3,
            self.fc4, self.fc5, self.fc6,
            self.fc7, self.fc8, self.fc9,
        )
        for hidden_layer in hidden_layers:
            x = torch.relu(hidden_layer(x))
        return self.fc10(x)

# Loss function
criterion = nn.MSELoss()

# Generate some random data
torch.manual_seed(0)
X = torch.randn(100, 2)
y = torch.randn(100, 1)

# Choose model: FiveLayerNN() or TenLayerNN()
# The model is built AFTER torch.manual_seed(0) so that its weight
# initialization is reproducible, in the same seed -> data -> model order
# used for SimpleNN.
model = FiveLayerNN()  # Replace with TenLayerNN() to switch to 10 layers

# Number of Linear layers in the chosen model, used to label the plot
num_layers = sum(isinstance(module, nn.Linear) for module in model.modules())

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Loss before training (kept for inspection; evaluated without an autograd graph)
with torch.no_grad():
    outputs = model(X)
    initial_loss = criterion(outputs, y)

# Train the model with full-batch gradient descent
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

# Snapshot the trained parameters. The detached clones serve two purposes:
# they form the origin of the 2-D slice, and they let us restore the model
# once the grid has been evaluated.
params = list(model.parameters())
trained_params = [p.detach().clone() for p in params]
param_vector = torch.cat([p.flatten() for p in trained_params])

# Define random directions in parameter space.
# The draws come from torch, so they are seeded with a torch generator
# (np.random.seed has no effect on torch's random number generator).
direction_rng = torch.Generator().manual_seed(0)
direction_1 = torch.randn(param_vector.shape, generator=direction_rng)
direction_2 = torch.randn(param_vector.shape, generator=direction_rng)

# Normalize directions to unit length so alpha/beta are distances in parameter space
direction_1 /= torch.norm(direction_1)
direction_2 /= torch.norm(direction_2)

# Grid range
alpha_range = np.linspace(-2, 2, 50)
beta_range = np.linspace(-2, 2, 50)

# Compute loss surface; loss_surface[i, j] is the loss at (alpha_range[i], beta_range[j])
loss_surface = np.zeros((len(alpha_range), len(beta_range)))

# Evaluation only: no_grad avoids building an autograd graph at every grid point
with torch.no_grad():
    for i, alpha in enumerate(alpha_range):
        for j, beta in enumerate(beta_range):
            # Perturb the model parameters along the two directions
            perturbed_params = param_vector + float(alpha) * direction_1 + float(beta) * direction_2

            # Assign the perturbed parameters back to the model, slicing the
            # flat vector in the same order it was concatenated
            index = 0
            for param in model.parameters():
                numel = param.numel()
                param.copy_(perturbed_params[index:index + numel].view(param.size()))
                index += numel

            # Calculate the loss with perturbed parameters
            loss_surface[i, j] = criterion(model(X), y).item()

    # Put the trained weights back; otherwise the model would be left at the
    # last grid point (alpha=2, beta=2).
    for param, trained in zip(params, trained_params):
        param.copy_(trained)

# Plotting the interactive loss surface using Plotly.
# indexing='ij' makes the grids alpha-major like loss_surface; the default 'xy'
# layout would transpose the two axes relative to z.
alpha_grid, beta_grid = np.meshgrid(alpha_range, beta_range, indexing='ij')

fig = go.Figure(data=[go.Surface(z=loss_surface, x=alpha_grid, y=beta_grid, colorscale='Viridis')])

# Add labels and title (the layer count follows whichever model was chosen above)
fig.update_layout(
    title=f"Interactive Loss Surface Landscape - {num_layers} layer",
    scene=dict(
        xaxis_title='Direction 1',
        yaxis_title='Direction 2',
        zaxis_title='Loss'
    ),
    autosize=False,
    width=800,
    height=800
)

# Show the interactive plot
fig.show()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import numpy as np
import plotly.graph_objects as go
from torchsummary import summary

# Define ResNet block
class BasicBlock(nn.Module):
    """Residual block: two 3x3 conv + BatchNorm stages added to a shortcut of the input.

    The shortcut is the identity unless the block changes resolution
    (``stride != 1``) or width (``in_channels != out_channels``); in that case
    it is a strided 1x1 convolution followed by BatchNorm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            # Project the input so it can be added to the main branch
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            # An empty Sequential returns its input unchanged
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        main += self.shortcut(x)
        return F.relu(main)

# Define ResNet-56
class ResNet(nn.Module):
    """Three-stage ResNet (16 -> 32 -> 64 channels) for 3-channel image input.

    Args:
        block: Block class, called as ``block(in_channels, out_channels, stride)``.
        num_blocks: Sequence of three ints, the number of blocks in each stage.
        num_classes: Size of the output logits vector.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count; _make_layer reads and updates it as stages are built
        self.in_channels = 16
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
        self.linear = nn.Linear(64, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the remaining ones stride 1."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        # Global average pooling. For 32x32 inputs the final feature map is 8x8,
        # so this equals the previous fixed avg_pool2d(out, 8); the adaptive form
        # also keeps the 64-feature classifier valid for other input sizes.
        out = F.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        out = self.linear(out)
        return out

# Create the ResNet-56 model
def ResNet56():
    """Build a ResNet made of BasicBlocks, nine in each of the three stages."""
    blocks_per_stage = [9] * 3
    return ResNet(BasicBlock, blocks_per_stage)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mini-batch size used for training and for the fixed evaluation batch
BATCH_SIZE = 128

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize the model, loss function, and optimizer
model = ResNet56().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

# Print model summary (optional); torchsummary needs to know which device to probe on
summary(model, (3, 32, 32), device=device.type)

# Train the model for a few epochs
num_epochs = 1  # Just for demonstration, you can increase this for better results
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Note: this is the loss of the LAST mini-batch of the epoch, not an epoch average
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

# Snapshot the trained parameters. The detached clones serve two purposes:
# they form the origin of the 2-D slice, and they let us restore the model
# once the grid has been evaluated.
params = list(model.parameters())
trained_params = [p.detach().clone() for p in params]
param_vector = torch.cat([p.flatten() for p in trained_params])

# Define random directions in parameter space.
# The draws come from torch, so they are seeded with a torch generator
# (np.random.seed has no effect on torch's random number generator).
direction_rng = torch.Generator().manual_seed(0)
direction_1 = torch.randn(param_vector.shape, generator=direction_rng).to(device)
direction_2 = torch.randn(param_vector.shape, generator=direction_rng).to(device)

# Normalize directions to unit length so alpha/beta are distances in parameter space
direction_1 /= torch.norm(direction_1)
direction_2 /= torch.norm(direction_2)

# Grid range
alpha_range = np.linspace(-2, 2, 25)
beta_range = np.linspace(-2, 2, 25)

# Compute loss surface; loss_surface[i, j] is the loss at (alpha_range[i], beta_range[j])
loss_surface = np.zeros((len(alpha_range), len(beta_range)))

# Use ONE fixed batch for every grid point. Drawing a fresh shuffled, augmented
# batch per point would mix data noise into the surface and rebuild the
# DataLoader iterator on every evaluation.
eval_inputs, eval_targets = next(iter(train_loader))
eval_inputs, eval_targets = eval_inputs.to(device), eval_targets.to(device)

# Evaluate in eval mode so BatchNorm uses its stored running statistics and
# does not update them with perturbed weights; no_grad avoids keeping an
# autograd graph alive for every forward pass.
was_training = model.training
model.eval()
with torch.no_grad():
    for i, alpha in enumerate(alpha_range):
        for j, beta in enumerate(beta_range):
            # Perturb the model parameters along the two directions
            perturbed_params = param_vector + float(alpha) * direction_1 + float(beta) * direction_2

            # Assign the perturbed parameters back to the model, slicing the
            # flat vector in the same order it was concatenated
            index = 0
            for param in model.parameters():
                numel = param.numel()
                param.copy_(perturbed_params[index:index + numel].view(param.size()))
                index += numel

            # Calculate the loss with perturbed parameters
            loss_surface[i, j] = criterion(model(eval_inputs), eval_targets).item()

    # Put the trained weights back; otherwise the model would be left at the
    # last grid point (alpha=2, beta=2).
    for param, trained in zip(params, trained_params):
        param.copy_(trained)
model.train(was_training)

# Plotting the interactive loss surface using Plotly.
# indexing='ij' makes the grids alpha-major like loss_surface; the default 'xy'
# layout would transpose the two axes relative to z.
alpha_grid, beta_grid = np.meshgrid(alpha_range, beta_range, indexing='ij')

fig = go.Figure(data=[go.Surface(z=loss_surface, x=alpha_grid, y=beta_grid, colorscale='Viridis')])

# Add labels and title
fig.update_layout(
    title="Interactive Loss Surface Landscape (ResNet-56)",
    scene=dict(
        xaxis_title='Direction 1',
        yaxis_title='Direction 2',
        zaxis_title='Loss'
    ),
    autosize=False,
    width=800,
    height=800
)

# Show the interactive plot
fig.show()


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:04<00:00, 41194101.26it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             432
       BatchNorm2d-2           [-1, 16, 32, 32]              32
            Conv2d-3           [-1, 16, 32, 32]           2,304
       BatchNorm2d-4           [-1, 16, 32, 32]              32
            Conv2d-5           [-1, 16, 32, 32]           2,304
       BatchNorm2d-6           [-1, 16, 32, 32]              32
        BasicBlock-7           [-1, 16, 32, 32]               0
            Conv2d-8           [-1, 16, 32, 32]           2,304
       BatchNorm2d-9           [-1, 16, 32, 32]              32
           Conv2d-10           [-1, 16, 32, 32]           2,304
      BatchNorm2d-11           [-1, 16, 32, 32]              32
       BasicBlock-12           [-1, 16, 32, 32]               0
           Conv2d-13           [-1, 16, 32, 32]     

In [None]:
import math

import numpy as np
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchsummary import summary

# Define DenseNet block
class Bottleneck(nn.Module):
    """DenseNet layer: BN-ReLU-1x1 conv, BN-ReLU-3x3 conv, then concatenation with the input.

    The 1x1 convolution produces ``4 * growth_rate`` channels and the 3x3
    convolution produces ``growth_rate`` channels, so the output has
    ``growth_rate + in_channels`` channels.
    """

    def __init__(self, in_channels, growth_rate):
        super().__init__()
        inner_channels = 4 * growth_rate
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, inner_channels, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inner_channels)
        self.conv2 = nn.Conv2d(inner_channels, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        """Return the new feature maps stacked in front of ``x`` along the channel axis."""
        squeezed = self.conv1(F.relu(self.bn1(x)))
        new_features = self.conv2(F.relu(self.bn2(squeezed)))
        return torch.cat([new_features, x], 1)

class Transition(nn.Module):
    """DenseNet transition: BN-ReLU-1x1 conv to change channel count, then 2x2 average pooling."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.bn = nn.BatchNorm2d(in_channels)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(2)

    def forward(self, x):
        """Return the pooled 1x1-convolved features of the normalized, rectified input."""
        activated = F.relu(self.bn(x))
        return self.pool(self.conv(activated))

class DenseNet(nn.Module):
    """DenseNet with four dense stages separated by three Transition layers.

    Args:
        block: Dense layer class, called as ``block(in_planes, growth_rate)``;
            each layer must add ``growth_rate`` channels to its input.
        nblocks: Sequence of four ints, the number of dense layers per stage.
        growth_rate: Channels added by every dense layer.
        reduction: Factor applied (with floor) to the channel count at each transition.
        num_classes: Size of the output logits vector.
    """

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super().__init__()
        self.growth_rate = growth_rate

        # num_planes tracks the channel count flowing through the network
        num_planes = 2 * growth_rate
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)

        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
        num_planes += nblocks[0] * growth_rate
        out_planes = int(math.floor(num_planes * reduction))
        self.trans1 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
        num_planes += nblocks[1] * growth_rate
        out_planes = int(math.floor(num_planes * reduction))
        self.trans2 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
        num_planes += nblocks[2] * growth_rate
        out_planes = int(math.floor(num_planes * reduction))
        self.trans3 = Transition(num_planes, out_planes)
        num_planes = out_planes

        # The last stage has no transition; it feeds the classifier head directly
        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
        num_planes += nblocks[3] * growth_rate

        self.bn = nn.BatchNorm2d(num_planes)
        self.linear = nn.Linear(num_planes, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        """Stack ``nblock`` dense layers, widening the input by ``growth_rate`` after each."""
        layers = []
        for _ in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.trans3(self.dense3(out))
        out = self.dense4(out)
        # Global average pooling. For 32x32 inputs the final feature map is 4x4,
        # so this equals the previous fixed avg_pool2d(..., 4); the adaptive form
        # also keeps the classifier's input width valid for other input sizes.
        out = F.adaptive_avg_pool2d(F.relu(self.bn(out)), 1)
        out = torch.flatten(out, 1)
        out = self.linear(out)
        return out

# Create the DenseNet model (e.g., DenseNet-121)
def DenseNet121():
    """Build a DenseNet of Bottleneck layers with stage depths 6/12/24/16 and growth rate 32."""
    stage_depths = [6, 12, 24, 16]
    return DenseNet(Bottleneck, stage_depths, growth_rate=32)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mini-batch size used for training and for the fixed evaluation batch.
# NOTE(review): the saved output of this cell ends in a CUDA out-of-memory
# error; the exact cause is not visible here, but lowering BATCH_SIZE is the
# first thing to try if it recurs.
BATCH_SIZE = 128

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize the model, loss function, and optimizer
model = DenseNet121().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

# Print model summary (optional); torchsummary needs to know which device to probe on
summary(model, (3, 32, 32), device=device.type)

# Train the model for a few epochs
num_epochs = 1  # Just for demonstration, you can increase this for better results
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Note: this is the loss of the LAST mini-batch of the epoch, not an epoch average
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

# Snapshot the trained parameters. The detached clones serve two purposes:
# they form the origin of the 2-D slice, and they let us restore the model
# once the grid has been evaluated.
params = list(model.parameters())
trained_params = [p.detach().clone() for p in params]
param_vector = torch.cat([p.flatten() for p in trained_params])

# Define random directions in parameter space.
# The draws come from torch, so they are seeded with a torch generator
# (np.random.seed has no effect on torch's random number generator).
direction_rng = torch.Generator().manual_seed(0)
direction_1 = torch.randn(param_vector.shape, generator=direction_rng).to(device)
direction_2 = torch.randn(param_vector.shape, generator=direction_rng).to(device)

# Normalize directions to unit length so alpha/beta are distances in parameter space
direction_1 /= torch.norm(direction_1)
direction_2 /= torch.norm(direction_2)

# Grid range
alpha_range = np.linspace(-2, 2, 25)
beta_range = np.linspace(-2, 2, 25)

# Compute loss surface; loss_surface[i, j] is the loss at (alpha_range[i], beta_range[j])
loss_surface = np.zeros((len(alpha_range), len(beta_range)))

# Use ONE fixed batch for every grid point. Drawing a fresh shuffled, augmented
# batch per point would mix data noise into the surface and rebuild the
# DataLoader iterator on every evaluation.
eval_inputs, eval_targets = next(iter(train_loader))
eval_inputs, eval_targets = eval_inputs.to(device), eval_targets.to(device)

# Evaluate in eval mode so BatchNorm uses its stored running statistics and
# does not update them with perturbed weights; no_grad avoids keeping an
# autograd graph alive for every forward pass.
was_training = model.training
model.eval()
with torch.no_grad():
    for i, alpha in enumerate(alpha_range):
        for j, beta in enumerate(beta_range):
            # Perturb the model parameters along the two directions
            perturbed_params = param_vector + float(alpha) * direction_1 + float(beta) * direction_2

            # Assign the perturbed parameters back to the model, slicing the
            # flat vector in the same order it was concatenated
            index = 0
            for param in model.parameters():
                numel = param.numel()
                param.copy_(perturbed_params[index:index + numel].view(param.size()))
                index += numel

            # Calculate the loss with perturbed parameters
            loss_surface[i, j] = criterion(model(eval_inputs), eval_targets).item()

    # Put the trained weights back; otherwise the model would be left at the
    # last grid point (alpha=2, beta=2).
    for param, trained in zip(params, trained_params):
        param.copy_(trained)
model.train(was_training)

# Plotting the interactive loss surface using Plotly.
# indexing='ij' makes the grids alpha-major like loss_surface; the default 'xy'
# layout would transpose the two axes relative to z.
alpha_grid, beta_grid = np.meshgrid(alpha_range, beta_range, indexing='ij')

fig = go.Figure(data=[go.Surface(z=loss_surface, x=alpha_grid, y=beta_grid, colorscale='Viridis')])

# Add labels and title
fig.update_layout(
    title="Interactive Loss Surface Landscape (DenseNet)",
    scene=dict(
        xaxis_title='Direction 1',
        yaxis_title='Direction 2',
        zaxis_title='Loss'
    ),
    autosize=False,
    width=800,
    height=800
)

# Show the interactive plot
fig.show()


Files already downloaded and verified
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
       BatchNorm2d-2           [-1, 64, 32, 32]             128
            Conv2d-3          [-1, 128, 32, 32]           8,192
       BatchNorm2d-4          [-1, 128, 32, 32]             256
            Conv2d-5           [-1, 32, 32, 32]          36,864
        Bottleneck-6           [-1, 96, 32, 32]               0
       BatchNorm2d-7           [-1, 96, 32, 32]             192
            Conv2d-8          [-1, 128, 32, 32]          12,288
       BatchNorm2d-9          [-1, 128, 32, 32]             256
           Conv2d-10           [-1, 32, 32, 32]          36,864
       Bottleneck-11          [-1, 128, 32, 32]               0
      BatchNorm2d-12          [-1, 128, 32, 32]             256
           Conv2d-13          [-1, 128, 32, 32]          16,384
 

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 13.06 MiB is free. Process 5230 has 14.73 GiB memory in use. Of the allocated memory 14.19 GiB is allocated by PyTorch, and 416.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)