In [7]:
import numpy as np
import keras
import tensorflow as tf
from keras import layers
from keras import models
from keras.datasets import mnist

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.optim.lr_scheduler import StepLR

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import sys
import math

import numpy as np
import math

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
class CustomDataLoader:
    """Turn feature/label arrays into PyTorch ``DataLoader`` objects.

    The features are converted to float tensors and the labels to long
    tensors, and both are moved to the module-level ``device`` up front.

    Args:
        features: Input samples, in any form accepted by ``torch.tensor``.
        labels: Targets aligned with ``features``.
        batch_size: Batch size used by every loader that is created.
        validation_size: When greater than zero, passed as ``test_size`` to
            ``train_test_split`` (stratified on ``labels``, ``random_state=42``)
            to carve out a validation split. Otherwise all samples go to the
            training loader and the validation loader is ``None``.
        shuffle: Forwarded to every loader, the validation loader included.
    """

    def __init__(self, features, labels, batch_size=1, validation_size=0.0, shuffle=False):
        loader_options = dict(batch_size=batch_size, shuffle=shuffle)

        # No hold-out requested: a single loader over everything.
        if not validation_size > 0:
            self.train_loader = DataLoader(self._as_dataset(features, labels), **loader_options)
            self.val_loader = None
            return

        fit_x, held_x, fit_y, held_y = train_test_split(
            features,
            labels,
            test_size=validation_size,
            stratify=labels,
            random_state=42,
        )
        self.train_loader = DataLoader(self._as_dataset(fit_x, fit_y), **loader_options)
        self.val_loader = DataLoader(self._as_dataset(held_x, held_y), **loader_options)

    @staticmethod
    def _as_dataset(data, targets):
        """Build a ``TensorDataset`` of float inputs and long targets on ``device``."""
        data_tensor = torch.tensor(data).float().to(device)
        target_tensor = torch.tensor(targets).long().to(device)
        return TensorDataset(data_tensor, target_tensor)

    def get_train_loader(self):
        """Return the training ``DataLoader``."""
        return self.train_loader

    def get_val_loader(self):
        """Return the validation ``DataLoader``, or ``None`` if no split was made."""
        return self.val_loader

In [219]:
def _first_param_device(model):
    """Return the device of the model's first parameter, or None if it has no parameters."""
    first_param = next(model.parameters(), None)
    return None if first_param is None else first_param.device


def _forward_loss(model, criterion, inputs, labels, target_device):
    """Reshape a batch to (-1, 1, 28, 28), run the model and return the criterion value."""
    if target_device is not None:
        inputs, labels = inputs.to(target_device), labels.to(target_device)
    return criterion(model(inputs.view(-1, 1, 28, 28)), labels)


def _mean_validation_loss(model, val_loader, criterion, target_device):
    """Return the mean per-batch loss over val_loader, or None when there is nothing to evaluate."""
    if val_loader is None or len(val_loader) == 0:
        return None

    model.eval()
    total = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            total += _forward_loss(model, criterion, inputs, labels, target_device).item()
    return total / len(val_loader)


def evaluate_model(model, custom_train_loader, criterion, optimizer, num_epochs=20):
    """Train ``model`` and report training / validation loss after every epoch.

    Despite its name this function runs the optimisation loop: for each epoch
    it performs one optimizer step per training batch, prints the mean
    per-batch training loss, then prints the mean per-batch validation loss.
    After the last epoch the validation loss is computed and printed once more.

    Args:
        model: Module called on batches reshaped to ``(-1, 1, 28, 28)``.
        custom_train_loader: Object exposing ``get_train_loader()`` and
            ``get_val_loader()``; the latter may return ``None``, in which
            case the validation passes are skipped.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of passes over the training loader (default 20).

    Returns:
        None. Results are printed. The model is left in eval mode.
    """
    train_loader = custom_train_loader.get_train_loader()
    val_loader = custom_train_loader.get_val_loader()
    # Batches are moved to wherever the model lives, in every loop alike.
    target_device = _first_param_device(model)

    for epoch in range(num_epochs):
        # The validation pass below switches to eval mode, so training mode
        # has to be restored at the start of every epoch.
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = _forward_loss(model, criterion, inputs, labels, target_device)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

        avg_val_loss = _mean_validation_loss(model, val_loader, criterion, target_device)
        if avg_val_loss is not None:
            print(f'Validation Loss: {avg_val_loss}')
            print()

    # Final report after training has finished.
    model.eval()
    avg_val_loss = _mean_validation_loss(model, val_loader, criterion, target_device)
    if avg_val_loss is not None:
        print(f'Validation Loss: {avg_val_loss}')

In [39]:
def custom_activation(x, b, eps):
    """Clip the magnitude of ``x`` at ``|b|`` and apply the sign of ``b``.

    Computes ``sign(b) * sign(x) * min(|x|, |b|)`` element-wise, with ``x``
    and ``b`` broadcast against each other.

    The clipped magnitude is taken directly as ``min(|x|, |b|)``. The
    algebraically equal form ``min(0, |x| - |b|) + |b|`` loses inputs that are
    much smaller than ``|b|`` to floating-point cancellation and yields NaN
    for an infinite ``b``.

    Args:
        x: Input tensor.
        b: Threshold tensor, broadcastable with ``x``.
        eps: Unused by this formula; kept so existing callers that pass it
            keep working.

    Returns:
        Tensor with the broadcast shape of ``x`` and ``b``.
    """
    return b.sign() * x.sign() * torch.min(x.abs(), b.abs())

class CustomActivationLayer(nn.Module):
    """Per-channel learnable threshold layer built on ``custom_activation``.

    Holds one trainable threshold per channel in ``betas``, shaped
    ``(1, num_channels, 1, 1)``.

    NOTE(review): a later cell in this notebook defines another class with the
    same name; once that cell runs it replaces this definition.

    Args:
        num_channels: Number of channels, i.e. number of thresholds.
        beta: Initial value for every threshold. ``0.0`` (the default) is
            replaced by ``sys.float_info.epsilon``.
    """

    def __init__(self, num_channels, beta=0.0):
        super(CustomActivationLayer, self).__init__()

        self.eps = sys.float_info.epsilon

        if beta == 0.0:
            beta = self.eps
        # float() keeps the parameter floating-point even for an integer beta,
        # which is required for a tensor that needs gradients.
        self.betas = nn.Parameter(torch.full((1, num_channels, 1, 1), float(beta)))

    def forward(self, inputs):
        """Apply ``custom_activation`` to ``inputs`` using the learned thresholds."""
        return custom_activation(inputs, self.betas, self.eps)

class CustomActivationLayerTwo(nn.Module):
    """Per-feature learnable threshold layer built on ``custom_activation``.

    Holds one trainable threshold per feature in ``betas``, shaped
    ``(num_features,)``.

    NOTE(review): a later cell in this notebook defines another class with the
    same name; once that cell runs it replaces this definition.

    Args:
        num_features: Number of features, i.e. number of thresholds.
        beta: Initial value for every threshold. ``0.0`` (the default) is
            replaced by ``sys.float_info.epsilon``.
    """

    def __init__(self, num_features, beta=0.0):
        super(CustomActivationLayerTwo, self).__init__()

        self.eps = sys.float_info.epsilon

        if beta == 0.0:
            beta = self.eps

        # float() keeps the parameter floating-point even for an integer beta,
        # which is required for a tensor that needs gradients.
        self.betas = nn.Parameter(torch.full((num_features,), float(beta)))

    def forward(self, x):
        """Apply ``custom_activation`` to ``x`` using the learned thresholds."""
        return custom_activation(x, self.betas, self.eps)

In [213]:
import torch

class CustomActivation(torch.autograd.Function):
    """Magnitude clipping ``sign(b) * sign(x) * min(|b|, |x|)`` with a hand-written backward.

    NOTE(review): the backward pass below is not the analytic derivative of
    the forward expression (no ``sign(b)`` / ``sign(x)`` factors are applied)
    -- presumably a deliberate surrogate gradient; confirm with the author.
    """

    @staticmethod
    def forward(ctx, x, b):
        """Return the clipped activation and stash ``x`` and ``|b|`` for backward."""
        magnitude_b = torch.abs(b)
        ctx.save_for_backward(x, magnitude_b)

        clipped = torch.min(magnitude_b, torch.abs(x))
        return torch.sign(b) * torch.sign(x) * clipped

    @staticmethod
    def backward(ctx, grad_output):
        """Route ``grad_output`` to ``x`` inside the threshold and to ``b`` outside it.

        ``x`` receives ``grad_output`` where ``|x| < |b|``; ``b`` receives it
        where ``|x| >= |b|`` and ``x >= 0``. Every other position gets zero.
        """
        x, magnitude_b = ctx.saved_tensors
        inside = torch.abs(x) < magnitude_b
        outside = ~inside

        grad_x = inside * grad_output
        # NOTE(review): this has the element-wise (broadcast) shape and is not
        # summed down to b's shape here -- presumably autograd reduces it; TODO confirm.
        grad_b = outside * grad_output * (x >= 0)

        return grad_x, grad_b
    
class CustomActivationLayer(nn.Module):
    """Per-channel learnable threshold layer built on ``CustomActivation``.

    Holds one trainable threshold per channel in ``betas``, shaped
    ``(1, num_channels, 1, 1)``.

    Args:
        num_channels: Number of channels, i.e. number of thresholds.
        beta: Initial value for every threshold (default ``0.0``). Accepted
            so the constructor matches ``CustomActivationLayerTwo``.
    """

    def __init__(self, num_channels, beta=0.0):
        super(CustomActivationLayer, self).__init__()
        # Not read by forward(); kept so the attribute stays available.
        self.eps = sys.float_info.epsilon
        # float() keeps the parameter floating-point even for an integer beta,
        # which is required for a tensor that needs gradients.
        self.betas = nn.Parameter(torch.full((1, num_channels, 1, 1), float(beta)))

    def forward(self, x):
        """Apply ``CustomActivation`` to ``x`` using the learned thresholds."""
        return CustomActivation.apply(x, self.betas)

class CustomActivationLayerTwo(nn.Module):
    """Per-feature learnable threshold layer built on ``CustomActivation``.

    Holds one trainable threshold per feature in ``betas``, shaped
    ``(num_features,)``.

    Args:
        num_features: Number of features, i.e. number of thresholds.
        beta: Initial value for every threshold (default ``0.0``).
    """

    def __init__(self, num_features, beta=0.0):
        super(CustomActivationLayerTwo, self).__init__()
        # Not read by forward(); kept so the attribute stays available.
        self.eps = sys.float_info.epsilon
        # Honour the requested initial value; float() keeps the parameter
        # floating-point even for an integer beta.
        self.betas = nn.Parameter(torch.full((num_features,), float(beta)))

    def forward(self, x):
        """Apply ``CustomActivation`` to ``x`` using the learned thresholds."""
        return CustomActivation.apply(x, self.betas)

In [56]:
# Load the MNIST train/test splits through keras.datasets.mnist.
# NOTE(review): X_test / y_test are not referenced again in the visible cells.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [57]:
# Merge axes 1 and 2 of every training sample into one flat feature axis.
X_train_flat = X_train.reshape(-1, X_train.shape[1] * X_train.shape[2])

# Standardise with a single global mean / standard deviation taken over all
# values of the flattened training set (not per feature).
mean = X_train_flat.mean()
std = X_train_flat.std()
X_scaled = (X_train_flat - mean) / std

In [58]:
# Build the loaders used by the training cells: batches of 1024 samples, with
# validation_size=0.2 handed to CustomDataLoader to hold out a validation split.
# shuffle is left at its default (False).
custom_train_loader = CustomDataLoader(X_scaled, y_train, batch_size=1024, validation_size=0.2)

In [229]:
class MNIST_CNN(nn.Module):
    """Two-convolution, two-linear classifier for 1x28x28 inputs with 10 outputs.

    Layout: conv(1->64) -> activation -> avg-pool -> conv(64->64) -> activation
    -> avg-pool -> flatten -> linear(3136->3136) -> activation -> linear(3136->10).

    Args:
        activation: ``'relu'`` for ``nn.ReLU`` activations, or ``'custom'`` for
            ``CustomActivationLayer`` / ``CustomActivationLayerTwo``.
        init_a: Value passed as ``a`` to ``nn.init.kaiming_normal_`` for every
            weight tensor (default 3, the value previously hard-coded).

    Raises:
        ValueError: If ``activation`` is neither ``'relu'`` nor ``'custom'``.
    """

    def __init__(self, activation='relu', init_a=3):
        super(MNIST_CNN, self).__init__()

        # Fail early: otherwise the model would be built without activation
        # modules and only break on the first forward pass.
        if activation not in ('relu', 'custom'):
            raise ValueError(
                f"Unsupported activation {activation!r}; expected 'relu' or 'custom'."
            )

        self.conv1 = nn.Conv2d(1, 32 * 2, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32 * 2, 64, kernel_size=3, stride=1, padding=1)

        self.fc1 = nn.Linear(64 * 7 * 7, 64 * 7 * 7)
        self.fc2 = nn.Linear(64 * 7 * 7, 10)

        self.pool = nn.AvgPool2d(2, 2)
        self.flatten = nn.Flatten()

        # Not used by forward(); kept so existing attribute access keeps working.
        self.relu = nn.ReLU()
        self.gelu = nn.GELU()

        if activation == 'relu':
            self.activation1 = nn.ReLU()
            self.activation2 = nn.ReLU()
            self.activation3 = nn.ReLU()
        else:  # 'custom'
            self.activation1 = CustomActivationLayer(32 * 2)
            self.activation2 = CustomActivationLayer(64)
            self.activation3 = CustomActivationLayerTwo(64 * 7 * 7)

        # Same initialisation for every weighted layer, applied in the same
        # order as before: Kaiming-normal weights, zero biases.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2):
            nn.init.kaiming_normal_(layer.weight, a=init_a, mode='fan_out', nonlinearity='leaky_relu')
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2):
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the 10 class scores for a batch shaped ``(N, 1, 28, 28)``."""
        x = self.pool(self.activation1(self.conv1(x)))
        x = self.pool(self.activation2(self.conv2(x)))

        x = self.flatten(x)
        x = self.activation3(self.fc1(x))
        x = self.fc2(x)
        return x
        
# Instantiate the default (ReLU) variant on `device` and print a torchinfo
# layer summary for an input of size (1, 1, 28, 28).
model = MNIST_CNN().to(device)
summary(model, input_size=(1, 1, 28, 28))

Layer (type:depth-idx)                   Output Shape              Param #
MNIST_CNN                                [1, 10]                   --
├─Conv2d: 1-1                            [1, 64, 28, 28]           640
├─ReLU: 1-2                              [1, 64, 28, 28]           --
├─AvgPool2d: 1-3                         [1, 64, 14, 14]           --
├─Conv2d: 1-4                            [1, 64, 14, 14]           36,928
├─ReLU: 1-5                              [1, 64, 14, 14]           --
├─AvgPool2d: 1-6                         [1, 64, 7, 7]             --
├─Flatten: 1-7                           [1, 3136]                 --
├─Linear: 1-8                            [1, 3136]                 9,837,632
├─ReLU: 1-9                              [1, 3136]                 --
├─Linear: 1-10                           [1, 10]                   31,370
Total params: 9,906,570
Trainable params: 9,906,570
Non-trainable params: 0
Total mult-adds (M): 17.61
Input size (MB): 0.00
Forward/backwa

In [27]:
# Fresh baseline network that uses the built-in ReLU activations.
model_relu = MNIST_CNN(activation='relu').to(device)

In [28]:
# Train the ReLU baseline: cross-entropy objective, Adam at a learning rate of 1e-5.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_relu.parameters(), lr=1e-5)
evaluate_model(
    model=model_relu,
    custom_train_loader=custom_train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1, Loss: 1.6637614952757003
Validation Loss: 0.6915233731269836

Epoch 2, Loss: 0.5074865938501155
Validation Loss: 0.38196735580762226

Epoch 3, Loss: 0.32931756402583834
Validation Loss: 0.2818109194437663

Epoch 4, Loss: 0.25568879316461846
Validation Loss: 0.2300460177163283

Epoch 5, Loss: 0.21273544620960316
Validation Loss: 0.19712168350815773

Epoch 6, Loss: 0.18333101684742786
Validation Loss: 0.17346159120400748

Epoch 7, Loss: 0.16106986872693327
Validation Loss: 0.15510118876894316

Epoch 8, Loss: 0.14327432357884468
Validation Loss: 0.14028940287729105

Epoch 9, Loss: 0.12875452780343116
Validation Loss: 0.12827937304973602

Epoch 10, Loss: 0.11677494407334227
Validation Loss: 0.11817524209618568

Epoch 11, Loss: 0.10672035464581023
Validation Loss: 0.10978241699437301

Epoch 12, Loss: 0.09815661276274538
Validation Loss: 0.10265752797325452

Epoch 13, Loss: 0.09079545577789874
Validation Loss: 0.09649818949401379

Epoch 14, Loss: 0.08440040701881368
Validation Loss:

In [230]:
# Fresh network that uses the learnable custom activation layers.
model_custom = MNIST_CNN(activation='custom').to(device)

In [231]:
# Train the custom-activation network: cross-entropy objective, Adam at a learning rate of 5e-4.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_custom.parameters(), lr=5e-4)
evaluate_model(
    model=model_custom,
    custom_train_loader=custom_train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1, Loss: 1.5491448209640828
Validation Loss: 0.4516686523954074

Epoch 2, Loss: 0.2828554561797609
Validation Loss: 0.19018733128905296

Epoch 3, Loss: 0.14865569278914878
Validation Loss: 0.11441567974785964

Epoch 4, Loss: 0.09192678110396608
Validation Loss: 0.0814644576360782

Epoch 5, Loss: 0.06613621947930214
Validation Loss: 0.06503917121638854

Epoch 6, Loss: 0.05207185225283846
Validation Loss: 0.05632945646842321

Epoch 7, Loss: 0.043261564950993724
Validation Loss: 0.05199352651834488

Epoch 8, Loss: 0.03691664972203843
Validation Loss: 0.04919694301982721

Epoch 9, Loss: 0.03211231422709658
Validation Loss: 0.04685664394249519

Epoch 10, Loss: 0.028802518911184148
Validation Loss: 0.04330325545743108

Epoch 11, Loss: 0.02544724546927721
Validation Loss: 0.04137865034863353

Epoch 12, Loss: 0.023108883523085016
Validation Loss: 0.039659743352482714

Epoch 13, Loss: 0.021010462075788924
Validation Loss: 0.03960475868855914

Epoch 14, Loss: 0.017823115328049408
Validatio

In [232]:
# when a = 3
# Variance of every parameter tensor of model_custom for this run. "a" is
# presumably the `a` argument given to nn.init.kaiming_normal_ in MNIST_CNN -- TODO confirm.
for name, param in model_custom.named_parameters():
    print(f"Parameter name: {name}", f"Parameter variance: {param.data.var()}", "", sep="\n")

Parameter name: conv1.weight
Parameter variance: 0.0006355533841997385

Parameter name: conv1.bias
Parameter variance: 0.00013336823030840605

Parameter name: conv2.weight
Parameter variance: 0.000985803548246622

Parameter name: conv2.bias
Parameter variance: 7.137360807973891e-05

Parameter name: fc1.weight
Parameter variance: 0.0002946029999293387

Parameter name: fc1.bias
Parameter variance: 1.2639691703952849e-05

Parameter name: fc2.weight
Parameter variance: 0.03812466561794281

Parameter name: fc2.bias
Parameter variance: 0.00013573089381679893

Parameter name: activation1.betas
Parameter variance: 0.0008865435374900699

Parameter name: activation2.betas
Parameter variance: 0.0008369925781153142

Parameter name: activation3.betas
Parameter variance: 0.00015496532432734966



In [227]:
# Train the custom-activation network: cross-entropy objective, Adam at a learning rate of 5e-4.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_custom.parameters(), lr=5e-4)
evaluate_model(
    model=model_custom,
    custom_train_loader=custom_train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1, Loss: 1.4818552326648793
Validation Loss: 0.40043382346630096

Epoch 2, Loss: 0.26688173666913456
Validation Loss: 0.18577498818437257

Epoch 3, Loss: 0.14871822947517355
Validation Loss: 0.11704253467420737

Epoch 4, Loss: 0.09296186109806628
Validation Loss: 0.08259222904841106

Epoch 5, Loss: 0.06553197446021627
Validation Loss: 0.06504188633213441

Epoch 6, Loss: 0.05112552547708471
Validation Loss: 0.05490450530002514

Epoch 7, Loss: 0.042206110393113276
Validation Loss: 0.05003295373171568

Epoch 8, Loss: 0.03493129283665342
Validation Loss: 0.047702200089891754

Epoch 9, Loss: 0.029734921344417208
Validation Loss: 0.04511323198676109

Epoch 10, Loss: 0.02677077589992513
Validation Loss: 0.04203771955023209

Epoch 11, Loss: 0.023735610748383592
Validation Loss: 0.039934143889695406

Epoch 12, Loss: 0.021402673696742414
Validation Loss: 0.03774016754080852

Epoch 13, Loss: 0.019188172936915084
Validation Loss: 0.03740476521973809

Epoch 14, Loss: 0.017679247271982914
Vali

In [228]:
# when a = 2
# Variance of every parameter tensor of model_custom for this run. "a" is
# presumably the `a` argument given to nn.init.kaiming_normal_ in MNIST_CNN -- TODO confirm.
for name, param in model_custom.named_parameters():
    print(f"Parameter name: {name}", f"Parameter variance: {param.data.var()}", "", sep="\n")

Parameter name: conv1.weight
Parameter variance: 0.0009577867458574474

Parameter name: conv1.bias
Parameter variance: 0.0002525431918911636

Parameter name: conv2.weight
Parameter variance: 0.001273691770620644

Parameter name: conv2.bias
Parameter variance: 9.694890468381345e-05

Parameter name: fc1.weight
Parameter variance: 0.00034902256447821856

Parameter name: fc1.bias
Parameter variance: 9.303772458224557e-06

Parameter name: fc2.weight
Parameter variance: 0.06146855652332306

Parameter name: fc2.bias
Parameter variance: 7.941333024064079e-05

Parameter name: activation1.betas
Parameter variance: 0.0008800862706266344

Parameter name: activation2.betas
Parameter variance: 0.0004912934964522719

Parameter name: activation3.betas
Parameter variance: 0.00019973397138528526



In [222]:
# Train the custom-activation network: cross-entropy objective, Adam at a learning rate of 5e-4.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model_custom.parameters(), lr=5e-4)
evaluate_model(
    model=model_custom,
    custom_train_loader=custom_train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1, Loss: 1.205354855415669
Validation Loss: 0.31648394217093784

Epoch 2, Loss: 0.237759567955707
Validation Loss: 0.17504819855093956

Epoch 3, Loss: 0.14309636170559742
Validation Loss: 0.11747782863676548

Epoch 4, Loss: 0.09642025035746554
Validation Loss: 0.08785507331291835

Epoch 5, Loss: 0.07011663105259551
Validation Loss: 0.07257502463956673

Epoch 6, Loss: 0.05520873635690263
Validation Loss: 0.061878583393990993

Epoch 7, Loss: 0.04611251447746094
Validation Loss: 0.05678046712030967

Epoch 8, Loss: 0.03884015676188976
Validation Loss: 0.05142528905222813

Epoch 9, Loss: 0.03376430716920406
Validation Loss: 0.048939445366462074

Epoch 10, Loss: 0.0297009473309872
Validation Loss: 0.048226588095227875

Epoch 11, Loss: 0.026926996067483375
Validation Loss: 0.04502180560181538

Epoch 12, Loss: 0.024028027132946127
Validation Loss: 0.04497269929076234

Epoch 13, Loss: 0.02056928537786007
Validation Loss: 0.04279967304319143

Epoch 14, Loss: 0.017948913070908252
Validation

In [224]:
# when a = 1
# Variance of every parameter tensor of model_custom for this run. "a" is
# presumably the `a` argument given to nn.init.kaiming_normal_ in MNIST_CNN -- TODO confirm.
for name, param in model_custom.named_parameters():
    print(f"Parameter name: {name}", f"Parameter variance: {param.data.var()}", "", sep="\n")

Parameter name: conv1.weight
Parameter variance: 0.002005618531256914

Parameter name: conv1.bias
Parameter variance: 0.00025421546888537705

Parameter name: conv2.weight
Parameter variance: 0.0022366989869624376

Parameter name: conv2.bias
Parameter variance: 0.0001088862627511844

Parameter name: fc1.weight
Parameter variance: 0.0004937440971843898

Parameter name: fc1.bias
Parameter variance: 9.684428732725792e-06

Parameter name: fc2.weight
Parameter variance: 0.12768074870109558

Parameter name: fc2.bias
Parameter variance: 4.89032972836867e-05

Parameter name: activation1.betas
Parameter variance: 0.0007480555213987827

Parameter name: activation2.betas
Parameter variance: 0.0006396957323886454

Parameter name: activation3.betas
Parameter variance: 0.0001468024856876582

