This notebook introduces the Parametric Rectified Power Unit (PRePU), which offers superior polynomial approximation capabillity compared to RELU

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

This custom activation function introduces a trainable parameter, scale, and applies ReLU on the inputs, to introduce non-linearity, then softsign to squash the scale to a range between 0 and 2 to keep exploding gradients in check. However, since we are using the power function, aka doing x^a, this introduces unbounded derivative on the activation function and more exploding gradient issues. Techniques for keeping exploding gradients in check include gradient clipping (clipping-by-norm or clipping-by-value), lower learning rate, learning rate warmup, changing weight initialization technique or lower weight intialization variance, batch normalization, using the activation function only for a few layers to learn polynomial features, 

In [2]:
def custom_activation(x, scale):
    x_relu = torch.relu(x)
    power = F.softsign(scale) + 1
    return torch.pow(x_relu, power)

class CustomActivationLayer(nn.Module):
    def __init__(self, num_features):
        super(CustomActivationLayer, self).__init__()
        self.scale = nn.Parameter(torch.zeros(num_features))

    def forward(self, inputs):
        return custom_activation(inputs, self.scale)

Toy dataset

In [3]:
def polynomial(x):
    return 1 + 0.5 * x ** 1/2 + 2 * x**2 - 3 * x**3

num_points = 10000
x_values_normal = np.random.normal(0, 2, num_points)

y_values_normal = polynomial(x_values_normal)

dataset_normal = np.column_stack((x_values_normal, y_values_normal))
dataset_normal[:10]

array([[ 2.29269132e-01,  1.12629181e+00],
       [-3.17074082e-01,  1.21743548e+00],
       [-6.36190302e-01,  2.42289999e+00],
       [-5.19254029e+00,  4.73638021e+02],
       [-1.32885853e+00,  1.12392697e+01],
       [-5.65352760e-01,  2.04000979e+00],
       [ 1.96689229e+00, -1.35986902e+01],
       [-1.10441764e+00,  7.20467383e+00],
       [-2.80786065e-01,  1.15389732e+00],
       [-1.59772708e-01,  1.02334717e+00]])

Creating a feed forward nn with ReLU as activation function (241 trainable parameters) and another feed forward nn with PRePU as activation function (273 trainable parameters). 

In [4]:
class FeedFowardReLU(nn.Module):
    def __init__(self):
        super(FeedFowardReLU, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1, 8),
            nn.ReLU(),
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        return self.layers(x)
    
model_relu = FeedFowardReLU()
summary(model_relu, input_size=(1, 1))

Layer (type:depth-idx)                   Output Shape              Param #
FeedFowardReLU                           [1, 1]                    --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 8]                    16
│    └─ReLU: 2-2                         [1, 8]                    --
│    └─Linear: 2-3                       [1, 8]                    72
│    └─ReLU: 2-4                         [1, 8]                    --
│    └─Linear: 2-5                       [1, 8]                    72
│    └─ReLU: 2-6                         [1, 8]                    --
│    └─Linear: 2-7                       [1, 8]                    72
│    └─ReLU: 2-8                         [1, 8]                    --
│    └─Linear: 2-9                       [1, 1]                    9
Total params: 241
Trainable params: 241
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params

In [5]:
class FeedFowardPRePU(nn.Module):
    def __init__(self):
        super(FeedFowardPRePU, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1, 8),
            CustomActivationLayer(8),
            nn.Linear(8, 8),
            CustomActivationLayer(8),
            nn.Linear(8, 8),
            CustomActivationLayer(8),
            nn.Linear(8, 8),
            CustomActivationLayer(8),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        return self.layers(x)
    
model_prepu = FeedFowardPRePU()
summary(model_prepu, input_size=(1, 1))

Layer (type:depth-idx)                   Output Shape              Param #
FeedFowardPRePU                          [1, 1]                    --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 8]                    16
│    └─CustomActivationLayer: 2-2        [1, 8]                    8
│    └─Linear: 2-3                       [1, 8]                    72
│    └─CustomActivationLayer: 2-4        [1, 8]                    8
│    └─Linear: 2-5                       [1, 8]                    72
│    └─CustomActivationLayer: 2-6        [1, 8]                    8
│    └─Linear: 2-7                       [1, 8]                    72
│    └─CustomActivationLayer: 2-8        [1, 8]                    8
│    └─Linear: 2-9                       [1, 1]                    9
Total params: 273
Trainable params: 273
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params siz

In [6]:
class CustomDataLoader:
    def __init__(self, features, labels, batch_size=1, validation_size=0.0, shuffle=False):

        if validation_size > 0:
            train_data, val_data, train_labels, val_labels = train_test_split(
                features, labels, test_size=validation_size, random_state=42
            )
            self.train_loader = DataLoader(
                TensorDataset(torch.tensor(train_data).float(), torch.tensor(train_labels).float()),
                batch_size=batch_size,
                shuffle=shuffle,
            )
            self.val_loader = DataLoader(
                TensorDataset(torch.tensor(val_data).float(), torch.tensor(val_labels).float()), batch_size=batch_size, shuffle=shuffle
            )
        else:
            self.train_loader = DataLoader(
                TensorDataset(torch.tensor(features).float(), torch.tensor(labels).float()), batch_size=batch_size, shuffle=shuffle
            )
            self.val_loader = None

    def get_train_loader(self):
        return self.train_loader

    def get_val_loader(self):
        return self.val_loader

def evaluate_model(model, custom_train_loader, criterion, optimizer):
    num_epochs = 100
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in custom_train_loader.get_train_loader():
            optimizer.zero_grad()
            outputs = model(inputs.view(-1, 1))
            loss = criterion(outputs, labels.view(-1, 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100)
            # torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=100)

            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(custom_train_loader.get_train_loader())

        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in custom_train_loader.get_val_loader():
                outputs = model(inputs.view(-1, 1))
                val_loss = criterion(outputs, labels.view(-1, 1))
                running_val_loss += val_loss.item()

        avg_val_loss = running_val_loss / len(custom_train_loader.get_val_loader())
        print(f"Epoch {epoch+1:4d} | Train Loss: {avg_train_loss:10.4f} | Validation Loss: {avg_val_loss:10.4f}")

custom_train_loader = CustomDataLoader(x_values_normal, y_values_normal, batch_size=6 * 6, validation_size=0.2, shuffle=True)

<span style="font-size:smaller;">
<div>PRePU activation shows vastly superior ability to approximate polynomial compared to ReLU.</div> 
<div>This is potentially useful for time series data, which tends to follow polynomial structure, as well as other types of data.</div>

<b>Model with ReLU activation:</b><br>
Epoch  100 | Train Loss:  2775.5210 | Validation Loss:  2845.0250<br>

<b>Model with PRePU activation:</b><br>
Epoch  100 | Train Loss:     0.2723 | Validation Loss:     0.2764
</span>

In [7]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model_relu.parameters())
evaluate_model(model_relu, custom_train_loader, criterion, optimizer)

Epoch    1 | Train Loss:  9832.7991 | Validation Loss:  8405.0763
Epoch    2 | Train Loss:  7785.5992 | Validation Loss:  5194.0731
Epoch    3 | Train Loss:  5155.2594 | Validation Loss:  4136.2041
Epoch    4 | Train Loss:  4543.8516 | Validation Loss:  3713.4670
Epoch    5 | Train Loss:  4899.1888 | Validation Loss:  3199.1072
Epoch    6 | Train Loss:  3590.4449 | Validation Loss:  2831.2832
Epoch    7 | Train Loss:  3271.0715 | Validation Loss:  2466.1872
Epoch    8 | Train Loss:  2847.0595 | Validation Loss:  2172.3354
Epoch    9 | Train Loss:  2433.1323 | Validation Loss:  1769.5347
Epoch   10 | Train Loss:  2129.7823 | Validation Loss:  1398.4995
Epoch   11 | Train Loss:  1824.2780 | Validation Loss:  1229.6812
Epoch   12 | Train Loss:  1587.8751 | Validation Loss:  1004.9767
Epoch   13 | Train Loss:  1369.7085 | Validation Loss:   854.8282
Epoch   14 | Train Loss:  1174.7227 | Validation Loss:   738.9935
Epoch   15 | Train Loss:  1048.0580 | Validation Loss:   590.1040
Epoch   16

In [8]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model_prepu.parameters())
evaluate_model(model_prepu, custom_train_loader, criterion, optimizer)

Epoch    1 | Train Loss:  9947.8263 | Validation Loss:  8573.0011
Epoch    2 | Train Loss:  4872.8469 | Validation Loss:  3786.1320
Epoch    3 | Train Loss:  3302.2997 | Validation Loss:  1102.5246
Epoch    4 | Train Loss:   474.7575 | Validation Loss:   232.0756
Epoch    5 | Train Loss:   163.7916 | Validation Loss:     5.5412
Epoch    6 | Train Loss:     9.3231 | Validation Loss:     4.7504
Epoch    7 | Train Loss:     7.7992 | Validation Loss:     2.7427
Epoch    8 | Train Loss:     5.7084 | Validation Loss:     2.1191
Epoch    9 | Train Loss:     3.9655 | Validation Loss:     1.2975
Epoch   10 | Train Loss:     4.5550 | Validation Loss:     1.1546
Epoch   11 | Train Loss:     2.8098 | Validation Loss:     3.7431
Epoch   12 | Train Loss:     3.2621 | Validation Loss:     1.2849
Epoch   13 | Train Loss:     3.8715 | Validation Loss:     3.5378
Epoch   14 | Train Loss:     1.5759 | Validation Loss:     1.1625
Epoch   15 | Train Loss:     1.9998 | Validation Loss:     0.3459
Epoch   16

In [15]:
for name, param in model_prepu.named_parameters():
    if 'scale' in name.lower():
        print(f"Weights for {name}: {param.data}")
        print(f"Power for {name}: {F.softsign(param.data) + 1}")
        print()

Weights for layers.1.scale: tensor([0.4539, 0.0359, 0.0324, 0.5567, 0.4577, 0.4319, 0.8211, 0.0226])
Power for layers.1.scale: tensor([1.3122, 1.0346, 1.0314, 1.3576, 1.3140, 1.3016, 1.4509, 1.0221])

Weights for layers.3.scale: tensor([0.0088, 0.6225, 0.0000, 0.0000, 0.4588, 0.3729, 0.0000, 0.4976])
Power for layers.3.scale: tensor([1.0088, 1.3836, 1.0000, 1.0000, 1.3145, 1.2716, 1.0000, 1.3323])

Weights for layers.5.scale: tensor([ 0.3419,  0.0000,  0.3977, -0.8998,  0.4199,  0.0409,  0.2212,  0.4863])
Power for layers.5.scale: tensor([1.2548, 1.0000, 1.2846, 0.5264, 1.2957, 1.0393, 1.1812, 1.3272])

Weights for layers.7.scale: tensor([-0.3667,  0.3235, -0.1554,  0.5459,  0.4005,  0.0455,  0.3236, -0.6362])
Power for layers.7.scale: tensor([0.7317, 1.2444, 0.8655, 1.3531, 1.2859, 1.0436, 1.2445, 0.6112])



In [None]:
# TODO
# Read IEEE Improved Polynomial Neural Networks with Normalised Activations (can help give an idea on gradient instablity)
# Look up polynomial activation function
# Test on more realistic dataset
# Look up rectified power unit
# Read Why Rectified Power (RePU) Activation Functions Are Efficient in Deep Learning: A Theoretical Explanation
# Do more research on polynomial activation function or power activation function
# Read https://www.researchgate.net/publication/330295973_Piecewise_Polynomial_Activation_Functions_for_Feedforward_Neural_Networks
# Consensus ChatGPT "research on polynomial activation function or power activation function"
# http://sigtbd.csail.mit.edu/pubs/2018/sigtbd18-paper-4.pdf 