In [None]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt




# ---- Network configuration ----
# Input and output dimensionality: scalar in, scalar out.
input_size, output_size = 1, 1

# Number of Gaussian and sigmoid units (named after the SimpleFNN arguments).
num_gaussians, num_sigmoids = 50, 0

# Widths of the fully connected hidden layers (two layers of 50 units).
hidden_sizes = [50, 50]

In [None]:
class SimpleFNN(nn.Module):
    """MLP fed by learnable Gaussian and sigmoid features of a scalar input.

    The input is flattened to a column of scalars. Each scalar is expanded into
    ``num_gaussians`` Gaussian bumps ``exp(-0.5 * ((x - mean) / sigma) ** 2)`` and
    ``num_sigmoids`` logistic units ``sigmoid((x - origin) / scale)``. The
    concatenated features are passed through a stack of ``nn.Linear`` layers
    (each followed by the chosen activation) and a final linear output layer.

    Args:
        num_gaussians: Number of Gaussian feature units (may be 0).
        num_sigmoids: Number of sigmoid feature units (may be 0).
        hidden_sizes: Iterable with the width of each hidden layer; an empty
            iterable connects the features directly to the output layer.
        output_size: Number of output units.
        activation: One of ``"relu"``, ``"tanh"`` or ``"gelu"``.

    Raises:
        KeyError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, num_gaussians, num_sigmoids, hidden_sizes, output_size, activation="relu"):
        super().__init__()
        self.num_gaussians = num_gaussians
        self.num_sigmoids = num_sigmoids

        # Learnable feature parameters. Widths are stored in log-space
        # (exponentiated in forward) so they stay strictly positive; the /4
        # keeps the initial sigmas and scales close to 1.
        self.means = nn.Parameter(torch.randn(num_gaussians))           # Gaussian centres
        self.origins = nn.Parameter(torch.randn(num_sigmoids))          # sigmoid midpoints
        self.log_sigmas = nn.Parameter(torch.randn(num_gaussians) / 4)  # log of Gaussian widths
        self.scales = nn.Parameter(torch.randn(num_sigmoids) / 4)       # log of sigmoid scales

        # ---- build an arbitrary stack of Linear layers ----
        input_dim = num_gaussians + num_sigmoids
        sizes = [input_dim] + list(hidden_sizes)
        self.hidden_layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(sizes[:-1], sizes[1:])
        )
        self.out = nn.Linear(sizes[-1], output_size)

        # ---- activation ----
        # ReLU: f(x) = max(0, x). Simple and efficient, but units can "die"
        #       when they output 0 for every input.
        # Tanh: f(x) = tanh(x). Zero-centred output in (-1, 1) and smooth, but
        #       can suffer from vanishing gradients.
        # GELU: f(x) = x * Phi(x). Smooth, but more computationally expensive.
        acts = {"relu": nn.ReLU(), "tanh": nn.Tanh(), "gelu": nn.GELU()}
        self.act = acts[activation]

        self.init_weights()

    def init_weights(self):
        """Draw all Linear weights and biases from N(0, 1), then clamp weights to [-1, 1].

        Only the weights are clamped; the biases keep their unclamped normal draw.
        """
        # Hidden layers first, output layer last: same RNG draw order as before.
        for layer in [*self.hidden_layers, self.out]:
            nn.init.normal_(layer.weight, mean=0, std=1)
            nn.init.normal_(layer.bias, mean=0, std=1)
            # In-place clamp without autograd tracking (replaces the `.data` access).
            with torch.no_grad():
                layer.weight.clamp_(-1, 1)

    def forward(self, x):
        """Evaluate the network on a batch of scalar inputs.

        Args:
            x: Tensor of any shape; it is flattened so that every element is
                treated as one scalar sample.

        Returns:
            Tensor of shape ``(x.numel(), output_size)``.
        """
        # reshape (not view) so that non-contiguous inputs are accepted as well.
        x = x.reshape(-1, 1)

        # ---- Gaussian features: (N, 1) broadcast against (1, G) -> (N, G) ----
        if self.num_gaussians > 0:
            mu = self.means.view(1, -1)
            sigma = torch.exp(self.log_sigmas).view(1, -1)
            gaussians = torch.exp(-0.5 * ((x - mu) / sigma) ** 2)
        else:
            # Zero-width placeholder that follows the input's dtype and device.
            gaussians = x.new_empty((x.size(0), 0))

        # ---- Sigmoid features: (N, 1) broadcast against (1, S) -> (N, S) ----
        if self.num_sigmoids > 0:
            origins = self.origins.view(1, -1)
            scales = torch.exp(self.scales).view(1, -1)
            # torch.sigmoid is numerically stable. The hand-written
            # 1 / (1 + exp(-z)) overflows for very negative z and yields NaN
            # gradients (0 * inf) in the backward pass.
            sigmoids = torch.sigmoid((x - origins) / scales)
        else:
            sigmoids = x.new_empty((x.size(0), 0))

        # Feature vector fed to the MLP.
        h = torch.cat([gaussians, sigmoids], dim=1)

        # ---- arbitrary-depth MLP: Linear followed by the non-linearity ----
        for layer in self.hidden_layers:
            h = self.act(layer(h))

        return self.out(h)

Exponential: fitting y = e^x on [-1, 1]

In [None]:
# Seed PyTorch's RNG so that the random parameter initialisation (and therefore
# the whole training run) is reproducible when this cell is re-executed.
torch.manual_seed(0)

# Small model for the exponential fit: one Gaussian feature unit, no sigmoid
# units, two hidden layers of 32 units each.
neural_net = SimpleFNN(
    num_gaussians=1,
    num_sigmoids=0,
    hidden_sizes=[32, 32],
    output_size=1,
    activation="relu",       # "relu" | "tanh" | "gelu"
)

# 100 evenly spaced sample points on [-1, 1]; unsqueeze adds the feature
# dimension, giving shape (100, 1).
x_eval = torch.linspace(-1, 1, 100).unsqueeze(1)

# Adam optimizer over all model parameters with learning rate 0.001.
optimizer = optim.Adam(neural_net.parameters(), lr=1e-3)

# Mean squared error between the predictions and the target values.
criterion = nn.MSELoss()

def l1_of_all_weights(model):
    """Return the summed absolute value of the weights of every nn.Linear in `model`.

    Biases are not included. Every ``nn.Linear`` submodule reachable through
    ``model.modules()`` contributes. The result is a 0-dim tensor that stays
    attached to the autograd graph, or the float ``0.0`` when the model
    contains no ``nn.Linear`` layer.
    """
    linear_weights = (
        module.weight for module in model.modules() if isinstance(module, nn.Linear)
    )
    # Start from 0.0 so the accumulation order and the empty-model result are unchanged.
    return sum((weight.abs().sum() for weight in linear_weights), 0.0)

# ---- training settings ----
num_epochs = 1000000  # number of optimisation steps (full-batch, so one step per epoch)
log_every = 10000     # print progress every `log_every` epochs
l1_lambda = 0.0       # L1 penalty coefficient; 0.0 disables it (0.002 was tried previously)

# Target function y = e^x. It does not depend on the model, so it is computed
# once instead of on every iteration.
y_true = torch.exp(x_eval)

previous_loss = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()  # zero the parameter gradients
    y_pred = neural_net(x_eval)

    loss = criterion(y_pred, y_true)
    if l1_lambda > 0:
        # Sum of absolute weights over every nn.Linear layer (hidden and
        # output); only evaluated when the penalty is actually enabled.
        l1_norm = l1_of_all_weights(neural_net)
        loss = loss + l1_lambda * l1_norm

    loss.backward()   # backpropagation: gradient of the loss w.r.t. every parameter
    optimizer.step()  # update the parameters from those gradients

    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss {loss.item():.6f}")
        print("Amplitudes:", neural_net.out.weight.detach())
        # NOTE(review): these two values are only recorded, never compared, so
        # no early stopping happens here. Stored detached so they do not keep
        # a reference to the autograd graph.
        current_loss = loss.detach()
        previous_loss = current_loss