In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# <ins>Activations:</ins>

### <ins>Sigmoid:</ins>
$ \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)} $

### <ins>Softmax:</ins>
$ \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} $

# <ins>Loss Functions:</ins>

### <ins>Binary Cross-Entropy (BCE) Loss:</ins>
$ \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] $

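### <ins>Cross-Entropy Loss:</ins>
The softmax function converts the raw model outputs (logits) into probabilities. Cross-entropy loss measures the dissimilarity between this predicted probability distribution and the true distribution (one-hot encoded labels). Note that PyTorch's `nn.CrossEntropyLoss` applies the (log-)softmax internally, so it expects the raw logits $x$ as input, as in the formula below.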

$ \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
          l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
          \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\} $

#### <ins>Example (Cross-Entropy Loss):</ins>

Let the true labels $y$ be one-hot encoded as follows:

$ y = \begin{bmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \end{bmatrix} \quad \text{(one-hot encoded labels)} $

Assume the model predictions $x$ are as follows:

$ x = \begin{bmatrix} 1.5 & -0.8 & 0.3 \\ -0.4 & 2.1 & -1.0 \end{bmatrix} \quad \text{(logits)} $

Now, the cross-entropy loss for each sample is given by:

$ L = \begin{bmatrix} l_1 \\ l_2 \end{bmatrix} $

$ l_1 = -\sum_{c=1}^C y_{1,c} \log\left(\frac{\exp(x_{1,c})}{\sum_{k=1}^C \exp(x_{1,k})}\right) = -y_{1,1} \log\left(\frac{\exp(1.5)}{\exp(1.5) + \exp(-0.8) + \exp(0.3)}\right) \approx 0.34 $

$ l_2 = -\sum_{c=1}^C y_{2,c} \log\left(\frac{\exp(x_{2,c})}{\sum_{k=1}^C \exp(x_{2,k})}\right) = -y_{2,2} \log\left(\frac{\exp(2.1)}{\exp(-0.4) + \exp(2.1) + \exp(-1.0)}\right) \approx 0.12 $

So, $ L = \begin{bmatrix} 0.34 \\ 0.12 \end{bmatrix} $ would be the vector of losses for this example.

### <ins>Mean-Squared Error (MSE) Loss:</ins>
$ \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left( x_n - y_n \right)^2 $

# <ins>Definitions:</ins>

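### <ins>Logits:</ins>
$ \text{logit}_i = \log\left(\frac{p_i}{1 - \sum_{j=1}^{K-1} p_j}\right) = \log\left(\frac{p_i}{p_K}\right) $,
the log-odds of class $i$ relative to the reference class $K$: the denominator is $p_K$, i.e. one minus the probabilities of the first $K-1$ classes (so $\text{logit}_K = 0$). For $K = 2$ this reduces to the binary log-odds $\log\frac{p}{1 - p}$.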

### <ins>Sigmoid Probability:</ins>
$ p = \frac{1}{1 + \exp(-\text{logit})} $


### <ins>Softmax Probability:</ins>
$ p_i = \frac{\exp(\text{logit}_i)}{\sum_{j=1}^{K} \exp(\text{logit}_j)} $,
all classes including the $i$-th class in denominator

In [None]:
class UniversalPerceptron(nn.Module):
    """Single linear layer whose output activation and loss depend on the task.

    Supported ``task_type`` values and the resulting configuration:

    * ``"binary_classification"``: sigmoid output paired with ``nn.BCELoss``,
      so ``forward`` returns probabilities in [0, 1].
    * ``"multi_class_classification"``: identity output paired with
      ``nn.CrossEntropyLoss``, so ``forward`` returns raw logits. Use
      ``predict_proba`` to obtain softmax probabilities.
    * ``"regression"``: identity output paired with ``nn.MSELoss``.

    Args:
        input_size: Number of input features of the linear layer.
        output_size: Number of outputs of the linear layer.
        task_type: One of the three task names listed above.

    Raises:
        ValueError: If ``task_type`` is not one of the supported names.
    """

    def __init__(self, input_size, output_size, task_type="binary_classification"):
        super(UniversalPerceptron, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

        if task_type == "binary_classification":
            self.activation = nn.Sigmoid()
            self.loss_function = nn.BCELoss()
        elif task_type == "multi_class_classification":
            # nn.CrossEntropyLoss applies log-softmax to its input itself.
            # Feeding it softmax probabilities would normalise twice, which
            # yields a wrong loss value and flattened gradients, so the model
            # must hand over the raw logits here.
            self.activation = nn.Identity()
            self.loss_function = nn.CrossEntropyLoss()
        elif task_type == "regression":
            self.activation = nn.Identity()
            self.loss_function = nn.MSELoss()
        else:
            raise ValueError(
                "Invalid task_type. Supported types are 'binary_classification', 'multi_class_classification', and 'regression'."
            )

    def forward(self, x):
        """Apply the linear layer followed by the task-specific activation."""
        x = self.fc(x)
        x = self.activation(x)
        return x

    def predict_proba(self, x):
        """Return class probabilities for the classification tasks.

        For the multi-class task the logits produced by ``forward`` are
        normalised with a softmax over dimension 1. For every other task the
        output of ``forward`` is returned unchanged (already sigmoid
        probabilities for binary classification, raw values for regression).
        """
        outputs = self(x)
        if isinstance(self.loss_function, nn.CrossEntropyLoss):
            return torch.softmax(outputs, dim=1)
        return outputs


class UniversalPerceptronTrainer:
    """Pairs a model with an optimizer for full-batch training and inference.

    The model must expose a ``loss_function`` attribute: it is used to compute
    the training loss and to decide how raw model outputs are converted into
    predictions.
    """

    def __init__(self, model, optimizer):
        self.optimizer = optimizer
        self.model = model

    def train(self, inputs, labels, epochs=100):
        """Run ``epochs`` optimisation steps on the whole ``inputs`` batch.

        The current loss is printed after every tenth epoch.
        """
        report_every = 10

        for epoch in range(epochs):
            # backward() adds to any gradients already stored on the
            # parameters, so they have to be cleared before each new pass.
            self.optimizer.zero_grad()

            # Forward pass, then score it against the targets.
            predicted = self.model(inputs)
            criterion = self.model.loss_function
            loss = criterion(predicted, labels)

            # Backpropagation fills the .grad attribute of every parameter;
            # the optimizer then applies the update.
            loss.backward()
            self.optimizer.step()

            if (epoch + 1) % report_every:
                continue
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

    def predict(self, inputs):
        """Return predictions for ``inputs`` without tracking gradients.

        * ``nn.BCELoss`` model: 1.0 where the output is >= 0.5, else 0.0.
        * ``nn.CrossEntropyLoss`` model: index of the largest output per row.
        * any other loss: the raw model outputs.
        """
        # No autograd graph is needed for inference.
        with torch.no_grad():
            raw = self.model(inputs)
            criterion = self.model.loss_function

            if isinstance(criterion, nn.BCELoss):
                return raw.ge(0.5).float()
            if isinstance(criterion, nn.CrossEntropyLoss):
                return raw.argmax(dim=1)
            return raw

In [None]:
# Example usage
if __name__ == "__main__":
    # Seed NumPy (synthetic data) and PyTorch (weight initialisation) so that
    # "Restart Kernel -> Run All" reproduces the same losses and predictions.
    np.random.seed(42)
    torch.manual_seed(42)

    input_size = 2
    data_size = 100
    num_classes = 3

    # Generate some random training data
    inputs = torch.tensor(np.random.rand(data_size, input_size), dtype=torch.float32)
    labels_binary_cls = torch.tensor(
        np.random.randint(2, size=(data_size, 1)), dtype=torch.float32
    )
    # nn.CrossEntropyLoss accepts either integer class indices (shape [N]) or
    # per-class probabilities (shape [N, C]). The indices are drawn once and
    # the one-hot form is derived from them, so both encodings describe the
    # same labels instead of one silently overwriting the other.
    labels_multi_cls_idx = torch.tensor(
        np.random.randint(num_classes, size=(data_size)), dtype=torch.long
    )
    labels_multi_cls = F.one_hot(
        labels_multi_cls_idx,
        num_classes=num_classes,
    ).float()
    labels_regression = torch.tensor(np.random.rand(data_size, 1), dtype=torch.float32)

    # Create a UniversalPerceptron model for binary classification
    binary_cls_model = UniversalPerceptron(
        input_size, 1, task_type="binary_classification"
    )

    # Define optimizer
    binary_cls_optimizer = optim.SGD(binary_cls_model.parameters(), lr=0.01)

    # Create a UniversalPerceptronTrainer instance for binary classification
    binary_cls_trainer = UniversalPerceptronTrainer(
        binary_cls_model, binary_cls_optimizer
    )

    # Train the model for binary classification
    binary_cls_trainer.train(inputs, labels_binary_cls, epochs=100)

    # Test the trained model with new data for binary classification
    test_inputs = torch.tensor(np.random.rand(5, input_size), dtype=torch.float32)
    binary_cls_predictions = binary_cls_trainer.predict(test_inputs)
    print("Binary Classification Predictions:", binary_cls_predictions)

    # Create a UniversalPerceptron model for multi-class classification
    multi_cls_model = UniversalPerceptron(
        input_size, num_classes, task_type="multi_class_classification"
    )

    # Define optimizer
    multi_cls_optimizer = optim.SGD(multi_cls_model.parameters(), lr=0.01)

    # Create a UniversalPerceptronTrainer instance for multi-class classification
    multi_cls_trainer = UniversalPerceptronTrainer(multi_cls_model, multi_cls_optimizer)

    # Train the model for multi-class classification (one-hot / probability targets)
    multi_cls_trainer.train(inputs, labels_multi_cls, epochs=100)

    # Test the trained model with new data for multi-class classification
    multi_cls_predictions = multi_cls_trainer.predict(test_inputs)
    print("Multi-Class Classification Predictions:", multi_cls_predictions)

    # Create a UniversalPerceptron model for regression
    regression_model = UniversalPerceptron(input_size, 1, task_type="regression")

    # Define optimizer
    regression_optimizer = optim.SGD(regression_model.parameters(), lr=0.01)

    # Create a UniversalPerceptronTrainer instance for regression
    regression_trainer = UniversalPerceptronTrainer(
        regression_model, regression_optimizer
    )

    # Train the model for regression
    regression_trainer.train(inputs, labels_regression, epochs=100)

    # Test the trained model with new data for regression
    regression_predictions = regression_trainer.predict(test_inputs)
    print("Regression Predictions:", regression_predictions)