In [3]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Setting the Device

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cpu device


# Defining a Neural Network

In [70]:
# Create a class for a feedforward neural network with one hidden layer.
# fc1 (fully-connected layer)
# define each layer using nn.Linear as this is the linearity part of the model, the activation func is the bit that adds non-linearity

class SampleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.2):
        super(SampleNN, self).__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size, bias=True)
        self.fc2 = nn.Linear(hidden_size, output_size, bias=True)
        self.dropout_prob = dropout_prob

    def forward(self, x):
        # layer 1
        x = self.fc1(x)
        x = func.relu(x) # hidden layer activation function
#         x = nn.Dropout(p=self.dropout_prob)
        # layer 2
        x = self.fc2(x)
#         x = func.sigmoid(x) # output layer activation function if required
        return x
    
    def fit(self, X, y, num_epochs=500, patience=10):
        for epoch in range(num_epochs):
            # Forward pass
            output = self.forward(X)

            # Compute the loss
            loss = loss_function(output, y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()

In [71]:
# Create an instance of the SampleNN class.
sample_model = SampleNN(5,3,1, dropout_prob=0.2)
sample_model

SampleNN(
  (fc1): Linear(in_features=5, out_features=3, bias=True)
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

#### Hyperparameters - initialisation

Here is where we define:
- The number of layers
- Number of neurons in each layer
- The activation functions of each layer
- Whether a bias term is added (to prevent overfitting)
- Dropout (on/off) and magnitude of dropout:
This 'turns off' neurons in order to prevent overfitting and introduce further non-linearity. This takes it from being fully-connected to not.

### Activation functions for hidden layers

#### ReLU (Rectified Linear Unit):
- Formula: f(x) = max(0,x)
- Pros: Fast to compute, reduces vanishing gradient problem, generally works well in practice for deep networks.
- Cons: Can suffer from "dead neurons" where some neurons never activate and therefore never update
<br>

#### Leaky ReLU:
- Formula: f(x) = x for x>0 and f(x) = ax for x<=0
- Pros: Addresses the "dead neuron" problem of ReLU by allowing a small gradient for negative values.

### Activation functions for output layers

#### Binary classification
- Neurons: 1
- Activation Function: Sigmoid (to squash output between 0 and 1, representing the probability of the positive class).
- Example: Email spam detection (spam or not spam).
<br>

#### Multi-class classification (single label)
- Neurons: Equal to the number of classes.
- Activation Function: Softmax (to output a probability distribution over multiple classes).
- Example: Handwritten digit recognition (0 to 9).
<br>

#### Multi-label classification
- Neurons: Equal to the number of classes.
- Activation Function: Sigmoid for each neuron (since each class prediction is treated as an independent binary classification).
- Example: Image tagging (an image can have multiple tags like "beach", "sunset", "people").
<br>

#### Regression
- Neurons: 1 (for single-output regression) or more (for multi-output regression).
- Activation Function: Usually linear or none. However, if the regression output has a known bounded range, activation functions like sigmoid or tanh might be used to bound the output.
- Example: Predicting house prices
<br>

#### Time-series forecasting
- Neurons: Can vary depending on the forecasting horizon (e.g., next value, next 10 values).
- Activation Function: Often linear, especially if the values can range widely. For bounded values, other activations might be used.
- Example: Stock price prediction for the next week
<br>

#### Autoencoders (for dimensionality reduction, denoising, etc.)
- Neurons: Varies based on the desired reduced dimensionality or the structure of the data.
- Activation Function: Could be linear, sigmoid, tanh, etc., depending on the nature of the data and the specific use-case.
- Example: Image denoising
<br>

#### Generative Models (like GANs)
- Neurons: Varies based on the structure and size of the data being generated.
- Activation Function: Can vary, but for image generation tasks, the tanh activation is commonly used for the generator's output layer.
- Example: Generating art images
<br>

#### Sequence-to-Sequence Problems (like translation)
- Neurons: Often depends on the vocabulary size of the target language (for tasks like translation) or other sequence length details.
- Activation Function: Softmax, especially when predicting tokens from a vocabulary.
- Example: Translating English to French

# Optimizing the Model

In [72]:
# Define a loss function (MSE loss) and an optimizer (SGD) for the model.
loss_function = nn.MSELoss()
optimizer = optim.SGD(sample_model.parameters(), lr=0.001, weight_decay=0.001, momentum=0.9) # here we add learning rate, momentum, dampening etc

#### Hyperparameters - optimisation

- Optimiser:
E.g. SGD, Adam
<br>

- Loss function:
E.g. MSE, cross-entropy
<br>

- Learning rate:
How quickly the model should descend, bearing in mind that a low learning rate will have a slow descent but will converge smoothly. (Although runs the risk of overfitting by tracking noise. This can be offset partially by introducing momentum). A high learning rate will be quick but may never converge or find a minimum loss, resulting in high bias.
<br>
- Momentum: 
Helps to find actual direction of gradient descent, smoothing out variations such as noise, saddle points or plateaus. Momentum is given as a fraction `(gamma/beta)` and used iteravely, updating with each epoch. The proportion of momentum to the `current gradient*learning rate` is given by the fraction. Using momentum helps propel the optimiser in the direction of previously accumulated gradients. <br>

`velocity = gamma * velocity + learning_rate * gradient` <br>
`new_parameters = old_parameters - velocity`
<br>
- Batch size:
What proportion of training data samples are used in SGD (remember S in SGD is stochastic, which means computing derivatives of a proportion of the data at random - less computationally expensive with minimal trade off in accuracy)
<br>

- Weight decay (on/off) and weight (magnitude of penalty):
L2 regularisation - This adds a penalty to the loss function to prevent overfitting - basically introducing a small bias

# Training the Model

In [73]:
# Create some sample data for training.
input_data = torch.rand(10000, 5)  # 10000 samples, each with 5 features
target_data = torch.rand(10000, 1)  # 10000 samples, each with 1 target

X_train, X_val_test, y_train, y_val_test = train_test_split(input_data, target_data, test_size=0.5)

# use fit module of SampleNN class to train
sample_model.fit(X_train, y_train, num_epochs=500, patience=10)

#### Hyperparameters

- Requires grad:
Initialising data points with a requires_grad hyperparameters to make sure they are optimisable during training using gradient descent
<br>
- Number of epochs:
The amount of forwards/backwards passes the model should make
<br>
- Early stopping rules:
If no improvement is made after x epochs (known as patience) of the same loss value, then stop training

# Evaluating the Model

In [74]:
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5)

# regression
score = mean_squared_error(y_val.detach().numpy(), sample_model(X_val).detach().numpy())
score

# # classification
# score = accuracy_score(y_val.detach().numpy(), sample_model(X_val).detach().numpy())

0.083524615

`Choose hyperparameters on the validation set`<br>
`Get final score on the test set`

#### Hyperparameters - Validating and Testing

- Cross-validation:
Should we cross-validate or not?
- Hyperparameter optimisation:
Could we use bayesian optimisation libraries to find the optimal hyperparameters?

# Make Predictions

In [75]:
# Create a new input tensor for prediction.
new_input = torch.rand(1, 5)

# Make a prediction using the trained model.
prediction = sample_model(new_input)

# Print the prediction.
print("Prediction:", prediction)

Prediction: tensor([[0.4907]], grad_fn=<AddmmBackward0>)


# Deployment

In [53]:
# Save the model
torch.save(sample_model.state_dict(), "model.pth")

In [54]:
# Loading the model
model = sample_model.to(device)
model.load_state_dict(torch.load("model.pth"))
model.eval()

SampleNN(
  (fc1): Linear(in_features=5, out_features=3, bias=True)
  (fc2): Linear(in_features=3, out_features=1, bias=True)
)

In [57]:
new_data = torch.rand(1, 5)  # 1 sample with 5 features

# Convert the data to a PyTorch tensor
input_tensor = torch.FloatTensor(new_data)

# If you used a GPU during training, move the input tensor to the same device
if torch.cuda.is_available():
    input_tensor = input_tensor.to('cuda')

# Get the model's predictions
with torch.no_grad():  # Needs to not be tracked with PyTorch autograd - don't want this sample to have impact on weights
    outputs = model(input_tensor)

print(new_data, outputs)

tensor([[0.5874, 0.1977, 0.0122, 0.9257, 0.5728]]) tensor([[0.5074]])
