Here is a diagram of a neural network:

![image](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*3fA77_mLNiJTSgZFhYnU0Q.png)

In practice, this diagram is misleading:
- Data is typically passed through the network in batches.
- Data is represented as a tensor.
- The neural network itself is represented as a tensor of its weights.

Let's begin by importing libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from torch.utils.data import DataLoader, TensorDataset

Now load the Iris dataset. More information about it is available [here](https://archive.ics.uci.edu/dataset/53/iris).

In [None]:
# Fetch the Iris dataset, then unpack its feature array and label array
iris = load_iris()
X, y = iris.data, iris.target

In [None]:
# Check the dimensions of the feature array and the label array
X.shape, y.shape

((150, 4), (150,))

In [None]:
# Hold out 20% of the samples for testing. The fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Confirm the sizes of the training and test partitions
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [None]:
# Peek at the first training sample and its label
X_train[0], y_train[0]

(array([6.1, 3. , 4.6, 1.4]), 1)

In [None]:
# Convert NumPy arrays to PyTorch tensors.
# torch.tensor with an explicit dtype replaces the legacy typed constructors
# (torch.FloatTensor / torch.LongTensor): features become float32 tensors and
# class labels become int64 tensors, exactly as before.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [None]:
# Show the first training sample next to its label, now as tensors
X_train_tensor[0], y_train_tensor[0]

(tensor([6.1000, 3.0000, 4.6000, 1.4000]), tensor(1))

In [None]:
# Compare the container types before and after the conversion
type(X_train), type(y_train), type(X_train_tensor), type(y_train_tensor)

(numpy.ndarray, numpy.ndarray, torch.Tensor, torch.Tensor)

We encapsulate these features and labels in a Dataset and a DataLoader:
- A Dataset provides an abstraction for handling features and labels together.
- A DataLoader makes the Dataset iterable and also allows batching and shuffling of its data.

In [None]:
# Number of (X, y) pairs passed through the network per parameter update
batch_size = 5
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Reshuffle the training data so mini-batches differ from epoch to epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so keep the test set in a fixed order
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Displaying a TensorDataset only shows its default object repr, not its contents
test_dataset

<torch.utils.data.dataset.TensorDataset at 0x7dfbbdc6cd30>

Define the neural network architecture

In [None]:
class NeuralNetwork(nn.Module):
    """Feed-forward classifier: linear layer, ReLU, then a second linear layer."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Number of units in the hidden layer.
            num_classes: Number of output scores per sample.
        """
        super().__init__()
        # Layers are created in the order they are applied in forward()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the output of the last linear layer for the input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Hyperparameters
input_size = 4  # One input per feature in the Iris dataset
hidden_size = 6  # Number of units in the hidden layer
num_classes = 3  # Iris dataset has 3 classes (setosa, versicolor, virginica)
learning_rate = 0.001  # Step size passed to the optimizer
num_epochs = 1_000  # Number of full passes over the training set

# Build the network and display its layers
model = NeuralNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)
model

NeuralNetwork(
  (fc1): Linear(in_features=4, out_features=6, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=6, out_features=3, bias=True)
)

In [None]:
# Loss function: cross-entropy between the network's outputs and the integer class labels
criterion = nn.CrossEntropyLoss()
criterion

CrossEntropyLoss()

In [None]:
# Optimizer: Adam over all of the model's parameters, using the configured learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [None]:
# Training the model
model.train()  # Ensure training mode, e.g. when re-running after the evaluation cell
for epoch in range(num_epochs):
    running_loss = 0.0  # Sum of per-sample losses seen in this epoch
    seen = 0  # Number of samples seen in this epoch
    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward and optimize
        optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update parameters of Neural Network

        # The criterion averages over the batch, so weight by the batch size
        running_loss += loss.item() * batch_X.size(0)
        seen += batch_X.size(0)

    if (epoch + 1) % 100 == 0:
        # Report the mean loss over the whole epoch; the loss of a single
        # 5-sample mini-batch is too noisy to show training progress.
        epoch_loss = running_loss / seen
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')
        # Shapes and loss of the last mini-batch, shown to illustrate batching
        print(batch_X.shape, batch_y.shape, loss)

Epoch [100/1000], Loss: 0.0798
torch.Size([5, 4]) torch.Size([5]) tensor(0.0798, grad_fn=<NllLossBackward0>)
Epoch [200/1000], Loss: 0.1266
torch.Size([5, 4]) torch.Size([5]) tensor(0.1266, grad_fn=<NllLossBackward0>)
Epoch [300/1000], Loss: 0.1273
torch.Size([5, 4]) torch.Size([5]) tensor(0.1273, grad_fn=<NllLossBackward0>)
Epoch [400/1000], Loss: 0.0239
torch.Size([5, 4]) torch.Size([5]) tensor(0.0239, grad_fn=<NllLossBackward0>)
Epoch [500/1000], Loss: 0.1591
torch.Size([5, 4]) torch.Size([5]) tensor(0.1591, grad_fn=<NllLossBackward0>)
Epoch [600/1000], Loss: 0.0026
torch.Size([5, 4]) torch.Size([5]) tensor(0.0026, grad_fn=<NllLossBackward0>)
Epoch [700/1000], Loss: 0.0779
torch.Size([5, 4]) torch.Size([5]) tensor(0.0779, grad_fn=<NllLossBackward0>)
Epoch [800/1000], Loss: 0.0032
torch.Size([5, 4]) torch.Size([5]) tensor(0.0032, grad_fn=<NllLossBackward0>)
Epoch [900/1000], Loss: 0.0005
torch.Size([5, 4]) torch.Size([5]) tensor(0.0005, grad_fn=<NllLossBackward0>)
Epoch [1000/1000], 

In [None]:
# Testing the model
model.eval()  # Switch the model to evaluation mode
correct = 0
total = 0
with torch.no_grad():  # Gradients are not needed when only measuring accuracy
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)  # batch_size x 3
        predicted = outputs.argmax(dim=1)  # Index of the largest of the 3 values per sample
        total += batch_y.shape[0]
        correct += (predicted == batch_y).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')

Test Accuracy: 1.00
