In [None]:
''' 
Aim: Solve the XOR problem using a deep feedforward network and
implement gradient-based learning.

XOR problem: A function that outputs 1 when exactly one of its two inputs is 1 (and the other is 0), and 0 otherwise.
XOR is not linearly separable, which means a simple linear model can't solve it.

Deep FeedForward network:
A feedforward network is a type of neural network where the data flows in one direction, from input to output.
It consists of an input layer, one or more hidden layers and an output layer.
We need to use a non-linear activation function like sigmoid or ReLU in the hidden layer to capture the non-linearity.

Gradient-based learning (Backpropagation):
The goal is to minimize the loss function by using gradient descent to adjust the weights and biases.
Backpropagation computes the gradients of the loss with respect to the weights using the chain
rule; the optimizer then uses those gradients to update the weights iteratively.
'''

import torch
import torch.nn as nn #Helps in creating neural network layers
import torch.optim as optim #Optimizers to improve the model

# XOR truth table as float32 tensors.
# X : every combination of two input bits, in the order 00, 01, 10, 11
# Y : the XOR of each input pair, one target per row
X = torch.tensor(
    [[a, b] for a in (0, 1) for b in (0, 1)],
    dtype=torch.float32,
)
Y = torch.tensor(
    [[a ^ b] for a in (0, 1) for b in (0, 1)],
    dtype=torch.float32,
)

# Define the feedforward neural network
class XORNet(nn.Module):  # nn.Module: base class for all neural networks
    """Feedforward network with one sigmoid hidden layer and a sigmoid output.

    The defaults (2 inputs, 4 hidden neurons, 1 output) reproduce the
    network used for the XOR problem; the sizes can be overridden to reuse
    the same architecture on other small binary problems.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of neurons in the hidden layer.
        output_size: Number of output units.
    """

    def __init__(self, input_size: int = 2, hidden_size: int = 4,
                 output_size: int = 1) -> None:
        super().__init__()
        # Layers are created in the same order as before (hidden, then
        # output) so a seeded run initializes the weights identically.
        self.hidden = nn.Linear(input_size, hidden_size)  # input -> hidden
        self.output = nn.Linear(hidden_size, output_size)  # hidden -> output

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension has ``input_size`` features.

        Returns:
            Tensor with ``output_size`` values per sample, each squashed
            by the sigmoid into the range 0..1.
        """
        # The non-linear hidden activation is what lets the network model XOR.
        x = torch.sigmoid(self.hidden(x))
        # Sigmoid on the output yields a probability-like value for BCELoss.
        x = torch.sigmoid(self.output(x))
        return x

# Build the network and pair it with a loss function and an optimizer.
model = XORNet()

# Binary cross-entropy: measures the error between the predicted
# probability and the 0/1 target.
criterion = nn.BCELoss()
# Adam optimizer with a base learning rate of 0.1.
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop: every epoch is one full pass over the four XOR samples.
epochs = 5000
for epoch in range(epochs):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()
    # Forward pass, then measure how far the predictions are from Y.
    outputs = model(X)
    loss = criterion(outputs, Y)
    # Backward pass (autograd applies the chain rule), then update weights.
    loss.backward()
    optimizer.step()

    # Only report the loss on every 500th epoch (0, 500, ..., 4500).
    if epoch % 500 != 0:
        continue
    print(f'Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}')

# Test the model on the same four inputs; gradients are not needed here.
with torch.no_grad():
    predictions = model(X)
    # Thresholding: a prediction above 0.5 becomes 1.0, anything else 0.0.
    predicted_labels = predictions.gt(0.5).float()
print("\nPredictions:\n", predicted_labels)


Epoch [0/5000], Loss: 0.7037
Epoch [500/5000], Loss: 0.0016
Epoch [1000/5000], Loss: 0.0005
Epoch [1500/5000], Loss: 0.0003
Epoch [2000/5000], Loss: 0.0002
Epoch [2500/5000], Loss: 0.0001
Epoch [3000/5000], Loss: 0.0001
Epoch [3500/5000], Loss: 0.0000
Epoch [4000/5000], Loss: 0.0000
Epoch [4500/5000], Loss: 0.0000

Predictions:
 tensor([[0.],
        [1.],
        [1.],
        [0.]])
