<a href="https://colab.research.google.com/github/bertankofon/CharacterRecognition/blob/main/MiniProject1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [None]:
import numpy as np
from keras.datasets import mnist

# Load the MNIST train/test splits as (images, labels) pairs.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Rescale pixel values by 1/255, centre them by subtracting 0.5, and flatten
# every image into one 784-element row -- all in a single expression per split.
train_images = (train_images / 255 - 0.5).reshape(-1, 784)
test_images = (test_images / 255 - 0.5).reshape(-1, 784)

# Sanity check: expect (60000, 784) for training and (10000, 784) for test.
print("Training images shape:", train_images.shape)
print("Test images shape:", test_images.shape)


Training images shape: (60000, 784)
Test images shape: (10000, 784)


# Model Implementation

Case 1: Use tanh for all activations.

Case 2: Use ReLU for the hidden layer activation and sigmoid for the output layer activation.

Define Activation Functions and their derivatives

In [None]:
# Tanh activation and its derivative
def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Return the derivative of tanh at ``x``: ``1 - tanh(x)**2``, element-wise."""
    t = np.tanh(x)
    return 1 - t ** 2

# ReLU activation and its derivative
def relu(x):
    """Return ``max(0, x)`` element-wise (rectified linear unit)."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return the ReLU gradient mask: 1.0 where ``x > 0`` and 0.0 elsewhere."""
    is_positive = x > 0
    return is_positive.astype(float)

# Define the sigmoid activation function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is built from ``exp(-|x|)``, which never exceeds 1, so very
    negative inputs (below about -709 in float64) do not overflow ``np.exp``
    the way the direct formula does.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in [0, 1], cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value
    # written so that only the non-overflowing exponential is used.
    return np.where(x >= 0, 1.0, decay) / (1.0 + decay)

# Derivative of the sigmoid function for backpropagation
def sigmoid_derivative(x):
    """Return the derivative of the logistic sigmoid at ``x``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))``. ``sigmoid`` is evaluated
    once and reused instead of being computed twice on the same input.

    Args:
        x: Pre-activation value(s) accepted by ``sigmoid``.

    Returns:
        ``sigmoid(x) * (1 - sigmoid(x))``, element-wise.
    """
    s = sigmoid(x)
    return s * (1 - s)



Case 1: Use tanh for all activations.

In [None]:
# Seed NumPy's global RNG so the weight draws below are repeatable.
np.random.seed(42)

# Network dimensions.
input_size = 784  # MNIST images are 28x28 pixels, flattened to 784 features
hidden_size = 500  # hidden-layer width N; tunable (300, 500 or 1000)
num_classes = 10  # There are 10 classes for the digits 0-9

# Hidden layer: Gaussian weights scaled by 0.01 and zero biases.
# W1 is drawn before W2; both consume the same seeded RNG stream, so the
# order of these statements determines the initial values.
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))

# Output layer: same initialization scheme.
W2 = np.random.randn(hidden_size, num_classes) * 0.01
b2 = np.zeros((1, num_classes))

# Gradient-descent step size.
learning_rate = 1e-1 # tunable

# Forward pass using tanh
def forward_pass_tanh(X, W1, b1, W2, b2):
    """Run the two-layer network forward with tanh on both layers.

    Returns:
        ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden activation,
        output pre-activation and output activation, in that order.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = tanh(hidden_pre)
    output_pre = np.dot(hidden_act, W2) + b2
    output_act = tanh(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act


# Compute the cross-entropy loss
def compute_loss(Y, A2):
    """Return the mean negative log of each sample's true-class output.

    Args:
        Y: 1-D integer array of class indices, one per sample.
        A2: 2-D array of network outputs indexed ``[sample, class]``.

    Returns:
        The mean over samples of ``-log(max(p, 0) + 1e-9)``, where ``p`` is
        the output in the column of the sample's true class.
    """
    m = Y.shape[0]
    true_class_out = A2[np.arange(m), Y]
    # The tanh output layer produces values in [-1, 1]. The log of a negative
    # number is NaN, and a single NaN poisons the whole mean, so negative
    # outputs are floored at 0; the 1e-9 offset then keeps the log finite.
    # Non-negative outputs give exactly the same value as before.
    log_likelihood = -np.log(np.maximum(true_class_out, 0) + 1e-9)
    return np.sum(log_likelihood) / m


# Backward pass
def backward_pass_tanh(X, Y, Z1, A1, W2, A2):
    """Compute parameter gradients for the tanh network.

    Args:
        X: Input batch, one sample per row.
        Y: 1-D integer array of class indices, one per sample.
        Z1: Hidden-layer pre-activations from the forward pass.
        A1: Hidden-layer activations from the forward pass.
        W2: Output-layer weight matrix.
        A2: Output-layer activations from the forward pass (left unmodified).

    Returns:
        ``(dW1, db1, dW2, db2)``, each averaged over the batch.
    """
    m = X.shape[0]

    # Output-layer error term: outputs minus a one-hot encoding of the labels.
    # Copy first: assigning ``dZ2 = A2`` would alias the caller's array and
    # the in-place subtraction below would overwrite its activations.
    # NOTE(review): ``A2 - onehot`` is the softmax/cross-entropy gradient with
    # respect to Z2; for a tanh output the chain rule would normally include a
    # tanh-derivative factor -- confirm whether omitting it is intentional.
    dZ2 = A2.copy()
    dZ2[np.arange(m), Y] -= 1
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # Propagate the error back through W2 and the hidden tanh.
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * tanh_derivative(Z1)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# Parameters update function
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is updated with ``param -= learning_rate * grad`` (in place
    for NumPy arrays), in the order W1, b1, W2, b2.

    Returns:
        The updated ``(W1, b1, W2, b2)`` tuple.
    """
    params = [W1, b1, W2, b2]
    grads = (dW1, db1, dW2, db2)
    for idx, grad in enumerate(grads):
        params[idx] -= learning_rate * grad
    return tuple(params)

# Training loop
# Full-batch gradient descent: every epoch pushes the whole training set
# (train_images / train_labels from the data-preparation cell) through the
# tanh network once and then takes a single parameter step.
num_epochs = 10  # number of full passes over the training set; tunable
for epoch in range(num_epochs):
    # Forward pass with the current parameters.
    Z1, A1, Z2, A2 = forward_pass_tanh(train_images, W1, b1, W2, b2)

    # Report the loss of this epoch's forward pass (i.e. before the update).
    cost = compute_loss(train_labels, A2)
    print(f'Epoch {epoch + 1}, cost: {cost}')

    # Backward pass: gradients of the parameters for this batch.
    dW1, db1, dW2, db2 = backward_pass_tanh(train_images, train_labels, Z1, A1, W2, A2)

    # Gradient-descent step; the module-level parameters are rebound to the result.
    W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)


  log_likelihood = -np.log(A2[range(m), Y] + 1e-9)


Epoch 1, cost: nan
Epoch 2, cost: 1.8907039476022958
Epoch 3, cost: nan
Epoch 4, cost: 1.9986157126895001
Epoch 5, cost: 2.0060616021708118
Epoch 6, cost: 1.9155720148424171
Epoch 7, cost: 1.8710431564434014
Epoch 8, cost: 1.8141987957021388
Epoch 9, cost: nan
Epoch 10, cost: nan


Case 2: Use ReLU for the hidden layer activation and sigmoid for the output layer activation.

In [None]:
# Re-seed NumPy's global RNG so this cell starts from the same initial
# weights every time it is run.
np.random.seed(42)

# Network dimensions.
input_size = 784  # MNIST images are 28x28 pixels, flattened to 784 features
hidden_size = 500  # hidden-layer width N; tunable (300, 500 or 1000)
num_classes = 10  # There are 10 classes for the digits 0-9

# Hidden layer: Gaussian weights scaled by 0.01 and zero biases.
# W1 is drawn before W2; both consume the same seeded RNG stream, so the
# order of these statements determines the initial values.
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))

# Output layer: same initialization scheme.
W2 = np.random.randn(hidden_size, num_classes) * 0.01
b2 = np.zeros((1, num_classes))

# Gradient-descent step size.
learning_rate = 1e-1 # tunable

# Forward pass using ReLU and sigmoid
def forward_pass_relu_sigmoid(X, W1, b1, W2, b2):
    """Run the two-layer network forward: ReLU hidden layer, sigmoid output.

    Returns:
        ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden activation,
        output pre-activation and output activation, in that order.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)
    output_pre = np.dot(hidden_act, W2) + b2
    output_act = sigmoid(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act


# Compute the cross-entropy loss
def compute_loss(Y, A2):
    """Return the mean negative log of each sample's true-class output.

    Only the output in the true-class column of each row contributes; the
    other columns of ``A2`` are not read.

    Args:
        Y: 1-D integer array of class indices, one per sample.
        A2: 2-D array of network outputs indexed ``[sample, class]``.

    Returns:
        The mean over samples of ``-log(max(p, 1e-9))``, where ``p`` is the
        true-class output.
    """
    m = Y.shape[0]
    true_class_out = A2[np.arange(m), Y]
    # An output of exactly 0 would make -log(0) infinite and swamp the mean.
    # Flooring at 1e-9 bounds the per-sample loss and leaves every output
    # that is already >= 1e-9 untouched.
    log_likelihood = -np.log(np.maximum(true_class_out, 1e-9))
    return np.sum(log_likelihood) / m

# Backward pass using ReLU and sigmoid
def backward_pass_relu_sigmoid(X, Y, Z1, A1, W2, A2):
    """Compute parameter gradients for the ReLU/sigmoid network.

    Args:
        X: Input batch, one sample per row.
        Y: 1-D integer array of class indices, one per sample.
        Z1: Hidden-layer pre-activations from the forward pass.
        A1: Hidden-layer activations from the forward pass.
        W2: Output-layer weight matrix.
        A2: Output-layer activations from the forward pass (left unmodified).

    Returns:
        ``(dW1, db1, dW2, db2)``, each averaged over the batch.
    """
    m = X.shape[0]

    # Output-layer error term: outputs minus a one-hot encoding of the labels.
    # Copy first: assigning ``dZ2 = A2`` would alias the caller's array and
    # the in-place subtraction below would overwrite its activations.
    dZ2 = A2.copy()
    dZ2[np.arange(m), Y] -= 1
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # Propagate the error back through W2; the ReLU mask zeroes the gradient
    # of hidden units whose pre-activation was not positive.
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# Parameters update function
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Take one gradient-descent step on all four parameters.

    Parameters are processed in the order W1, b1, W2, b2, each with
    ``param -= learning_rate * grad`` (an in-place update for NumPy arrays).

    Returns:
        The updated ``(W1, b1, W2, b2)`` tuple.
    """
    updated = []
    for param, grad in zip((W1, b1, W2, b2), (dW1, db1, dW2, db2)):
        param -= learning_rate * grad
        updated.append(param)
    return tuple(updated)

# Training loop
# Full-batch gradient descent for Case 2 (ReLU hidden layer, sigmoid output).
# The loop must use the ReLU/sigmoid helpers defined in this cell; calling the
# tanh helpers here would simply retrain the Case 1 network.
num_epochs = 10  # number of full passes over the training set; tunable
for epoch in range(num_epochs):
    # Forward pass with the current parameters.
    Z1, A1, Z2, A2 = forward_pass_relu_sigmoid(train_images, W1, b1, W2, b2)

    # Report the loss of this epoch's forward pass (i.e. before the update).
    cost = compute_loss(train_labels, A2)
    print(f'Epoch {epoch + 1}, cost: {cost}')

    # Backward pass: gradients of the parameters for this batch.
    dW1, db1, dW2, db2 = backward_pass_relu_sigmoid(train_images, train_labels, Z1, A1, W2, A2)

    # Gradient-descent step; the module-level parameters are rebound to the result.
    W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)


  log_likelihood = -np.log(A2[range(m), Y])


Epoch 1, cost: nan
Epoch 2, cost: 1.8907039543580224
Epoch 3, cost: nan
Epoch 4, cost: 1.998615720263118
Epoch 5, cost: 2.006061609848726
Epoch 6, cost: 1.9155720218694516
Epoch 7, cost: 1.8710431631992095
Epoch 8, cost: 1.8141988021171056
Epoch 9, cost: nan
Epoch 10, cost: nan


In [None]:
#NAIVE CODE

import numpy as np

# Re-seed NumPy's global RNG so the weight initialization further down in this
# cell is repeatable every time the cell is run.
np.random.seed(42)

# Define the sigmoid activation function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is built from ``exp(-|x|)``, which never exceeds 1, so very
    negative inputs (below about -709 in float64) do not overflow ``np.exp``
    the way the direct formula does.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in [0, 1], cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value
    # written so that only the non-overflowing exponential is used.
    return np.where(x >= 0, 1.0, decay) / (1.0 + decay)

# Derivative of the sigmoid function for backpropagation
def sigmoid_derivative(x):
    """Return the derivative of the logistic sigmoid at ``x``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))``. ``sigmoid`` is evaluated
    once and reused instead of being computed twice on the same input.

    Args:
        x: Pre-activation value(s) accepted by ``sigmoid``.

    Returns:
        ``sigmoid(x) * (1 - sigmoid(x))``, element-wise.
    """
    s = sigmoid(x)
    return s * (1 - s)

# Network dimensions.
input_size = 784  # MNIST images are 28x28 pixels, flattened to 784 features
hidden_size = 500  # hidden-layer width N; tunable (300, 500 or 1000)
num_classes = 10  # There are 10 classes for the digits 0-9

# Hidden layer: Gaussian weights scaled by 0.01 and zero biases.
# W1 is drawn before W2; both consume the same seeded RNG stream, so the
# order of these statements determines the initial values.
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))

# Output layer: same initialization scheme.
W2 = np.random.randn(hidden_size, num_classes) * 0.01
b2 = np.zeros((1, num_classes))

# Gradient-descent step size.
learning_rate = 1e-1 # tunable

# Forward pass
def forward_pass(X, W1, b1, W2, b2):
    """Run the two-layer network forward: sigmoid hidden layer, softmax output.

    Args:
        X: Input batch, one sample per row.
        W1, b1: Hidden-layer weights and bias.
        W2, b2: Output-layer weights and bias.

    Returns:
        ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden activation,
        output pre-activation and the row-wise softmax of ``Z2``.
    """
    Z1 = np.dot(X, W1) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(A1, W2) + b2

    # Softmax is unchanged by subtracting a per-row constant. Shifting by the
    # row maximum keeps every exponent <= 0, so np.exp cannot overflow to inf
    # (which would turn the normalization into inf / inf = NaN). The
    # exponentials are also computed once instead of twice.
    exp_shifted = np.exp(Z2 - np.max(Z2, axis=1, keepdims=True))
    A2 = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    return Z1, A1, Z2, A2

# Compute the cross-entropy loss
def compute_loss(Y, A2):
    """Return the mean cross-entropy of the predicted class probabilities.

    Args:
        Y: 1-D integer array of class indices, one per sample.
        A2: 2-D array of predicted probabilities indexed ``[sample, class]``.

    Returns:
        The mean over samples of ``-log(max(p, 1e-9))``, where ``p`` is the
        probability assigned to the sample's true class.
    """
    m = Y.shape[0]
    true_class_prob = A2[np.arange(m), Y]
    # A probability that underflows to exactly 0 would make -log(0) infinite
    # and swamp the mean. Flooring at 1e-9 bounds the per-sample loss and
    # leaves every probability that is already >= 1e-9 untouched.
    log_likelihood = -np.log(np.maximum(true_class_prob, 1e-9))
    return np.sum(log_likelihood) / m

# Backward pass
def backward_pass(X, Y, Z1, A1, W2, A2):
    """Compute parameter gradients for the sigmoid/softmax network.

    Args:
        X: Input batch, one sample per row.
        Y: 1-D integer array of class indices, one per sample.
        Z1: Hidden-layer pre-activations from the forward pass.
        A1: Hidden-layer activations from the forward pass.
        W2: Output-layer weight matrix.
        A2: Softmax outputs from the forward pass (left unmodified).

    Returns:
        ``(dW1, db1, dW2, db2)``, each averaged over the batch.
    """
    m = X.shape[0]

    # Softmax + cross-entropy gradient w.r.t. Z2: probabilities minus a
    # one-hot encoding of the labels. Copy first: assigning ``dZ2 = A2`` would
    # alias the caller's array and the in-place subtraction below would
    # overwrite its probabilities.
    dZ2 = A2.copy()
    dZ2[np.arange(m), Y] -= 1
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # Propagate the error back through W2 and the hidden sigmoid.
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * sigmoid_derivative(Z1)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# Parameters update function
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Move every parameter one step against its gradient.

    The update ``param -= learning_rate * grad`` is applied to W1, b1, W2 and
    b2 in that order (in place for NumPy arrays).

    Returns:
        The updated ``(W1, b1, W2, b2)`` tuple.
    """
    weights = [W1, b1, W2, b2]
    for slot, gradient in enumerate((dW1, db1, dW2, db2)):
        weights[slot] -= learning_rate * gradient
    return tuple(weights)

# Training loop
# Full-batch gradient descent: every epoch pushes the whole training set
# (train_images / train_labels from the data-preparation cell) through the
# sigmoid/softmax network once and then takes a single parameter step.
num_epochs = 10  # number of full passes over the training set; tunable
for epoch in range(num_epochs):
    # Forward pass with the current parameters.
    Z1, A1, Z2, A2 = forward_pass(train_images, W1, b1, W2, b2)

    # Report the loss of this epoch's forward pass (i.e. before the update).
    cost = compute_loss(train_labels, A2)
    print(f'Epoch {epoch + 1}, cost: {cost}')

    # Backward pass: gradients of the parameters for this batch.
    dW1, db1, dW2, db2 = backward_pass(train_images, train_labels, Z1, A1, W2, A2)

    # Gradient-descent step; the module-level parameters are rebound to the result.
    W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)


Epoch 1, cost: 2.3054058710849628
Epoch 2, cost: 2.3001994398006462
Epoch 3, cost: 2.299205010144376
Epoch 4, cost: 2.298488539925594
Epoch 5, cost: 2.2977962863320682
Epoch 6, cost: 2.297105549734341
Epoch 7, cost: 2.2964135135430745
Epoch 8, cost: 2.2957194855260736
Epoch 9, cost: 2.2950230616566025
Epoch 10, cost: 2.2943238795094723
