In [None]:
### NN for Binary Classification ###

In [None]:
!pip install scikit-learn

In [12]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import datasets
import numpy as np

In [52]:
# Two-moons toy data: 100 samples with 2 features each. Seeding NumPy's
# global generator first makes the draw repeatable.
np.random.seed(0)
X, y = datasets.make_moons(100, noise=0.10)

# Individual feature columns.
x1, x2 = X[:, 0], X[:, 1]

# Labels as a column vector so they line up with the (n_samples, 1) output.
y = y.reshape(-1, 1)

print(X.shape, y.shape, sep="\n")

(100, 2)
(100, 1)


In [None]:

### Initialize weights and biases in NN ###

def define_parameters(weights):
    """Draw random starting parameters for a fully connected network.

    Parameters
    ----------
    weights : sequence of int
        Layer widths, ordered from input to output.

    Returns
    -------
    tuple of (list, list)
        ``weight_list[i]`` is a standard-normal matrix of shape
        ``(weights[i], weights[i + 1])``; ``bias_list[i]`` is a single
        standard-normal scalar shared by every unit of that layer.
    """
    weight_list, bias_list = [], []

    for fan_in, fan_out in zip(weights[:-1], weights[1:]):
        # Matrix first, then bias: keeps the order of random draws stable.
        weight_list.append(np.random.randn(fan_in, fan_out))
        bias_list.append(np.random.randn())

    return weight_list, bias_list

### activation function ###

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    """Derivative of the logistic function at ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(x)
    return s*(1-s)

### feed forward algorithm ###

def predictions(w, b, X):
    """Forward pass of the one-hidden-layer sigmoid network.

    ``w[0]``/``b[0]`` parameterise the hidden layer and ``w[1]``/``b[1]``
    the output layer. Returns the output activations for every row of X.
    """
    hidden_activation = sigmoid(np.dot(X, w[0]) + b[0])
    output_pre_activation = np.dot(hidden_activation, w[1]) + b[1]
    return sigmoid(output_pre_activation)

### cost function (MSE) ###

def find_cost(ao,y):
    """Mean squared error: the sum of squared residuals divided by the number of rows in y."""
    residual = ao - y
    scale = 1 / y.shape[0]
    return scale * np.sum(np.square(residual))

### back propagation ###

def find_derivatives(w, b, X, y=None):
    """Back-propagate the MSE cost through the one-hidden-layer sigmoid network.

    Parameters
    ----------
    w, b : list
        Weights and biases; index 0 is the hidden layer, index 1 the output layer.
    X : ndarray
        Input samples, one per row.
    y : ndarray, optional
        Targets aligned with the network output. When omitted, the
        module-level ``y`` is used, which is what the original
        three-argument form of this function did implicitly.

    Returns
    -------
    tuple
        ``(dwh, dbh, dwo, dbo)``: gradients for the hidden weights, hidden
        bias, output weights and output bias. Both bias gradients are
        scalars (summed over every sample and unit).
    """
    if y is None:
        # Backward compatibility for callers that rely on the global labels.
        y = globals()["y"]

    # Forward pass (recomputed here so the pre-activations are available).
    zh = np.dot(X,w[0]) + b[0]
    ah = sigmoid(zh)

    zo = np.dot(ah, w[1]) + b[1]
    ao = sigmoid(zo)

    # Backpropagation phase 1: output layer.
    m = y.shape[0]
    # NOTE(review): the exact derivative of the cost in find_cost is
    # (2/m)*(ao - y); the factor 2 is omitted here, which only rescales the
    # effective learning rate.
    dcost_dao = (1/m)*(ao-y)
    dao_dzo = sigmoid_der(zo)
    dcost_dzo = dcost_dao * dao_dzo

    dwo = np.dot(ah.T, dcost_dzo)
    dbo = np.sum(dcost_dzo)

    # Backpropagation phase 2: hidden layer.
    # dcost_dwh = dcost_dah * dah_dzh * dzh_dwh, with dcost_dah = dcost_dzo * dzo_dah
    dcost_dah = np.dot(dcost_dzo, w[1].T)
    dcost_dzh = sigmoid_der(zh) * dcost_dah

    dwh = np.dot(X.T, dcost_dzh)
    dbh = np.sum(dcost_dzh)

    return dwh, dbh, dwo, dbo

### Update weights from gradients ###

def update_weights(w,b,dwh, dbh, dwo, dbo, lr):
    """Apply one gradient-descent step to both layers.

    The entries of the lists ``w`` and ``b`` are replaced (the lists
    themselves are modified and returned); the arrays previously stored
    in them are left untouched.
    """
    for layer, grad in enumerate((dwh, dwo)):
        w[layer] = w[layer] - lr * grad

    for layer, grad in enumerate((dbh, dbo)):
        b[layer] = b[layer] - lr * grad

    return w, b

### NN class for training ###

def my_neural_network(X, y, lr, epochs):
    """Train the one-hidden-layer network with full-batch gradient descent.

    The hidden layer has 4 units; input and output widths come from the
    second dimension of X and y. The cost is recorded before every update
    and printed on every 50th epoch (starting with epoch 0).

    Note: ``find_derivatives`` is called without labels and reads the
    module-level ``y``, so the ``y`` passed here is expected to be that
    same array.

    Returns ``(w, b, error_list)``.
    """
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    w, b = define_parameters([n_inputs, 4, n_outputs])
    error_list = []

    for epoch in range(epochs):
        cost = find_cost(predictions(w, b, X), y)
        error_list.append(cost)

        gradients = find_derivatives(w, b, X)
        w, b = update_weights(w, b, *gradients, lr)

        if epoch % 50 == 0:
            print(cost)

    return w, b, error_list

### set hyperparameters and train NN ###

# Hyperparameters for the binary (two-moons) run.
lr = 0.5
epochs = 10
w, b, error_list = my_neural_network(X,y,lr,epochs)
print('weight:', w)
print('bias:', b)
# Slice so the output matches its label even when epochs > 10.
print('last 10 cost values:', error_list[-10:])

In [5]:
### NN for multiclass Classification 3 outputs ###

In [None]:
# Synthetic 3-class data: three unit-variance Gaussian blobs of 800 points
# each, centred at (0, -3), (3, 3) and (-3, 3).
np.random.seed(42)

cat1 = np.random.randn(800, 2) + np.array([0, -3])
cat2 = np.random.randn(800, 2) + np.array([3, 3])
cat3 = np.random.randn(800, 2) + np.array([-3, 3])

X = np.vstack([cat1, cat2, cat3])

# Integer class ids in the same order as the stacked blobs.
labels = np.array([0] * 800 + [1] * 800 + [2] * 800)

# One-hot targets: a single 1 per row, in the column given by the class id.
y = np.zeros((2400, 3))
y[np.arange(2400), labels] = 1

print(X.shape, y.shape, sep="\n")

x1, x2 = X[:, 0], X[:, 1]

### Initialize weights and biases in NN ###

def define_parameters(weights):
    """Draw random starting parameters for a fully connected network.

    ``weights`` lists the layer widths from input to output. For each pair
    of consecutive layers a standard-normal weight matrix of shape
    ``(weights[i], weights[i + 1])`` and a standard-normal bias vector of
    length ``weights[i + 1]`` are created.

    Returns ``(weight_list, bias_list)``.
    """
    weight_list, bias_list = [], []
    for fan_in, fan_out in zip(weights[:-1], weights[1:]):
        # Matrix first, then bias: keeps the order of random draws stable.
        weight_list.append(np.random.randn(fan_in, fan_out))
        bias_list.append(np.random.randn(fan_out))
    return weight_list, bias_list

### output layer(softmax function) ###

def softmax(X):
    """Row-wise softmax of a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the division from yielding ``nan``) when
    the logits are large.
    """
    shifted = X - np.max(X, axis=1, keepdims=True)
    expX = np.exp(shifted)
    return expX / expX.sum(axis=1, keepdims=True)


### activation function ###

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    """Derivative of the logistic function at ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(x)
    return s*(1-s)

### feed forward ###

def predictions(w, b, X):
    """Forward pass: sigmoid hidden layer followed by a softmax output layer.

    Returns the output activations, one row per row of X.
    """
    hidden_activation = sigmoid(np.dot(X, w[0]) + b[0])
    output_logits = np.dot(hidden_activation, w[1]) + b[1]
    return softmax(output_logits)


### Cost Functions ###

def find_cost(ao,y):
    """Total (summed, not averaged) cross-entropy between ``ao`` and one-hot ``y``.

    Probabilities are clipped from below before the logarithm: an exact 0
    in ``ao`` would otherwise give ``log(0) = -inf`` and, multiplied by a
    0 in ``y``, turn the whole cost into ``nan``.
    """
    safe_ao = np.clip(ao, 1e-12, None)
    total_cost = np.sum(-y * np.log(safe_ao))
    return total_cost

### backpropagation ###

def find_derivatives(w, b, X, y=None):
    """Back-propagate the softmax cross-entropy cost through the network.

    Parameters
    ----------
    w, b : list
        Weights and biases; index 0 is the hidden layer, index 1 the output layer.
    X : ndarray
        Input samples, one per row.
    y : ndarray, optional
        One-hot targets aligned with the network output. When omitted, the
        module-level ``y`` is used, which is what the original
        three-argument form of this function did implicitly.

    Returns
    -------
    tuple
        ``(dwh, dbh, dwo, dbo)``. The bias gradients are vectors with one
        entry per unit, matching the bias vectors built by this section's
        ``define_parameters``.
    """
    if y is None:
        # Backward compatibility for callers that rely on the global labels.
        y = globals()["y"]

    # Forward pass (recomputed here so the activations are available).
    zh = np.dot(X,w[0]) + b[0]
    ah = sigmoid(zh)

    zo = np.dot(ah, w[1]) + b[1]
    ao = softmax(zo)

    # Back propagation phase 1: output layer.
    dcost_dzo = ao - y
    dwo = np.dot(ah.T, dcost_dzo)
    # Sum over samples only (axis=0). Summing over every axis collapses the
    # gradient to one scalar, and that scalar is ~0 because each softmax row
    # and each one-hot row sums to 1, so the output bias would never move.
    dbo = np.sum(dcost_dzo, axis=0)

    # Back propagation phase 2: hidden layer.
    # dcost_dwh = dcost_dah * dah_dzh * dzh_dwh, with dcost_dah = dcost_dzo * dzo_dah
    dcost_dah = np.dot(dcost_dzo, w[1].T)
    # Sigmoid derivative written via the activation already computed above.
    dah_dzh = ah * (1 - ah)
    dcost_dzh = dah_dzh * dcost_dah

    dwh = np.dot(X.T, dcost_dzh)
    dbh = np.sum(dcost_dzh, axis=0)

    return dwh, dbh, dwo, dbo

def update_weights(w,b,dwh, dbh, dwo, dbo, lr):
    """Take one gradient-descent step of size ``lr`` on both layers.

    Entries of the lists ``w`` and ``b`` are replaced with the updated
    arrays; the same list objects are returned.
    """
    weight_steps = [lr * dwh, lr * dwo]
    for layer, step in enumerate(weight_steps):
        w[layer] = w[layer] - step

    bias_steps = [lr * dbh, lr * dbo]
    for layer, step in enumerate(bias_steps):
        b[layer] = b[layer] - step

    return w, b

### Train NN ###

def my_multiout_neural_network(X, y, lr, epochs):
    """Train the softmax-output network with full-batch gradient descent.

    The hidden layer has 4 units; the input width is ``X.shape[1]`` and the
    number of classes is ``y.shape[1]``. The cost is recorded before every
    update and printed on every 50th epoch (starting with epoch 0).

    Note: ``find_derivatives`` is called without labels and reads the
    module-level ``y``, so the ``y`` passed here is expected to be that
    same array.

    Returns ``(w, b, error_list)``.
    """
    n_features, n_classes = X.shape[1], y.shape[1]
    w, b = define_parameters([n_features, 4, n_classes])
    error_list = []

    for epoch in range(epochs):
        cost = find_cost(predictions(w, b, X), y)
        error_list.append(cost)

        gradients = find_derivatives(w, b, X)
        w, b = update_weights(w, b, *gradients, lr)

        if epoch % 50 == 0:
            print(cost)

    return w, b, error_list


# Hyperparameters for the 3-class blob run.
lr = 0.0005
epochs = 10

w, b, error_list = my_multiout_neural_network(X, y, lr=lr, epochs=epochs)
print(error_list)

In [None]:
### NN for multiclass Classification for mnist dataset ###

In [None]:


# Function to load MNIST data from npz file
def load_mnist_from_npz(filename):
    """Read the four MNIST arrays from an ``.npz`` archive.

    The archive must contain the keys ``x_train``, ``y_train``, ``x_test``
    and ``y_test``. Returns ``((x_train, y_train), (x_test, y_test))``.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only open archives you trust.
    with np.load(filename, allow_pickle=True) as archive:
        train_split = (archive['x_train'], archive['y_train'])
        test_split = (archive['x_test'], archive['y_test'])
    return train_split, test_split

# Relative path: mnist.npz is looked up in the current working directory.
mnist_npz_filename = 'mnist.npz'

(X_train, y_train), (X_test, y_test) = load_mnist_from_npz(mnist_npz_filename)

# Reshape to (n, 28, 28) images and divide by 255
# (assumes 8-bit pixel values, giving a [0, 1] range).
X_train = X_train.reshape(-1, 28, 28) / 255.0
X_test = X_test.reshape(-1, 28, 28) / 255.0

# One-hot encode the digit labels into 10 columns.
train_rows = np.arange(len(y_train))
y_train_one_hot = np.zeros((len(y_train), 10))
y_train_one_hot[train_rows, y_train] = 1

test_rows = np.arange(len(y_test))
y_test_one_hot = np.zeros((len(y_test), 10))
y_test_one_hot[test_rows, y_test] = 1

# One flat row of pixels per image, as the dense network expects.
X_train_flattened = X_train.reshape(len(X_train), -1)
X_test_flattened = X_test.reshape(len(X_test), -1)

# ... MNIST data loading and preprocessing code ...

### Initialize weights and biases in NN ###

def define_parameters(weights):
    """Create starting parameters for a fully connected network.

    ``weights`` lists the layer widths from input to output. Each weight
    matrix is standard-normal, scaled by ``sqrt(1 / fan_in)``; every bias
    vector starts at zero.

    Returns ``(weight_list, bias_list)``.
    """
    weight_list, bias_list = [], []
    for fan_in, fan_out in zip(weights[:-1], weights[1:]):
        scale = np.sqrt(1. / fan_in)
        weight_list.append(np.random.randn(fan_in, fan_out) * scale)
        bias_list.append(np.zeros((fan_out)))
    return weight_list, bias_list

### output layer(softmax function) ###

def softmax(X):
    """Row-wise softmax; each row's maximum is subtracted first for numerical stability."""
    row_max = np.max(X, axis=1, keepdims=True)
    exponentials = np.exp(X - row_max)
    normaliser = exponentials.sum(axis=1, keepdims=True)
    return exponentials / normaliser

### activation function ###

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    """Derivative of the logistic function at ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(x)
    return s*(1-s)

### feed forward ###

def predictions(w, b, X):
    """Forward pass: two sigmoid hidden layers followed by a softmax output.

    Returns ``(zh1, ah1, zh2, ah2, ao)``: the pre-activation and activation
    of each hidden layer, then the output probabilities.
    """
    # First hidden layer.
    zh1 = np.dot(X, w[0]) + b[0]
    ah1 = sigmoid(zh1)

    # Second hidden layer.
    zh2 = np.dot(ah1, w[1]) + b[1]
    ah2 = sigmoid(zh2)

    # Output layer.
    ao = softmax(np.dot(ah2, w[2]) + b[2])

    return zh1, ah1, zh2, ah2, ao


### Cost Functions ###

def find_cost(ao, y):
    """Mean cross-entropy between ``ao`` and one-hot ``y`` (averaged over rows of y).

    Probabilities are clipped from below before the logarithm: an exact 0
    in ``ao`` would otherwise give ``log(0) = -inf`` and, multiplied by a
    0 in ``y``, turn the whole cost into ``nan``.
    """
    m = y.shape[0]
    safe_ao = np.clip(ao, 1e-12, None)
    total_cost = -np.sum(y * np.log(safe_ao)) / m
    return total_cost

### backpropagation ###

def find_derivatives(w, b, X, y, zh1, ah1, zh2, ah2, ao):
    """Back-propagate through the two-hidden-layer sigmoid/softmax network.

    Uses the values produced by the forward pass (``zh1, ah1, zh2, ah2,
    ao``). ``b`` is accepted but not used. Every gradient is divided by the
    number of rows in ``y``.

    Returns ``(dw0, db0, dw1, db1, dw2, db2)``.
    """
    m = y.shape[0]

    # Error terms (gradient of the cost w.r.t. each layer's pre-activation),
    # propagated from the output back to the first hidden layer.
    delta_out = ao - y
    delta_h2 = np.dot(delta_out, w[2].T) * sigmoid_der(zh2)
    delta_h1 = np.dot(delta_h2, w[1].T) * sigmoid_der(zh1)

    # Output layer.
    dw2 = np.dot(ah2.T, delta_out) / m
    db2 = np.sum(delta_out, axis=0) / m

    # Second hidden layer.
    dw1 = np.dot(ah1.T, delta_h2) / m
    db1 = np.sum(delta_h2, axis=0) / m

    # First hidden layer.
    dw0 = np.dot(X.T, delta_h1) / m
    db0 = np.sum(delta_h1, axis=0) / m

    return dw0, db0, dw1, db1, dw2, db2

def update_weights(w, b, dw0, db0, dw1, db1, dw2, db2, lr):
    """Take one gradient-descent step of size ``lr`` on all three layers.

    Entries of the lists ``w`` and ``b`` are replaced with the updated
    arrays; the same list objects are returned.
    """
    per_layer_grads = ((dw0, db0), (dw1, db1), (dw2, db2))
    for layer, (dw, db) in enumerate(per_layer_grads):
        w[layer] = w[layer] - lr * dw
        b[layer] = b[layer] - lr * db
    return w, b

### Train NN ###

def my_multiout_neural_network(X, y, lr, epochs):
    """Train the two-hidden-layer network with full-batch gradient descent.

    Layer widths are ``[X.shape[1], 200, 50, y.shape[1]]``. The cost is
    recorded before every update and printed on every 50th epoch (starting
    with epoch 0).

    Returns ``(w, b, error_list)``.
    """
    layer_sizes = [X.shape[1], 200, 50, y.shape[1]]
    w, b = define_parameters(layer_sizes)
    error_list = []

    for i in range(epochs):
        # Forward pass; the intermediate values are reused by backprop.
        zh1, ah1, zh2, ah2, ao = predictions(w, b, X)
        cost = find_cost(ao, y)
        error_list.append(cost)

        gradients = find_derivatives(w, b, X, y, zh1, ah1, zh2, ah2, ao)
        w, b = update_weights(w, b, *gradients, lr)

        if i % 50 == 0:
            print(f'Epoch {i}: cost = {cost}')

    return w, b, error_list


# Hyperparameters for the full-batch MNIST run.
lr = 0.005
epochs = 10

w, b, error_list = my_multiout_neural_network(
    X_train_flattened, y_train_one_hot, lr=lr, epochs=epochs
)
print(error_list)

# No evaluation is done here; only the shape of the flattened test set is shown.
print(X_test_flattened.shape)


In [None]:
### updated based on pytorch implementation ###

In [9]:
import numpy as np
import time

def load_and_preprocess_mnist(filename):
    """Load MNIST from an ``.npz`` archive and prepare it for the dense network.

    The archive must contain ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Images are flattened to one row per sample, divided by 255
    and standardised; labels are one-hot encoded into 10 columns.

    Previously an early ``return`` sent back the raw arrays and left all of
    the preprocessing unreachable; the final ``return`` also referred to
    the un-normalised images.

    Returns
    -------
    tuple
        ``((X_train, y_train_one_hot), (X_test, y_test_one_hot))`` with
        ``X_*`` of shape ``(n_samples, n_pixels)`` and ``y_*`` of shape
        ``(n_samples, 10)``.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only open archives you trust.
    with np.load(filename, allow_pickle=True) as npzfile:
        X_train = npzfile['x_train']
        y_train = npzfile['y_train']
        X_test = npzfile['x_test']
        y_test = npzfile['y_test']

    # Flatten each image to a row (28 * 28 = 784 columns for MNIST), then
    # divide by 255 (assumes 8-bit pixel values).
    X_train_normalized = X_train.reshape(X_train.shape[0], -1) / 255.0
    X_test_normalized = X_test.reshape(X_test.shape[0], -1) / 255.0

    # Standardise with the TRAINING statistics only, so the test set goes
    # through exactly the same transform the model was fitted on.
    mean = np.mean(X_train_normalized)
    std = np.std(X_train_normalized)
    if std == 0:
        # Constant input: avoid dividing by zero.
        std = 1.0
    X_train_normalized = (X_train_normalized - mean) / std
    X_test_normalized = (X_test_normalized - mean) / std

    # One-hot encode labels in vector form.
    y_train_one_hot = np.eye(10)[y_train]
    y_test_one_hot = np.eye(10)[y_test]

    return (X_train_normalized, y_train_one_hot), (X_test_normalized, y_test_one_hot)

# Relative path: mnist.npz is looked up in the current working directory.
mnist_npz_filename = 'mnist.npz'
# Unpack the (train, test) pairs returned by the loader defined above.
# NOTE(review): the training call at the end of this cell uses
# X_train_flattened / y_train_one_hot rather than these variables;
# confirm which data is intended.
(X_train, y_train), (X_test, y_test) = load_and_preprocess_mnist(mnist_npz_filename)

        
### summary statistics ###
def calculate_accuracy(X, y, w, b):
    """Percentage (0-100) of rows in X whose arg-max prediction equals the arg-max of y."""
    _zh1, _ah1, _zh2, _ah2, class_probs = predictions(w, b, X)
    predicted_class = np.argmax(class_probs, axis=1)
    true_class = np.argmax(y, axis=1)
    hit_rate = np.mean(predicted_class == true_class)
    return hit_rate * 100


### Initialize weights and biases in NN ###

def define_parameters(weights):
    """Create starting parameters for a fully connected network.

    ``weights`` lists the layer widths from input to output. Weight
    matrices are standard-normal draws scaled by ``sqrt(1 / fan_in)``;
    bias vectors are all zeros.

    Returns ``(weight_list, bias_list)``.
    """
    layer_pairs = list(zip(weights[:-1], weights[1:]))
    weight_list = [
        np.random.randn(n_in, n_out) * np.sqrt(1. / n_in)
        for n_in, n_out in layer_pairs
    ]
    bias_list = [np.zeros((n_out)) for _, n_out in layer_pairs]
    return weight_list, bias_list

### output layer(softmax function) ###

def softmax(X):
    """Row-wise softmax; the per-row maximum is subtracted before exponentiating."""
    shifted = X - np.max(X, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum(axis=1, keepdims=True)

### activation function ReLU ###

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def relu_der(x):
    """ReLU derivative: 1 where ``x > 0`` and 0 elsewhere, in the dtype of ``x``."""
    is_positive = x > 0
    return is_positive.astype(x.dtype)

### feed forward with ReLU ###
def predictions(w, b, X):
    """Forward pass: two ReLU hidden layers followed by a softmax output.

    Returns ``(zh1, ah1, zh2, ah2, ao)``: the pre-activation and activation
    of each hidden layer, then the output probabilities.
    """
    # First hidden layer.
    zh1 = np.dot(X, w[0]) + b[0]
    ah1 = relu(zh1)

    # Second hidden layer.
    zh2 = np.dot(ah1, w[1]) + b[1]
    ah2 = relu(zh2)

    # Output layer.
    ao = softmax(np.dot(ah2, w[2]) + b[2])

    return zh1, ah1, zh2, ah2, ao


### stochastic gradient descent ### 
def batch_generator(X, y, batch_size):
    """Yield consecutive ``(X, y)`` slices of at most ``batch_size`` rows.

    Batches follow the existing order of the data (no shuffling); the last
    batch may be shorter than ``batch_size``.
    """
    for start in range(0, len(X), batch_size):
        window = slice(start, start + batch_size)
        yield X[window], y[window]

### Cross-Entropy Loss ###
def cross_entropy_loss(ao, y):
    """Mean cross-entropy between ``ao`` and one-hot ``y`` (averaged over rows of y)."""
    m = y.shape[0]
    # The small offset keeps the logarithm finite when a probability is 0.
    log_probs = np.log(ao + 1e-9)
    total = np.sum(y * log_probs)
    return -total / m

### Backpropagation with ReLU ###
def find_derivatives(w, b, X, y, zh1, ah1, zh2, ah2, ao):
    """Back-propagate through the two-hidden-layer ReLU/softmax network.

    Uses the values produced by the forward pass (``zh1, ah1, zh2, ah2,
    ao``). ``b`` is accepted but not used. Every gradient is divided by the
    number of rows in ``y``.

    Returns ``(dw0, db0, dw1, db1, dw2, db2)``.
    """
    m = y.shape[0]

    # Error terms (gradient of the cost w.r.t. each layer's pre-activation),
    # propagated from the output back to the first hidden layer.
    delta_out = ao - y
    delta_h2 = np.dot(delta_out, w[2].T) * relu_der(zh2)
    delta_h1 = np.dot(delta_h2, w[1].T) * relu_der(zh1)

    # Output layer.
    dw2 = np.dot(ah2.T, delta_out) / m
    db2 = np.sum(delta_out, axis=0) / m

    # Second hidden layer.
    dw1 = np.dot(ah1.T, delta_h2) / m
    db1 = np.sum(delta_h2, axis=0) / m

    # First hidden layer.
    dw0 = np.dot(X.T, delta_h1) / m
    db0 = np.sum(delta_h1, axis=0) / m

    return dw0, db0, dw1, db1, dw2, db2


def update_weights(w, b, dw0, db0, dw1, db1, dw2, db2, lr):
    """Take one gradient-descent step of size ``lr`` on all three layers.

    Entries of the lists ``w`` and ``b`` are replaced with the updated
    arrays; the same list objects are returned.
    """
    weight_grads = (dw0, dw1, dw2)
    bias_grads = (db0, db1, db2)
    for layer, (dw, db) in enumerate(zip(weight_grads, bias_grads)):
        w[layer] = w[layer] - lr * dw
        b[layer] = b[layer] - lr * db
    return w, b



### Train NN ###

def my_multiout_neural_network(X_train, y_train, X_test, y_test, lr, epochs, batch_size):
    """Train the two-hidden-layer ReLU network with mini-batch gradient descent.

    Layer widths are ``[X_train.shape[1], 200, 50, y_train.shape[1]]``. In
    each epoch the training data is visited in batches of ``batch_size``
    rows and the parameters are updated after every batch. The value
    stored in ``error_list`` (and printed every 50th epoch) is the loss of
    the final batch of that epoch, not an epoch average.

    Returns
    -------
    tuple
        ``(w, b, error_list, training_time, train_accuracy, test_accuracy)``
        where ``training_time`` is the wall-clock seconds spent in the
        training loop and the accuracies are percentages measured after
        training.
    """
    start_time = time.time()

    layer_sizes = [X_train.shape[1], 200, 50, y_train.shape[1]]
    w, b = define_parameters(layer_sizes)
    error_list = []

    for epoch in range(epochs):
        for X_batch, y_batch in batch_generator(X_train, y_train, batch_size):
            # Forward pass and loss on this batch.
            zh1, ah1, zh2, ah2, ao = predictions(w, b, X_batch)
            cost = cross_entropy_loss(ao, y_batch)

            # Backward pass and parameter update.
            gradients = find_derivatives(w, b, X_batch, y_batch, zh1, ah1, zh2, ah2, ao)
            w, b = update_weights(w, b, *gradients, lr)

        error_list.append(cost)
        if epoch % 50 == 0:
            print(f'Epoch {epoch}: cost = {cost}')

    training_time = time.time() - start_time

    # Accuracy is evaluated once, after timing has stopped.
    train_accuracy = calculate_accuracy(X_train, y_train, w, b)
    test_accuracy = calculate_accuracy(X_test, y_test, w, b)

    return w, b, error_list, training_time, train_accuracy, test_accuracy

# Hyperparameters for the mini-batch run.
lr = 0.01
epochs = 10
batch_size = 128

# NOTE: X_train_flattened, y_train_one_hot, X_test_flattened and
# y_test_one_hot are created by the earlier MNIST preprocessing cell, so
# that cell must be run first.
(w, b, error_list,
 training_time, train_accuracy, test_accuracy) = my_multiout_neural_network(
    X_train_flattened, y_train_one_hot,
    X_test_flattened, y_test_one_hot,
    lr, epochs, batch_size,
)

print(f"Training Time: {training_time:.4f} seconds")
print(f"Train Accuracy: {train_accuracy:.4f}%")
print(f"Test Accuracy: {test_accuracy:.4f}%")


Epoch 0: cost = 0.5483631522608613
Training Time: 43.403562784194946 seconds
Train Accuracy: 94.07666666666667%
Test Accuracy: 93.95%
