**3-Layer Neural Network Implementation Using Keras**

In [None]:
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Keras baseline: trains a Flatten -> Dense(128, relu) -> Dense(10, softmax)
# model on MNIST and records the wall-clock time of the fit() call in
# `keras_training_time`, which the speed-up comparison cells read later.

# Ensure TensorFlow uses only the CPU
# NOTE(review): this assignment runs after `import tensorflow` in the import
# lines above; it presumably still takes effect because no device has been
# touched yet, but setting it before the import is the safer order — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# 1. Load and preprocess the MNIST dataset
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Normalize pixel values to be between 0 and 1
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

# 2. Define the 3-layer Neural Network model
# Input: 28x28 images flattened to 784 features
# Hidden Layer: 128 neurons, ReLU activation
# Output Layer: 10 neurons (for 10 classes), softmax activation
model_keras = keras.Sequential([
    keras.Input(shape=(28, 28)),
    layers.Flatten(),
    layers.Dense(128, activation="relu", name="hidden_layer"),
    layers.Dense(10, activation="softmax", name="output_layer")
])

# Display the model summary
print("Keras Model Summary:")
model_keras.summary()

# 3. Compile the model
# The labels are passed to fit()/evaluate() exactly as loaded (no one-hot
# encoding step in this cell), paired with the "sparse" cross-entropy loss.
model_keras.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# 4. Train the model and measure time
# Only the fit() call sits between the two time.time() readings, so data
# loading, model construction and compile() are excluded from the measurement.
print("\nTraining Keras model on CPU...")
start_time_keras = time.time()
history_keras = model_keras.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5, # Reduced epochs for faster comparison
    validation_split=0.2,
    verbose=0 # Suppress verbose output during training
)
end_time_keras = time.time()
keras_training_time = end_time_keras - start_time_keras
print(f"Keras Training Time (CPU): {keras_training_time:.2f} seconds")

# 5. Evaluate the model on the test data
# evaluate() is unpacked as (loss, accuracy), matching the single "accuracy"
# metric requested in compile().
test_loss_keras, test_accuracy_keras = model_keras.evaluate(x_test, y_test, verbose=0)
print(f"Keras Test Accuracy (CPU): {test_accuracy_keras:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Keras Model Summary:



Training Keras model on CPU...
Keras Training Time (CPU): 18.47 seconds
Keras Test Accuracy (CPU): 0.9724


**3-Layer Neural Network Implemented from Scratch with NumPy**

In [None]:
# Fetch MNIST again so the NumPy model works on its own flattened copies
(x_train_np, y_train_np), (x_test_np, y_test_np) = keras.datasets.mnist.load_data()

# Collapse every image into a single feature row, then scale pixels into [0, 1]
x_train_np = x_train_np.reshape(len(x_train_np), -1).astype(np.float32) / 255.0
x_test_np = x_test_np.reshape(len(x_test_np), -1).astype(np.float32) / 255.0

# One-hot encode labels for cross-entropy calculation
def one_hot_encode(labels, num_classes):
    """Return one row of a ``num_classes`` x ``num_classes`` identity matrix per label.

    Args:
        labels: Integer class indices used to index the identity matrix.
        num_classes: Number of classes, i.e. the width of each encoded row.

    Returns:
        A float array holding, for each label, the matching identity row.
    """
    identity = np.identity(num_classes)
    return identity[labels]

y_train_one_hot = one_hot_encode(y_train_np, 10)
y_test_one_hot = one_hot_encode(y_test_np, 10)

# Network dimensions: 784 input features -> 128 hidden units -> 10 classes
input_size, hidden_size, output_size = 784, 128, 10

# Optimisation settings
learning_rate = 0.01
epochs_np = 5 # Reduced epochs for faster comparison
batch_size_np = 64

# Parameter initialisation: small Gaussian weights, zero biases.
# The seed is fixed first and the two randn draws keep their order so the
# starting weights are reproducible.
np.random.seed(42) # for reproducibility
weights_input_hidden = 0.01 * np.random.randn(input_size, hidden_size)
bias_hidden = np.zeros(shape=(1, hidden_size))
weights_hidden_output = 0.01 * np.random.randn(hidden_size, output_size)
bias_output = np.zeros(shape=(1, output_size))

# Activation Functions
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return 1.0 where ``x`` is strictly positive and 0.0 elsewhere."""
    is_positive = x > 0
    return is_positive.astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row has its own maximum subtracted before exponentiation so large
    logits do not overflow; the exponentials are then divided by their row sum.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x - row_max)
    row_total = np.sum(exp_x, axis=1, keepdims=True)
    return exp_x / row_total

# Loss Function: Categorical Cross-Entropy
def categorical_crossentropy(predictions, targets):
    """Mean categorical cross-entropy over the rows of ``predictions``.

    Args:
        predictions: Predicted probabilities, one row per sample.
        targets: Array of the same shape weighting each log-probability
            (one-hot rows in this notebook).

    Returns:
        The negated sum of ``targets * log(predictions)`` divided by the
        number of rows in ``predictions``.
    """
    batch = predictions.shape[0]
    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    safe_predictions = np.clip(predictions, 1e-12, 1. - 1e-12)
    log_likelihood = np.sum(targets * np.log(safe_predictions))
    return -log_likelihood / batch

# Training Loop
# Mini-batch SGD for the two-layer network, written with explicit forward and
# backward passes. Weights and biases are updated in place, and the elapsed
# wall-clock time of the whole loop ends up in `numpy_training_time`.
print("\nTraining NumPy model (from scratch)...")
start_time_np = time.time()

for epoch in range(epochs_np):
    # Shuffle training data for each epoch
    # The same permutation indexes both arrays so images and labels stay paired.
    permutation = np.random.permutation(x_train_np.shape[0])
    x_train_shuffled = x_train_np[permutation]
    y_train_shuffled = y_train_one_hot[permutation]

    # Slicing past the end simply yields a shorter final batch.
    for i in range(0, x_train_np.shape[0], batch_size_np):
        x_batch = x_train_shuffled[i:i+batch_size_np]
        y_batch = y_train_shuffled[i:i+batch_size_np]

        # Forward Propagation
        # Layer 1 (Input to Hidden)
        z1 = np.dot(x_batch, weights_input_hidden) + bias_hidden
        a1 = relu(z1)

        # Layer 2 (Hidden to Output)
        z2 = np.dot(a1, weights_hidden_output) + bias_output
        a2 = softmax(z2) # Output probabilities

        # Backward Propagation
        # Output Layer (dL/da2 * da2/dz2)
        # Gradient of loss w.r.t. z2
        # NOTE: this is not divided by the batch size, so the weight and bias
        # gradients below are sums over the batch rather than means; the
        # effective step therefore grows with batch_size_np.
        dz2 = a2 - y_batch # Derivative of cross-entropy with softmax

        # Gradients for weights_hidden_output and bias_output
        d_weights_hidden_output = np.dot(a1.T, dz2)
        d_bias_output = np.sum(dz2, axis=0, keepdims=True)

        # Hidden Layer (dL/da1 * da1/dz1)
        # Gradient of loss w.r.t. a1
        # Uses weights_hidden_output before it is modified by the update below.
        da1 = np.dot(dz2, weights_hidden_output.T)
        # Gradient of loss w.r.t. z1
        dz1 = da1 * relu_derivative(z1)

        # Gradients for weights_input_hidden and bias_hidden
        d_weights_input_hidden = np.dot(x_batch.T, dz1)
        d_bias_hidden = np.sum(dz1, axis=0, keepdims=True)

        # Update Weights and Biases (SGD)
        # All gradients were computed above, so the in-place updates can run
        # in any order.
        weights_hidden_output -= learning_rate * d_weights_hidden_output
        bias_output -= learning_rate * d_bias_output
        weights_input_hidden -= learning_rate * d_weights_input_hidden
        bias_hidden -= learning_rate * d_bias_hidden

end_time_np = time.time()
numpy_training_time = end_time_np - start_time_np
print(f"NumPy Training Time (from scratch): {numpy_training_time:.2f} seconds")

# Evaluate NumPy model
# Forward pass for test data
# The whole test set goes through the network in a single batch.
z1_test = np.dot(x_test_np, weights_input_hidden) + bias_hidden
a1_test = relu(z1_test)
z2_test = np.dot(a1_test, weights_hidden_output) + bias_output
a2_test = softmax(z2_test)

# Predicted class = index of the largest probability in each row; accuracy is
# the fraction of rows whose prediction equals the integer label.
predictions_np = np.argmax(a2_test, axis=1)
accuracy_np = np.mean(predictions_np == y_test_np)
print(f"NumPy Test Accuracy (from scratch): {accuracy_np:.4f}")



Training NumPy model (from scratch)...
NumPy Training Time (from scratch): 8.58 seconds
NumPy Test Accuracy (from scratch): 0.9750


**Comparison of Keras and NumPy Implementation Speed-Up**

In [None]:
# Report the wall-clock training times measured by the Keras and NumPy cells
# and the ratio between them.
print("\n--- Speed-Up Analysis ---")
print(f"Keras Training Time (CPU): {keras_training_time:.2f} seconds")
print(f"NumPy Training Time (from scratch): {numpy_training_time:.2f} seconds")

# Speed-up of Keras over NumPy is T_numpy / T_keras, so the value that must be
# non-zero is the divisor, keras_training_time. A result below 1.0x means the
# Keras run was the slower of the two.
if keras_training_time > 0:
    speed_up = numpy_training_time / keras_training_time
    print(f"Speed-up of Keras (CPU) over NumPy (from scratch): {speed_up:.2f}x")
else:
    print("Keras training time was zero, cannot calculate speed-up.")



--- Speed-Up Analysis ---
Keras Training Time (CPU): 18.47 seconds
NumPy Training Time (from scratch): 8.58 seconds
Speed-up of Keras (CPU) over NumPy (from scratch): 0.46x


**Tiling for Matrix multiplication**

In [None]:
def naive_matrix_multiply(A, B):
    """Multiply two 2-D arrays with an explicit scalar triple loop.

    Args:
        A: Array of shape (m, n).
        B: Array of shape (n, p).

    Returns:
        A new (m, p) array created with ``np.zeros`` holding the product.

    Raises:
        ValueError: If the column count of ``A`` differs from the row count
            of ``B``.
    """
    n_rows, inner = A.shape
    inner_B, n_cols = B.shape
    if inner != inner_B:
        raise ValueError("Matrices A and B cannot be multiplied.")

    C = np.zeros((n_rows, n_cols))
    # np.ndindex walks the (row, column) pairs in row-major order, the same
    # order a nested pair of range() loops would produce.
    for i, j in np.ndindex(n_rows, n_cols):
        for k in range(inner):
            C[i, j] += A[i, k] * B[k, j]
    return C

def tiled_matrix_multiply(A, B, block_size):
    """Multiply two 2-D arrays with a blocked (tiled) scalar triple loop.

    The iteration space is cut into ``block_size``-sized tiles along all three
    dimensions; tiles at the edges are truncated to the matrix bounds, so the
    dimensions do not have to be multiples of ``block_size``.

    Args:
        A: Array of shape (m, n).
        B: Array of shape (n, p).
        block_size: Positive integer tile edge length.

    Returns:
        A new (m, p) array created with ``np.zeros`` holding the product.

    Raises:
        ValueError: If the shapes are incompatible, or if ``block_size`` is
            not positive.
    """
    rows_A, cols_A = A.shape
    rows_B, cols_B = B.shape
    if cols_A != rows_B:
        raise ValueError("Matrices A and B cannot be multiplied.")
    # A negative step makes every outer range() empty, which would silently
    # return an all-zero matrix; zero makes range() fail with an unrelated
    # message. Reject both up front.
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer.")

    C = np.zeros((rows_A, cols_B))

    for i1 in range(0, rows_A, block_size):
        # Tile bounds depend only on the tile origin, so compute each once
        # instead of on every pass of the inner loops.
        i_end = min(i1 + block_size, rows_A)
        for j1 in range(0, cols_B, block_size):
            j_end = min(j1 + block_size, cols_B)
            for k1 in range(0, cols_A, block_size):
                k_end = min(k1 + block_size, cols_A)
                # Process block
                for i in range(i1, i_end):
                    for j in range(j1, j_end):
                        for k in range(k1, k_end):
                            C[i, j] += A[i, k] * B[k, j]
    return C

# Generate large random matrices for benchmarking
matrix_size = 256 # For demonstration, choose a size that runs in reasonable time
A = np.random.rand(matrix_size, matrix_size)
B = np.random.rand(matrix_size, matrix_size)
block_size = 32 # Example block size

print(f"\n--- Matrix Multiplication Benchmarking (Size: {matrix_size}x{matrix_size}) ---")

# time.perf_counter() is used for the intervals: it is monotonic and has the
# highest available resolution, which matters for the very short np.dot run.

# Benchmark Naive
start_time = time.perf_counter()
C_naive = naive_matrix_multiply(A, B)
time_naive = time.perf_counter() - start_time
print(f"Naive matrix multiply time: {time_naive:.4f} seconds")

# Benchmark Tiled
# NOTE(review): both loop versions execute the same number of scalar
# multiply-adds in the interpreter, so tiling is presumably not expected to
# help here; cache blocking pays off in compiled code — confirm before drawing
# conclusions from these two numbers.
start_time = time.perf_counter()
C_tiled = tiled_matrix_multiply(A, B, block_size)
time_tiled = time.perf_counter() - start_time
print(f"Tiled matrix multiply (block={block_size}) time: {time_tiled:.4f} seconds")

# Benchmark NumPy's optimized dot
start_time = time.perf_counter()
C_np = np.dot(A, B)
time_np = time.perf_counter() - start_time
print(f"NumPy np.dot time: {time_np:.4f} seconds")

# Verify correctness: a timing is only meaningful if all three implementations
# agree. Summation order differs between them, hence allclose, not equality.
assert np.allclose(C_naive, C_np), "naive result differs from np.dot"
assert np.allclose(C_tiled, C_np), "tiled result differs from np.dot"



--- Matrix Multiplication Benchmarking (Size: 256x256) ---
Naive matrix multiply time: 12.5226 seconds
Tiled matrix multiply (block=32) time: 12.6191 seconds
NumPy np.dot time: 0.0022 seconds


**GPU Implementation of 3 layer NN**

In [None]:
pip install cupy

Collecting cupy
  Downloading cupy-13.6.0.tar.gz (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: cupy
  Building wheel for cupy (pyproject.toml) ... [?25l[?25hdone
  Created wheel for cupy: filename=cupy-13.6.0-cp312-cp312-linux_x86_64.whl size=95231240 sha256=e78ff5db28912c32cbce0656cbee28c1d12e84bd721df62306e21e8de3623c5b
  Stored in directory: /root/.cache/pip/wheels/59/62/e2/466b4487b8c7ef9b9345937b46481dd5a58c67252ccd828c75
Successfully built cupy
Installing collected packages: cupy
Successfully installed cupy-13.6.0


In [None]:
import os
import time
import numpy as np
import tensorflow as tf # Used only for loading MNIST data
from tensorflow import keras

# Attempt to import CuPy and check for GPU availability.
# `cp` is always bound after this statement (None when CuPy is missing), so
# later `array_lib is cp` checks cannot raise NameError on a CPU-only machine.
# NOTE(review): if an earlier cell in the same kernel already set
# CUDA_VISIBLE_DEVICES to "-1", CuPy will presumably report no GPU here —
# confirm by running this cell in a fresh kernel.
try:
    import cupy as cp
except ImportError:
    cp = None
    print("CuPy not installed. Falling back to NumPy (CPU). Install CuPy for GPU acceleration.")
    array_lib = np
else:
    print("CuPy imported successfully.")
    if cp.cuda.is_available():
        print("CUDA GPU is available and detected by CuPy.")
        # Set the default array library to CuPy for convenience
        array_lib = cp
        # Set default device to GPU (optional, CuPy uses it by default if available)
        cp.cuda.Device(0).use()
    else:
        print("CUDA GPU not available. Falling back to NumPy (CPU).")
        array_lib = np

# Ensure TensorFlow uses only the CPU for data loading if CuPy is used
# This prevents TensorFlow from potentially grabbing GPU resources before CuPy
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# 1. Load and preprocess the MNIST dataset using TensorFlow/Keras
(x_train_np, y_train_np), (x_test_np, y_test_np) = keras.datasets.mnist.load_data()

# Normalize and flatten images: one 784-feature float32 row per image, scaled
# into [0, 1].
x_train_np = x_train_np.reshape(x_train_np.shape[0], -1).astype("float32") / 255.0
x_test_np = x_test_np.reshape(x_test_np.shape[0], -1).astype("float32") / 255.0

# One-hot encode labels for cross-entropy calculation
def one_hot_encode(labels, num_classes):
    """Map integer labels to one-hot rows using NumPy.

    Args:
        labels: Integer class indices.
        num_classes: Number of classes, i.e. the width of each encoded row.

    Returns:
        A NumPy float array with one identity-matrix row per label.
    """
    # Use NumPy for one-hot encoding on CPU first
    lookup = np.eye(num_classes)
    return lookup[labels]

y_train_one_hot_np = one_hot_encode(y_train_np, 10)
y_test_one_hot_np = one_hot_encode(y_test_np, 10)

# 2. Hand the prepared arrays to the active array library.
# With array_lib set to cp these become CuPy arrays; with np they stay NumPy
# arrays.
x_train = array_lib.asarray(x_train_np)
y_train_one_hot = array_lib.asarray(y_train_one_hot_np)
x_test = array_lib.asarray(x_test_np)
y_test = array_lib.asarray(y_test_np) # Keep y_test_np for final comparison, as array_lib is for operations

# Network dimensions: 784 input features -> 128 hidden units -> 10 classes
input_size, hidden_size, output_size = 784, 128, 10

# Optimisation settings
learning_rate = 0.01
epochs_gpu = 10 # More epochs for better accuracy if GPU is fast
batch_size_gpu = 128 # Larger batch size often beneficial on GPU

# Parameters are created through array_lib, so they live wherever the data
# does. The seed is fixed first and the two randn draws keep their order.
array_lib.random.seed(42) # for reproducibility
weights_input_hidden = 0.01 * array_lib.random.randn(input_size, hidden_size)
bias_hidden = array_lib.zeros(shape=(1, hidden_size))
weights_hidden_output = 0.01 * array_lib.random.randn(hidden_size, output_size)
bias_output = array_lib.zeros(shape=(1, output_size))

# Activation Functions (using array_lib, which will be cp or np)
def relu(x):
    """Rectified linear unit computed with the active array library."""
    rectified = array_lib.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return a float32 mask that is 1 where ``x`` is strictly positive, else 0."""
    is_positive = x > 0
    # Ensure type consistency
    return is_positive.astype(array_lib.float32)

def softmax(x):
    """Row-wise softmax of a 2-D array using the active array library.

    The per-row maximum is subtracted before exponentiation for numerical
    stability; the exponentials are then divided by their row sum.
    """
    row_max = array_lib.max(x, axis=1, keepdims=True)
    exp_x = array_lib.exp(x - row_max)
    row_total = array_lib.sum(exp_x, axis=1, keepdims=True)
    return exp_x / row_total

# Loss Function: Categorical Cross-Entropy
def categorical_crossentropy(predictions, targets):
    """Mean categorical cross-entropy computed with the active array library.

    Args:
        predictions: Predicted probabilities, one row per sample.
        targets: Array of the same shape weighting each log-probability
            (one-hot rows in this notebook).

    Returns:
        The negated sum of ``targets * log(predictions)`` divided by the
        number of rows in ``predictions``.
    """
    batch = predictions.shape[0]
    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    safe_predictions = array_lib.clip(predictions, 1e-12, 1. - 1e-12)
    log_likelihood = array_lib.sum(targets * array_lib.log(safe_predictions))
    return -log_likelihood / batch

# Training Loop
# The device is decided by comparing against NumPy (always imported) rather
# than `cp`, which is not bound when the CuPy import failed.
on_gpu = array_lib is not np
device_label = "GPU" if on_gpu else "CPU"

print(f"\nTraining from-scratch model on {device_label}...")
start_time_gpu = time.time()

num_train = x_train.shape[0]
for epoch in range(epochs_gpu):
    # Shuffle training data for each epoch.
    # The permutation is drawn from array_lib so it is created on the same
    # device as the data (no per-epoch host-to-device copies) and follows the
    # seed set through array_lib.random.seed.
    permutation_indices = array_lib.random.permutation(num_train)
    x_train_shuffled = x_train[permutation_indices]
    y_train_shuffled = y_train_one_hot[permutation_indices]

    # Slicing past the end simply yields a shorter final batch.
    for i in range(0, num_train, batch_size_gpu):
        x_batch = x_train_shuffled[i:i+batch_size_gpu]
        y_batch = y_train_shuffled[i:i+batch_size_gpu]

        # Forward Propagation
        z1 = array_lib.dot(x_batch, weights_input_hidden) + bias_hidden
        a1 = relu(z1)

        z2 = array_lib.dot(a1, weights_hidden_output) + bias_output
        a2 = softmax(z2)

        # Backward Propagation
        # Softmax + cross-entropy gradient w.r.t. z2; summed (not averaged)
        # over the batch by the dot/sum calls below.
        dz2 = a2 - y_batch

        d_weights_hidden_output = array_lib.dot(a1.T, dz2)
        d_bias_output = array_lib.sum(dz2, axis=0, keepdims=True)

        # Uses weights_hidden_output before it is modified by the update below.
        da1 = array_lib.dot(dz2, weights_hidden_output.T)
        dz1 = da1 * relu_derivative(z1)

        d_weights_input_hidden = array_lib.dot(x_batch.T, dz1)
        d_bias_hidden = array_lib.sum(dz1, axis=0, keepdims=True)

        # Update Weights and Biases (SGD)
        weights_hidden_output -= learning_rate * d_weights_hidden_output
        bias_output -= learning_rate * d_bias_output
        weights_input_hidden -= learning_rate * d_weights_input_hidden
        bias_hidden -= learning_rate * d_bias_hidden

    # Optional: Print loss/accuracy per epoch to monitor progress
    # NOTE: this full-training-set forward pass runs inside the timed region,
    # so it is included in gpu_training_time.
    if epoch % 2 == 0:
        z1_train = array_lib.dot(x_train, weights_input_hidden) + bias_hidden
        a1_train = relu(z1_train)
        z2_train = array_lib.dot(a1_train, weights_hidden_output) + bias_output
        a2_train = softmax(z2_train)
        train_loss = categorical_crossentropy(a2_train, y_train_one_hot)
        # float() brings the scalar to the host so it formats identically for
        # NumPy and CuPy results.
        print(f"Epoch {epoch+1}/{epochs_gpu}, Train Loss: {float(train_loss):.4f}")

# CuPy launches kernels asynchronously; wait for the device to finish before
# reading the clock, otherwise queued work from the last epoch is not counted.
if on_gpu:
    cp.cuda.Device().synchronize()
end_time_gpu = time.time()
gpu_training_time = end_time_gpu - start_time_gpu
print(f"From-scratch training time on {device_label}: {gpu_training_time:.2f} seconds")

# Evaluate the model on the test data
# Forward pass for test data
z1_test = array_lib.dot(x_test, weights_input_hidden) + bias_hidden
a1_test = relu(z1_test)
z2_test = array_lib.dot(a1_test, weights_hidden_output) + bias_output
a2_test = softmax(z2_test)

# Move predictions back to CPU for comparison with NumPy y_test_np.
# .get() exists only on CuPy arrays, so it is called only on the GPU path;
# on the NumPy fallback the predictions are already host arrays.
predictions_gpu = array_lib.argmax(a2_test, axis=1)
if on_gpu:
    predictions_gpu = predictions_gpu.get()
accuracy_gpu = np.mean(predictions_gpu == y_test_np)
print(f"From-scratch Test Accuracy on {device_label}: {accuracy_gpu:.4f}")

# --- Speed-Up Analysis (requires running the previous Keras and NumPy CPU codes) ---
# Use the times measured by the earlier cells when they exist in this kernel.
# The literals are placeholders used only when those cells were not run; they
# are not measurements, so replace them (or run the cells) before quoting
# any speed-up.
keras_training_time = globals().get("keras_training_time", 12.0)
numpy_training_time = globals().get("numpy_training_time", 200.0)

# NOTE: this run uses epochs_gpu/batch_size_gpu, which differ from the
# settings of the CPU cells, so the ratios below compare total wall-clock
# time of differently sized workloads.
print("\n--- Speed-Up Analysis ---")
print(f"Keras Training Time (CPU): {keras_training_time:.2f} seconds")
print(f"NumPy Training Time (from scratch, CPU): {numpy_training_time:.2f} seconds")
print(f"From-scratch Training Time ({device_label}): {gpu_training_time:.2f} seconds")

if on_gpu:
    # gpu_training_time is the divisor of both ratios, so it is the value
    # that has to be non-zero.
    if gpu_training_time > 0:
        if numpy_training_time > 0:
            speed_up_gpu_over_numpy_cpu = numpy_training_time / gpu_training_time
            print(f"Speed-up of From-scratch GPU over From-scratch NumPy (CPU): {speed_up_gpu_over_numpy_cpu:.2f}x")
        if keras_training_time > 0:
            speed_up_gpu_over_keras_cpu = keras_training_time / gpu_training_time
            print(f"Speed-up of From-scratch GPU over Keras (CPU): {speed_up_gpu_over_keras_cpu:.2f}x")
else:
    print("GPU not available, cannot calculate GPU speed-up.")
