In [1]:
# Install CuPy that matches Colab's CUDA 12.x
# IMPORTANT: After this finishes, restart the session in runtime
%pip -q install -U cupy-cuda12x


In [1]:
!nvidia-smi

import os
print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))

import cupy as cp
print("CuPy version:", cp.__version__)
print("CuPy device count:", cp.cuda.runtime.getDeviceCount())
cp.cuda.Device(0).use()
x = cp.arange(5)
print("CuPy test tensor on", x.device, "->", x)


Thu Oct 23 07:17:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# We are using record and get the time dynamically
TIMINGS = {}

def record_time(name, seconds):
    TIMINGS[name] = float(seconds)

def get_time(name, default=None):
    return TIMINGS.get(name, default)

def fmt(s):
    try: return f"{float(s):.2f}"
    except: return "N/A"


**3 Layer Nueral Network implementation using  Keras**

In [3]:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Ensure TensorFlow uses only the CPU
# (Changed: use TF API so CuPy can still see the GPU in other cells)
try:
    tf.config.set_visible_devices([], 'GPU')
except Exception:
    pass

# 1. Load and preprocess the MNIST dataset
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Normalize pixel values to be between 0 and 1
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

# 2. Define the 3-layer Neural Network model
# Input: 28x28 images flattened to 784 features
# Hidden Layer: 128 neurons, ReLU activation
# Output Layer: 10 neurons (for 10 classes), softmax activation
model_keras = keras.Sequential([
    keras.Input(shape=(28, 28)),
    layers.Flatten(),
    layers.Dense(128, activation="relu", name="hidden_layer"),
    layers.Dense(10, activation="softmax", name="output_layer")
])

# Display the model summary
print("Keras Model Summary:")
model_keras.summary()

# 3. Compile the model
model_keras.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# 4. Train the model and measure time
print("\nTraining Keras model on CPU...")
start_time_keras = time.time()
history_keras = model_keras.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    verbose=0 # Suppress verbose output during training
)
keras_training_time = time.time() - start_time_keras
record_time("keras_cpu", keras_training_time)
print(f"Keras Training Time (CPU): {keras_training_time:.2f} seconds")

# 5. Evaluate the model on the test data
test_loss_keras, test_accuracy_keras = model_keras.evaluate(x_test, y_test, verbose=0)
print(f"Keras Test Accuracy (CPU): {test_accuracy_keras:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Keras Model Summary:



Training Keras model on CPU...
Keras Training Time (CPU): 25.12 seconds
Keras Test Accuracy (CPU): 0.9778


**3 Layer Nueral network using Numpy implementation**

In [18]:
import time
import numpy as np
from tensorflow import keras

# Re-load and preprocess data for NumPy model (flatten directly)
(x_train_np, y_train_np), (x_test_np, y_test_np) = keras.datasets.mnist.load_data()

# Normalize and flatten images
x_train_np = x_train_np.reshape(x_train_np.shape[0], -1).astype("float32") / 255.0
x_test_np  = x_test_np.reshape(x_test_np.shape[0], -1).astype("float32") / 255.0

# One-hot encode labels for cross-entropy calculation
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes, dtype=np.float32)[labels]

y_train_one_hot = one_hot_encode(y_train_np, 10)
y_test_one_hot  = one_hot_encode(y_test_np, 10)

# Neural Network Architecture Parameters
input_size = 784
hidden_size = 128
output_size = 10
learning_rate = 0.01
epochs_np = 10
batch_size_np = 64

# Initialize weights and biases
np.random.seed(42) # for reproducibility
weights_input_hidden  = (np.random.randn(input_size, hidden_size).astype(np.float32) * 0.01)
bias_hidden           = np.zeros((1, hidden_size), dtype=np.float32)
weights_hidden_output = (np.random.randn(hidden_size, output_size).astype(np.float32) * 0.01)
bias_output           = np.zeros((1, output_size), dtype=np.float32)

# Activation Functions
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(np.float32)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # for numerical stability
    return exp_x / (np.sum(exp_x, axis=1, keepdims=True) + 1e-12)

# Loss Function: Categorical Cross-Entropy
def categorical_crossentropy(predictions, targets):
    num_samples = predictions.shape[0]
    predictions = np.clip(predictions, 1e-12, 1. - 1e-12)
    return float(-np.sum(targets * np.log(predictions)) / num_samples)

# Training Loop
print("\nTraining NumPy model (from scratch)...")
start_time_np = time.time()

for epoch in range(epochs_np):
    # Shuffle training data for each epoch
    permutation = np.random.permutation(x_train_np.shape[0])
    x_train_shuffled = x_train_np[permutation]
    y_train_shuffled = y_train_one_hot[permutation]

    for i in range(0, x_train_np.shape[0], batch_size_np):
        x_batch = x_train_shuffled[i:i+batch_size_np]
        y_batch = y_train_shuffled[i:i+batch_size_np]

        # Forward Propagation
        # Layer 1 (Input to Hidden)
        z1 = np.dot(x_batch, weights_input_hidden) + bias_hidden
        a1 = relu(z1)

        # Layer 2 (Hidden to Output)
        z2 = np.dot(a1, weights_hidden_output) + bias_output
        a2 = softmax(z2) # Output probabilities

        # Backward Propagation
        dz2 = a2 - y_batch # Derivative of cross-entropy with softmax

        d_weights_hidden_output = np.dot(a1.T, dz2)
        d_bias_output = np.sum(dz2, axis=0, keepdims=True)

        da1 = np.dot(dz2, weights_hidden_output.T)
        dz1 = da1 * relu_derivative(z1)

        d_weights_input_hidden = np.dot(x_batch.T, dz1)
        d_bias_hidden = np.sum(dz1, axis=0, keepdims=True)

        # Update Weights and Biases (SGD)
        weights_hidden_output -= learning_rate * d_weights_hidden_output
        bias_output          -= learning_rate * d_bias_output
        weights_input_hidden -= learning_rate * d_weights_input_hidden
        bias_hidden          -= learning_rate * d_bias_hidden

numpy_training_time = time.time() - start_time_np
record_time("numpy_from_scratch_cpu", numpy_training_time)
print(f"NumPy Training Time (from scratch): {numpy_training_time:.2f} seconds")

# Evaluate NumPy model
z1_test = np.dot(x_test_np, weights_input_hidden) + bias_hidden
a1_test = relu(z1_test)
z2_test = np.dot(a1_test, weights_hidden_output) + bias_output
a2_test = softmax(z2_test)

predictions_np = np.argmax(a2_test, axis=1)
accuracy_np = np.mean(predictions_np == y_test_np)
print(f"NumPy Test Accuracy (from scratch): {accuracy_np:.4f}")



Training NumPy model (from scratch)...
NumPy Training Time (from scratch): 7.79 seconds
NumPy Test Accuracy (from scratch): 0.9776


**comparison of  keras and Numpy implemenation speed-up **

In [19]:
print(f"\n--- Speed-Up Analysis ---")
print(f"Keras Training Time (CPU): {keras_training_time:.2f} seconds")
print(f"NumPy Training Time (from scratch): {numpy_training_time:.2f} seconds")

if numpy_training_time > 0:
    speed_up = numpy_training_time / keras_training_time
    print(f"Speed-up of Keras (CPU) over NumPy (from scratch): {speed_up:.2f}x")
else:
    print("NumPy training time was zero, cannot calculate speed-up.")



--- Speed-Up Analysis ---
Keras Training Time (CPU): 25.12 seconds
NumPy Training Time (from scratch): 7.79 seconds
Speed-up of Keras (CPU) over NumPy (from scratch): 0.31x


**Tiling for Matrix multiplication**

In [11]:
def naive_matrix_multiply(A, B):
    """Naive matrix multiplication without any optimizations."""
    rows_A, cols_A = A.shape
    rows_B, cols_B = B.shape
    if cols_A != rows_B:
        raise ValueError("Matrices A and B cannot be multiplied.")

    C = np.zeros((rows_A, cols_B))
    for i in range(rows_A):
        for j in range(cols_B):
            for k in range(cols_A):
                C[i, j] += A[i, k] * B[k, j]
    return C

def tiled_matrix_multiply(A, B, block_size):
    """Matrix multiplication with tiling optimization."""
    rows_A, cols_A = A.shape
    rows_B, cols_B = B.shape
    if cols_A != rows_B:
        raise ValueError("Matrices A and B cannot be multiplied.")

    C = np.zeros((rows_A, cols_B))

    for i1 in range(0, rows_A, block_size):
        for j1 in range(0, cols_B, block_size):
            for k1 in range(0, cols_A, block_size):
                # Process block
                for i in range(i1, min(i1 + block_size, rows_A)):
                    for j in range(j1, min(j1 + block_size, cols_B)):
                        for k in range(k1, min(k1 + block_size, cols_A)):
                            C[i, j] += A[i, k] * B[k, j]
    return C

# Generate large random matrices for benchmarking
matrix_size = 256 # For demonstration, choose a size that runs in reasonable time
A = np.random.rand(matrix_size, matrix_size)
B = np.random.rand(matrix_size, matrix_size)
block_size = 32 # Example block size

print(f"\n--- Matrix Multiplication Benchmarking (Size: {matrix_size}x{matrix_size}) ---")

# Benchmark Naive
start_time = time.time()
C_naive = naive_matrix_multiply(A, B)
time_naive = time.time() - start_time
print(f"Naive matrix multiply time: {time_naive:.4f} seconds")

# Benchmark Tiled
start_time = time.time()
C_tiled = tiled_matrix_multiply(A, B, block_size)
time_tiled = time.time() - start_time
print(f"Tiled matrix multiply (block={block_size}) time: {time_tiled:.4f} seconds")

# Benchmark NumPy's optimized dot
start_time = time.time()
C_np = np.dot(A, B)
time_np = time.time() - start_time
print(f"NumPy np.dot time: {time_np:.4f} seconds")

# Verify correctness (optional)
# print(f"Are naive and numpy results close? {np.allclose(C_naive, C_np)}")
# print(f"Are tiled and numpy results close? {np.allclose(C_tiled, C_np)}")



--- Matrix Multiplication Benchmarking (Size: 256x256) ---
Naive matrix multiply time: 10.7569 seconds
Tiled matrix multiply (block=32) time: 10.6417 seconds
NumPy np.dot time: 0.0030 seconds



**GPU Implementation of 3 layer NN**

In [13]:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Attempt to import CuPy
try:
    import cupy as cp
    print("CuPy imported successfully.")
    _has_cuda = False
    try:
        _has_cuda = cp.cuda.runtime.getDeviceCount() > 0
    except Exception:
        pass
    if _has_cuda:
        print("CUDA GPU is available and detected by CuPy.")
        array_lib = cp
        cp.cuda.Device(0).use()
    else:
        print("CUDA GPU not available. Falling back to NumPy (CPU).")
        array_lib = np
except ImportError:
    print("CuPy not installed. Falling back to NumPy (CPU).")
    array_lib = np

# Keep TensorFlow off GPU so it doesn't block CuPy
try:
    tf.config.set_visible_devices([], 'GPU')
except Exception:
    pass

# Load MNIST
(x_train_np, y_train_np), (x_test_np, y_test_np) = keras.datasets.mnist.load_data()
x_train_np = x_train_np.reshape(x_train_np.shape[0], -1).astype("float32") / 255.0
x_test_np = x_test_np.reshape(x_test_np.shape[0], -1).astype("float32") / 255.0

# One-hot encode
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels]
y_train_one_hot_np = one_hot_encode(y_train_np, 10)
y_test_one_hot_np = one_hot_encode(y_test_np, 10)

# Transfer data
x_train = array_lib.asarray(x_train_np)
y_train_one_hot = array_lib.asarray(y_train_one_hot_np)
x_test = array_lib.asarray(x_test_np)
y_test = array_lib.asarray(y_test_np)

# Network params
input_size, hidden_size, output_size = 784, 128, 10
learning_rate, epochs, batch_size = 0.01, 10, 128

# Init weights
array_lib.random.seed(42)
W1 = array_lib.random.randn(input_size, hidden_size) * 0.01
b1 = array_lib.zeros((1, hidden_size))
W2 = array_lib.random.randn(hidden_size, output_size) * 0.01
b2 = array_lib.zeros((1, output_size))

# Activation funcs
def relu(x): return array_lib.maximum(0, x)
def relu_derivative(x): return (x > 0).astype(array_lib.float32)
def softmax(x):
    exp_x = array_lib.exp(x - array_lib.max(x, axis=1, keepdims=True))
    return exp_x / array_lib.sum(exp_x, axis=1, keepdims=True)

def cross_entropy(pred, target):
    pred = array_lib.clip(pred, 1e-12, 1. - 1e-12)
    return -array_lib.sum(target * array_lib.log(pred)) / pred.shape[0]

print(f"\nTraining from-scratch model on {'GPU' if array_lib.__name__=='cupy' else 'CPU'}...")
start = time.time()

for ep in range(epochs):
    idx = np.random.permutation(x_train.shape[0])
    x_shuf = x_train[array_lib.asarray(idx)]
    y_shuf = y_train_one_hot[array_lib.asarray(idx)]

    for i in range(0, x_train.shape[0], batch_size):
        xb, yb = x_shuf[i:i+batch_size], y_shuf[i:i+batch_size]
        z1 = array_lib.dot(xb, W1) + b1
        a1 = relu(z1)
        z2 = array_lib.dot(a1, W2) + b2
        a2 = softmax(z2)

        dz2 = a2 - yb
        dW2 = array_lib.dot(a1.T, dz2)
        db2 = array_lib.sum(dz2, axis=0, keepdims=True)
        da1 = array_lib.dot(dz2, W2.T)
        dz1 = da1 * relu_derivative(z1)
        dW1 = array_lib.dot(xb.T, dz1)
        db1 = array_lib.sum(dz1, axis=0, keepdims=True)

        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1

    if ep % 2 == 0:
        loss = cross_entropy(a2, yb)
        print(f"Epoch {ep+1}/{epochs}, Loss: {float(loss.get() if hasattr(loss,'get') else loss):.4f}")

gpu_training_time = time.time() - start
record_time("from_scratch_gpu", gpu_training_time)
print(f"Training time: {gpu_training_time:.2f}s")

# Test
z1 = array_lib.dot(x_test, W1) + b1
a1 = relu(z1)
z2 = array_lib.dot(a1, W2) + b2
a2 = softmax(z2)

pred = array_lib.argmax(a2, axis=1)
pred = pred.get() if hasattr(pred, "get") else pred
acc = np.mean(pred == y_test_np)
print(f"Test Accuracy: {acc:.4f}")


CuPy imported successfully.
CUDA GPU is available and detected by CuPy.

Training from-scratch model on GPU...
Epoch 1/10, Loss: 0.2479
Epoch 3/10, Loss: 0.1511
Epoch 5/10, Loss: 0.1115
Epoch 7/10, Loss: 0.0740
Epoch 9/10, Loss: 0.0240
Training time: 5.55s
Test Accuracy: 0.9686


In [20]:
gpu_t   = get_time("from_scratch_gpu")
numpy_t = get_time("numpy_from_scratch_cpu")
keras_t = get_time("keras_cpu")

print("\n--- Speed-Up Analysis (dynamic, measured) ---")
print(f"NumPy (CPU):   {fmt(numpy_t)} s")
print(f"Keras (CPU):   {fmt(keras_t)} s")
print(f"CuPy (GPU):    {fmt(gpu_t)} s")

if gpu_t:
    if numpy_t:
        print(f"Speed-up GPU vs NumPy CPU: {numpy_t / gpu_t:.2f}x")
    if keras_t:
        print(f"Speed-up GPU vs Keras CPU: {keras_t / gpu_t:.2f}x")
else:
    print("Run GPU cell first to record its timing.")


--- Speed-Up Analysis (dynamic, measured) ---
NumPy (CPU):   7.79 s
Keras (CPU):   25.12 s
CuPy (GPU):    5.55 s
Speed-up GPU vs NumPy CPU: 1.40x
Speed-up GPU vs Keras CPU: 4.53x
