In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
np.random.seed(42)

In [24]:
#dataset
X=np.random.uniform(-2, 2, (400, 3))
y=(np.sin(X[:,0])+0.5*(X[:,1]**2)-0.8*X[:,2])
y=y.reshape(-1,1)

X=X.T
y=y.T

In [25]:
def count_params(dims):
    total=0
    for i in range(1,len(dims)):
        p=dims[i]*dims[i-1]+dims[i]
        print(f"  Layer {i}: {dims[i]}x{dims[i-1]} + {dims[i]} = {p}")
        total+=p
    print(f"  Total: {total}\n")

models={
    "Model A - Shallow   [3,4,1]":[3,4,1],
    "Model B - Medium    [3,6,6,1]":[3,6,6,1],
    "Model C - Deep      [3,8,8,8,8,1]":[3,8,8,8,8,1],
    "Model D - VeryDeep  [3,8x8,1]":[3,8,8,8,8,8,8,8,8,1],
}

for name,dims in models.items():
    print(name)
    count_params(dims)

Model A - Shallow   [3,4,1]
  Layer 1: 4x3 + 4 = 16
  Layer 2: 1x4 + 1 = 5
  Total: 21

Model B - Medium    [3,6,6,1]
  Layer 1: 6x3 + 6 = 24
  Layer 2: 6x6 + 6 = 42
  Layer 3: 1x6 + 1 = 7
  Total: 73

Model C - Deep      [3,8,8,8,8,1]
  Layer 1: 8x3 + 8 = 32
  Layer 2: 8x8 + 8 = 72
  Layer 3: 8x8 + 8 = 72
  Layer 4: 8x8 + 8 = 72
  Layer 5: 1x8 + 1 = 9
  Total: 257

Model D - VeryDeep  [3,8x8,1]
  Layer 1: 8x3 + 8 = 32
  Layer 2: 8x8 + 8 = 72
  Layer 3: 8x8 + 8 = 72
  Layer 4: 8x8 + 8 = 72
  Layer 5: 8x8 + 8 = 72
  Layer 6: 8x8 + 8 = 72
  Layer 7: 8x8 + 8 = 72
  Layer 8: 8x8 + 8 = 72
  Layer 9: 1x8 + 1 = 9
  Total: 545



In [27]:
def relu(z):        
    return np.maximum(0, z)

def relu_d(z):      
    return (z > 0).astype(float)

def sigmoid(z):     
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

def sigmoid_d(z):
    s = sigmoid(z)
    return s * (1 - s)

def tanh_fn(z):    
    return np.tanh(z)

def tanh_d(z):      
    return 1 - np.tanh(z) ** 2

ALPHA = 0.01   
def leaky_relu(z):   
    return np.where(z > 0, z, ALPHA * z)

def leaky_relu_d(z): 
    return np.where(z > 0, 1.0, ALPHA)

def softplus(z):    
    return np.log1p(np.exp(np.clip(z, -500, 500)))

def softplus_d(z):  
    return sigmoid(z)   

In [28]:
ACTS={
    "relu":(relu,relu_d),
    "sigmoid":(sigmoid,sigmoid_d),
    "tanh":(tanh_fn,tanh_d),
    "leaky_relu":(leaky_relu,leaky_relu_d),
    "softplus":(softplus,softplus_d),
}

In [29]:
def init(dims):
    params=[]
    for i in range(1,len(dims)):
        W=np.random.uniform(-0.5,0.5,(dims[i],dims[i-1]))
        b=np.zeros((dims[i],1))
        params.append((W,b))
    return params

In [30]:
def forward(X,params,act):
    cache=[]
    A=X
    for i,(W,b) in enumerate(params):
        Z=W@A+b
        A_prev=A
        if i==len(params)-1:
            A=Z
        else:
            A=act(Z)
        cache.append((A_prev,Z))
    return A,cache

In [31]:
def mse(pred,true):
    return np.mean((pred-true)**2)

def mse_grad(pred,true):
    N=true.shape[1]
    return (2/N)*(pred-true)

In [32]:
def backward(dA,params,cache,act_d):
    grads=[None]*len(params)
    for i in reversed(range(len(params))):
        W,b=params[i]
        A_prev,Z=cache[i]
        N=A_prev.shape[1]
        if i==len(params)-1:
            dZ=dA
        else:
            dZ=dA*act_d(Z)
        dW=(1/N)*(dZ@A_prev.T)
        db=(1/N)*np.sum(dZ,axis=1,keepdims=True)
        dA=W.T@dZ
        grads[i]=(dW,db)
    return grads

In [33]:
def update(params,grads,lr):
    return[(W-lr*dW,b-lr*db)for (W,b),(dW,db) in zip(params,grads)]

def grad_norm(dW):
    return np.sqrt(np.sum(dW**2))

In [34]:
def train(dims,act_name="relu",lr=0.01,epochs=1000,verbose=True):
    act,act_d=ACTS[act_name]
    params=init(dims)
    history=[]
    for ep in range(1,epochs+1):
        pred,cache=forward(X,params,act)
        loss=mse(pred,y)
        history.append(loss)
        grads=backward(mse_grad(pred,y),params,cache,act_d)
        params=update(params,grads,lr)
        if verbose and (ep%200==0 or ep==1):
            print(f"  Epoch {ep:4d} | Loss: {loss:.5f}")
    pred,cache=forward(X,params,act)
    grads=backward(mse_grad(pred,y),params,cache,act_d)
    n_hidden=len(params)-1
    return{
        "history":history,
        "final":history[-1],
        "ep200":history[199],
        "gn_first":grad_norm(grads[0][0]),
        "gn_last":grad_norm(grads[n_hidden-1][0]),
    }

In [35]:
configs=[
    ("Model A - Shallow",[3,4,1],"relu"),
    ("Model B - Medium",[3,6,6,1],"relu"),
    ("Model C - Deep 4h",[3,8,8,8,8,1],"relu"),
    ("Model D - Deep 8h",[3,8,8,8,8,8,8,8,8,1],"relu"),
    ("Model D - Deep 8h",[3,8,8,8,8,8,8,8,8,1],"sigmoid"),
]

table=[]
for name,dims,act in configs:
    print(f"\n>> {name} | {act}")
    r=train(dims,act)
    table.append((name,act,r["final"],r["ep200"],r["gn_first"],r["gn_last"]))

print(f"{'Model':<22} {'Act':<10} {'FinalLoss':>10} {'Loss@200':>10} {'GradNorm_L1':>13} {'GradNorm_Last':>14}")
for row in table:
    print(f"{row[0]:<22} {row[1]:<10} {row[2]:>10.5f} {row[3]:>10.5f} {row[4]:>13.6f} {row[5]:>14.6f}")


>> Model A - Shallow | relu
  Epoch    1 | Loss: 1.87697
  Epoch  200 | Loss: 1.86785
  Epoch  400 | Loss: 1.85874
  Epoch  600 | Loss: 1.84970
  Epoch  800 | Loss: 1.84074
  Epoch 1000 | Loss: 1.83184

>> Model B - Medium | relu
  Epoch    1 | Loss: 2.07834
  Epoch  200 | Loss: 2.06590
  Epoch  400 | Loss: 2.05373
  Epoch  600 | Loss: 2.04188
  Epoch  800 | Loss: 2.03035
  Epoch 1000 | Loss: 2.01911

>> Model C - Deep 4h | relu
  Epoch    1 | Loss: 2.17555
  Epoch  200 | Loss: 2.16488
  Epoch  400 | Loss: 2.15443
  Epoch  600 | Loss: 2.14426
  Epoch  800 | Loss: 2.13436
  Epoch 1000 | Loss: 2.12473

>> Model D - Deep 8h | relu
  Epoch    1 | Loss: 2.16085
  Epoch  200 | Loss: 2.14992
  Epoch  400 | Loss: 2.13951
  Epoch  600 | Loss: 2.12959
  Epoch  800 | Loss: 2.11996
  Epoch 1000 | Loss: 2.11067

>> Model D - Deep 8h | sigmoid
  Epoch    1 | Loss: 2.37204
  Epoch  200 | Loss: 2.33711
  Epoch  400 | Loss: 2.30408
  Epoch  600 | Loss: 2.27301
  Epoch  800 | Loss: 2.24377
  Epoch 1000

Q1. Did deeper networks always reduce the loss faster?
ans : not always. model A acheived the lowest final loss , as the depth of the model increases the final loss actually became worse for this case.

Q2.Did gradients stay similar across layers?
ans : no , as the network became deeper the gradient norm in the 1st layer decreased significantly -> vanishing gradient problem 

Q3. Was training equally stable for all activations?
ans: no the deep model with sigmoid had the highest loss which indicates there is unstability and slow learning as compared to Relu

Q4. Which activation was more stable in deep networks?
ans : in this case Relu was more stable as compared to Sigmoid 

Q5. Did some models improve very slowly despite using the same learning rate?
ans : yes deep 8 layer model with sigmoid activation improved slowly .