In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import tensorflow as tf

In [2]:
mobile_train = pd.read_csv('mobile_train.csv')
mobile_test = pd.read_csv('mobile_test.csv')
mobile_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
mobile_train.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [4]:
mobile_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

Drawing parallels from the previous notebook we will be removing the correlated input variables using VIF

In [5]:
mobile_train_vif = mobile_train.drop(['price_range'], axis=1)

def calculate_vif(data_frame):
    features = data_frame.columns
    vif_data = pd.DataFrame()
    vif_data["Feature"] = features
    vif_data["VIF"] = [variance_inflation_factor(data_frame.values, i) for i in range(data_frame.shape[1])]
    return vif_data.sort_values(by='VIF', ascending=False)
    
def drop_high_vif_features(data_frame, threshold=5):
    while True:
        vif_results = calculate_vif(data_frame)
        max_vif_feature = vif_results.loc[vif_results['VIF'].idxmax(), 'Feature']
        max_vif_value = vif_results.loc[vif_results['VIF'].idxmax(), 'VIF']
        
        if max_vif_value > threshold:
            print(f"Dropping feature '{max_vif_feature}' with VIF {max_vif_value}")
            data_frame = data_frame.drop(columns=max_vif_feature)
        else:
            break
    return data_frame
mobile_train_vif = drop_high_vif_features(mobile_train_vif)

Dropping feature 'mobile_wt' with VIF 12.972548425819065
Dropping feature 'px_width' with VIF 11.470014131904488
Dropping feature 'sc_h' with VIF 11.086593845458365
Dropping feature 'battery_power' with VIF 7.543843177190293
Dropping feature 'pc' with VIF 6.050059878559392
Dropping feature 'three_g' with VIF 5.930418164840767


In [6]:
X_vif = mobile_train_vif
y_vif = mobile_train['price_range']
X_train_vif, X_test_vif, y_train_vif, y_test_vif = train_test_split(X_vif, y_vif, test_size=0.2, random_state=42)
## now lets standardize the input data
scaler = StandardScaler()
X_train_vif_scaled = scaler.fit_transform(X_train_vif)
X_test_vif_scaled = scaler.transform(X_test_vif)

### Logistic Regression

We will predict results using a base case logistic regression model from sk learn library and use it for benchmarking as we move forward

In [7]:
model_vif = LogisticRegression()
model_vif.fit(X_train_vif_scaled, y_train_vif)

# Make predictions
y_pred_vif = model_vif.predict(X_test_vif_scaled)

# Check accuracy of model
accuracy_lr = accuracy_score(y_test_vif, y_pred_vif)
conf_matrix_lr = confusion_matrix(y_test_vif, y_pred_vif)
classification_rep_lr = classification_report(y_test_vif, y_pred_vif)

print(f"Accuracy: {accuracy_lr}")
print(f"Confusion Matrix:\n{conf_matrix_lr}")
print(f"Classification Report:\n{classification_rep_lr}")

Accuracy: 0.79
Confusion Matrix:
[[93 12  0  0]
 [12 61 18  0]
 [ 0 10 68 14]
 [ 0  0 18 94]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       105
           1       0.73      0.67      0.70        91
           2       0.65      0.74      0.69        92
           3       0.87      0.84      0.85       112

    accuracy                           0.79       400
   macro avg       0.79      0.78      0.78       400
weighted avg       0.79      0.79      0.79       400



### Logistic Regression - Neural Network implementation

In [8]:
# transforming the data into dimensions (n,m) where m denoted the number of examples for ease of computation
X_train_nn, X_test_nn = X_train_vif_scaled.T, X_test_vif_scaled.T
y_train_nn,y_test_nn = y_train_vif.to_numpy().reshape((1,1600)), y_test_vif.to_numpy().reshape((1,400))

In [30]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # for numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def initialize_parameters(n, num_classes):
    w = np.random.randn(n, num_classes) * 0.01
    b = np.zeros((num_classes, 1))
    return w, b

def propagate(w, b, X, Y):
    m = X.shape[1]
    
    # Forward propagation
    Z = np.dot(w.T, X) + b
    A = softmax(Z)
    cost = -1/m * np.sum(np.log(A[Y, np.arange(m)]))
    
    # Backward propagation
    dz = A.copy()
    dz[Y, np.arange(m)] -= 1
    dw = 1/m * np.dot(X, dz.T)
    db = 1/m * np.sum(dz, axis=1, keepdims=True)
    
    grads = {"dw": dw, "db": db}
    
    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate):
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        
        # Update parameters
        w -= learning_rate * grads["dw"]
        b -= learning_rate * grads["db"]

        if i%200==0:
            print(f"cost after {i} iterations: {cost}")
    return w, b

def predict(w, b, X):
    Z = np.dot(w.T, X) + b
    A = softmax(Z)
    predictions = np.argmax(A, axis=0)
    return predictions

# we will try to encompass all of the above into one function called lr_nn_model

def lr_nn_model(X_train, Y_train, num_classes, num_iterations=1000, learning_rate=0.01):
    w, b = initialize_parameters(X_train.shape[0], num_classes)
    w, b = optimize(w, b, X_train, Y_train, num_iterations, learning_rate)
    return w, b
    
def accuracy(predictions, actual_labels):
    correct_predictions = np.sum(predictions == actual_labels)
    total_examples = len(actual_labels[0])
    acc = correct_predictions / total_examples
    return acc

In [31]:
w, b = lr_nn_model(X_train_nn,y_train_nn, 4)
train_predictions = predict(w,b,X_train_nn)
accuracy_scores = accuracy(train_predictions, y_train_nn)

test_predictions = predict(w, b, X_test_nn)
accuracy_scores_test = accuracy(test_predictions, y_test_nn)

cm1 = confusion_matrix(np.squeeze(y_train_nn), np.squeeze(train_predictions))
class_metrics1 = classification_report(np.squeeze(y_train_nn), np.squeeze(train_predictions))
cm = confusion_matrix(np.squeeze(y_test_nn), np.squeeze(test_predictions))
class_metrics = classification_report(np.squeeze(y_test_nn), np.squeeze(test_predictions))

print(f"Accuracy Train: {accuracy_scores}")
print("Confusion Matrix Train:")
print(cm1)

print("\nClassification Report Train:")
print(class_metrics1)

print(f"Accuracy: {accuracy_scores_test}")
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(class_metrics)

cost after 0 iterations: 1.3855577475207161
cost after 200 iterations: 1.103842012113078
cost after 400 iterations: 0.9799947344720613
cost after 600 iterations: 0.9092942253407958
cost after 800 iterations: 0.8612238743688948
Accuracy Train: 0.69625
Confusion Matrix Train:
[[384  11   0   0]
 [161 183  57   8]
 [  5  64 172 167]
 [  0   0  13 375]]

Classification Report Train:
              precision    recall  f1-score   support

           0       0.70      0.97      0.81       395
           1       0.71      0.45      0.55       409
           2       0.71      0.42      0.53       408
           3       0.68      0.97      0.80       388

    accuracy                           0.70      1600
   macro avg       0.70      0.70      0.67      1600
weighted avg       0.70      0.70      0.67      1600

Accuracy: 0.7225
Confusion Matrix:
[[105   0   0   0]
 [ 33  40  17   1]
 [  0  10  39  43]
 [  0   0   7 105]]

Classification Report:
              precision    recall  f1-score   s

One hygiene check to see if the implementation of the neural network is correct, is to see for a reducing cost with successive iterations, which is evident in the above case

### Neural Network with 1 Hidden layer

Now lets build further on the neural network implementation of logistic regression and add one hidden layer to the neural network

In [38]:
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # for numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def initialize_parameters(n, n_hidden, num_classes):
    w1 = np.random.randn(n_hidden, n) * np.sqrt(2 / n)  # Xavier initialization
    b1 = np.zeros((n_hidden, 1))
    w2 = np.random.randn(num_classes, n_hidden) * np.sqrt(2 / n_hidden)  # Xavier initialization
    b2 = np.zeros((num_classes, 1))
    return w1, b1, w2, b2

def forward_propagation(w1, b1, w2, b2, X, Y):
    Z1 = np.dot(w1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = softmax(Z2)
    
    # compute the cost
    m = X.shape[1]
    num_classes = 4  ## remember to change this as number of classes changes
    Y_one_hot = np.eye(num_classes)[Y].T
    Y_one_hot = Y_one_hot.reshape(A2.shape) # Ensure shapes are compatible
    # avoid numerical instability
    epsilon = 1e-15
    A2 = np.maximum(epsilon, A2)
    cost = -np.sum(Y_one_hot * np.log(A2)) / m
    
    return A1, A2, cost

def backward_propagation(A1, A2, w2, X, Y):
    m = X.shape[1]
    
    # dz2 = A2 - Y
    dz2 = A2.copy()
    dz2[Y, np.arange(m)] -= 1
    dw2 = 1/m * np.dot(dz2, A1.T)
    db2 = 1/m * np.sum(dz2, axis=1, keepdims=True)
    
    dz1 = np.dot(w2.T, dz2) * relu_derivative(A1)
    dw1 = 1/m * np.dot(dz1, X.T)
    db1 = 1/m * np.sum(dz1, axis=1, keepdims=True)
    
    grads = {"dw1": dw1, "db1": db1, "dw2": dw2, "db2": db2}
    
    return grads

def update_parameters(w1, b1, w2, b2, grads, learning_rate):
    w1 -= learning_rate * grads["dw1"]
    b1 -= learning_rate * grads["db1"]
    w2 -= learning_rate * grads["dw2"]
    b2 -= learning_rate * grads["db2"]
    return w1, b1, w2, b2

def predict(w1, b1, w2, b2, X, Y):
    _, A2, _ = forward_propagation(w1, b1, w2, b2, X,Y)
    predictions = np.argmax(A2, axis=0)
    return predictions

def nn_1_layer_model(X_train, Y_train, X_test, Y_test, num_classes, n_hidden, num_iterations=1000, learning_rate=0.01):
    # Initialize parameters
    w1, b1, w2, b2 = initialize_parameters(X_train.shape[0], n_hidden, num_classes)

    # Train the neural network
    for i in range(num_iterations):
        A1, A2, cost = forward_propagation(w1, b1, w2, b2, X_train, Y_train)
        grads = backward_propagation(A1, A2, w2, X_train, Y_train)
        w1, b1, w2, b2 = update_parameters(w1, b1, w2, b2, grads, learning_rate)

        if i%1000==0:
            print(f"cost after {i} iterations: {cost}")

    # Make predictions on the training set
    train_predictions = predict(w1, b1, w2, b2, X_train, Y_train)

    # Make predictions on the test set
    test_predictions = predict(w1, b1, w2, b2, X_test, Y_test)

    # Evaluate the model on the training set
    print("Training Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_train), np.squeeze(train_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_train), np.squeeze(train_predictions)))

    # Evaluate the model on the test set
    print("\nTest Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_test), np.squeeze(test_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_test), np.squeeze(test_predictions)))

In [40]:
nn_1_layer_model(X_train_nn, y_train_nn, X_test_nn, y_test_nn, num_classes=4, n_hidden=4, num_iterations=10000, learning_rate=0.009)

cost after 0 iterations: 2.5509984584410486
cost after 1000 iterations: 0.9110399802496653
cost after 2000 iterations: 0.7973673418248804
cost after 3000 iterations: 0.7007531085186158
cost after 4000 iterations: 0.5868493321850465
cost after 5000 iterations: 0.5228581663845593
cost after 6000 iterations: 0.48994998273375984
cost after 7000 iterations: 0.4717622326998417
cost after 8000 iterations: 0.46170851220685166
cost after 9000 iterations: 0.4559979890973614
Training Set Evaluation:
Confusion Matrix:
[[347  48   0   0]
 [ 49 289  71   0]
 [  0  48 305  55]
 [  0   0  48 340]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       395
           1       0.75      0.71      0.73       409
           2       0.72      0.75      0.73       408
           3       0.86      0.88      0.87       388

    accuracy                           0.80      1600
   macro avg       0.80      0.80      0.80      1600
weighte

Although we get reasonalbly good results in test set, our model isnt able to fit the training set well enough. so lets focus on that and get it better by running a deeper neural networ model

### Deep Neural Network

In [42]:
layer_dims = [14,3,4,4]
initialize_parameters_deep(layer_dims)

{'W1': array([[ 0.67603802,  0.16498522,  0.03647261, -0.70433404, -0.10484289,
         -0.13408629, -0.03127334, -0.23698398, -0.01656171, -0.18037146,
         -0.4965942 ,  0.33435583,  0.33310691,  0.64615788],
        [ 0.01891094, -0.15295369, -0.20612669, -0.58451348,  0.37129999,
         -0.41616445, -0.44790549, -0.07772836,  0.56171128,  0.08947034,
         -0.38695441, -0.2694861 ,  0.23632038, -0.06066835],
        [-0.29059283, -0.08694344,  0.2816048 ,  0.74689967, -0.47023442,
         -0.23676334, -0.30379503, -0.9143275 , -0.34916056, -0.38698866,
          0.42482374, -0.04985889, -0.61354423,  0.24442035]]),
 'b1': array([[0.],
        [0.],
        [0.]]),
 'W2': array([[-0.29089386, -1.4232687 , -0.48716239],
        [-0.4805853 , -0.71352191,  0.02426123],
        [-1.83569478, -0.21862665,  0.82726082],
        [ 0.69630652,  0.9048313 ,  0.91397864]]),
 'b2': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'W3': array([[ 1.05185184, -0.790758  ,  0.

In [43]:
np.random.seed(3)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # for numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def initialize_parameters_deep(layer_dims):
    """
    Input is of length L layers including the input layer, but the input layer 
    doesnt have any parameters so the length of the parameters dict 
    will be 2(l-1) - 2 parameters (w,b) for each of the l-1 layers (excluding input layer)
    """
    parameters = {}
    L = len(layer_dims) # 

    for l in range(1, L):
        parameters[f'W{l}'] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters[f'b{l}'] = np.zeros((layer_dims[l], 1))

    return parameters

def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

def activation_forward(Z, activation):
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)

    cache = Z
    return A, cache

def forward_propagation_deep(X, parameters, activations):
    caches = []
    A = X
    L = len(parameters) // 2

    for l in range(1, L):
        A_prev = A
        Z, linear_cache = linear_forward(A_prev, parameters[f'W{l}'], parameters[f'b{l}'])
        A, activation_cache = activation_forward(Z, "relu")
        cache = (linear_cache, activation_cache)
        caches.append(cache)

    # Last layer (softmax activation)
    ZL, linear_cache = linear_forward(A, parameters[f'W{L}'], parameters[f'b{L}'])
    AL, activation_cache = activation_forward(ZL, "softmax")
    cache = (linear_cache, activation_cache)
    caches.append(cache)

    return AL, caches

def compute_cost(AL, Y):
    m = Y.shape[1]
    epsilon = 1e-8
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze() # since the output layer is of dimension (4,m) we try to make y also same dim
    cost = -1/m * np.sum(Y_one_hot * np.log(AL+epsilon))
    return cost

def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def activation_backward(dA, cache, activation):
    Z = cache
    if activation == "relu":
        dZ = dA * relu_derivative(Z)
    elif activation == "softmax":
        dZ = dA

    return dZ

def backward_propagation_deep(AL, Y, caches, activations):
    grads = {}
    L = len(caches)

    # Convert Y to one-hot encoded matrix
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()

    # Compute gradient of the cost with respect to AL for softmax activation
    dAL = AL - Y_one_hot
    grads['dAL'] = dAL
    
    # Last layer (softmax activation)
    current_cache = caches[L-1]
    linear_cache, activation_cache = current_cache
    dZL = activation_backward(dAL, activation_cache, "softmax")
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = linear_backward(dZL, linear_cache)

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        linear_cache, activation_cache = current_cache
        dZ = activation_backward(grads[f'dA{l+1}'], activation_cache, "relu")
        grads[f'dA{l}'], grads[f'dW{l+1}'], grads[f'db{l+1}'] = linear_backward(dZ, linear_cache)

    return grads

def update_parameters_deep(parameters, grads, learning_rate):
    L = len(parameters) // 2

    for l in range(1, L+1):
        parameters[f'W{l}'] -= learning_rate * grads[f'dW{l}']
        parameters[f'b{l}'] -= learning_rate * grads[f'db{l}']

    return parameters

def predict_deep(parameters, X, activations):
    AL, _ = forward_propagation_deep(X, parameters, activations)
    predictions = np.argmax(AL, axis=0)
    return predictions

def nn_deep_model(X_train, Y_train, X_test, Y_test, layer_dims, num_iterations=1000, learning_rate=0.01):
    activations = ["relu"] * (len(layer_dims) - 2) + ["softmax"]
    
    # Initialize parameters
    parameters = initialize_parameters_deep(layer_dims)

    # Train the neural network
    for i in range(num_iterations):
        AL, caches = forward_propagation_deep(X_train, parameters, activations)
        cost = compute_cost(AL, Y_train)
        grads = backward_propagation_deep(AL, Y_train, caches, activations)
        parameters = update_parameters_deep(parameters, grads, learning_rate)

        if i % 1000 == 0:
            print(f'Cost after iteration {i}: {cost}')
            # print(f'Mean activation: {np.mean(AL)}, Std activation: {np.std(AL)}')
            # print(f'Mean dAL: {np.mean(grads["dAL"])}, Std dAL: {np.std(grads["dAL"])}')

    # Make predictions on the training set
    train_predictions = predict_deep(parameters, X_train, activations)

    # Make predictions on the test set
    test_predictions = predict_deep(parameters, X_test, activations)

    # Evaluate the model on the training set
    print("Training Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_train), np.squeeze(train_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_train), np.squeeze(train_predictions)))

    # Evaluate the model on the test set
    print("\nTest Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_test), np.squeeze(test_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_test), np.squeeze(test_predictions)))

In [44]:
np.random.seed(3)
layer_dims = [14, 4, 3, 4, 4]
nn_deep_model(X_train_nn, y_train_nn, X_test_nn, y_test_nn, layer_dims, num_iterations=6000, learning_rate=0.009)

Cost after iteration 0: 3.162721785015146
Cost after iteration 1000: 1.3720866117383517
Cost after iteration 2000: 1.241635525835633
Cost after iteration 3000: 0.88636428814306
Cost after iteration 4000: 0.7688856928124054
Cost after iteration 5000: 0.6697639782150998
Training Set Evaluation:
Confusion Matrix:
[[352  43   0   0]
 [ 55 297  57   0]
 [  0  68 274  66]
 [  0   0  37 351]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       395
           1       0.73      0.73      0.73       409
           2       0.74      0.67      0.71       408
           3       0.84      0.90      0.87       388

    accuracy                           0.80      1600
   macro avg       0.79      0.80      0.80      1600
weighted avg       0.79      0.80      0.79      1600


Test Set Evaluation:
Confusion Matrix:
[[94 11  0  0]
 [14 64 13  0]
 [ 0 11 66 15]
 [ 0  0 16 96]]

Classification Report:
              precision    

The Deep neural network model seems to performing considerably better than the initial 2 layer NN

This is evident from the improvement in training performance, but even then it has a pretty high bias. If we consider max achievable performance is 100% (which most probably might not be the case in most scenarios and the human level performance will be lesser than 100% but due to lack of conclusive data about human level performance we will assume that it is 100%), then the deep NN model still has quite some scope for improvement.

Let's add Batch normalization to the activations of each layer and then feed it forward to the next layer, this helps in making the gradient descent more faster and computationally simpler

### Batch Normalization

In [45]:
np.random.seed(3)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def batch_normalize(Z):
    mean = np.mean(Z, axis=1, keepdims=True)
    var = np.var(Z, axis=1, keepdims=True)
    Z_normalized = (Z - mean) / np.sqrt(var + 1e-8)
    return Z_normalized, mean, var

def initialize_parameters_deep(layer_dims):
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters[f'W{l}'] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters[f'b{l}'] = np.zeros((layer_dims[l], 1))
        parameters[f'gamma{l}'] = np.ones((layer_dims[l], 1))
        parameters[f'beta{l}'] = np.zeros((layer_dims[l], 1))

    return parameters

def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

def batch_normalize_forward(Z, gamma, beta):
    Z_normalized, mean, var = batch_normalize(Z)
    Z_tilde = gamma * Z_normalized + beta
    cache = (Z, Z_normalized, mean, var, gamma, beta)
    return Z_tilde, cache

def activation_forward(Z, activation):
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)

    cache = Z
    return A, cache

def forward_propagation_deep(X, parameters, activations):
    caches = []
    A = X
    L = len(parameters) // 4  # Considering gamma and beta for each layer

    for l in range(1, L):
        A_prev = A
        Z, linear_cache = linear_forward(A_prev, parameters[f'W{l}'], parameters[f'b{l}'])
        Z_tilde, batch_cache = batch_normalize_forward(Z, parameters[f'gamma{l}'], parameters[f'beta{l}'])
        A, activation_cache = activation_forward(Z_tilde, "relu")
        cache = (linear_cache, batch_cache, activation_cache)
        caches.append(cache)

    # Last layer (softmax activation)
    ZL, linear_cache = linear_forward(A, parameters[f'W{L}'], parameters[f'b{L}'])
    ZL_tilde, batch_cache = batch_normalize_forward(ZL, parameters[f'gamma{L}'], parameters[f'beta{L}'])
    AL, activation_cache = activation_forward(ZL_tilde, "softmax")
    cache = (linear_cache, batch_cache, activation_cache)
    caches.append(cache)

    return AL, caches

def compute_cost(AL, Y):
    m = Y.shape[1]
    epsilon = 1e-8
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()
    cost = -1/m * np.sum(Y_one_hot * np.log(AL+epsilon))
    return cost

def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def batch_normalize_backward(dZ_tilde, cache):
    Z, Z_normalized, mean, var, gamma, beta = cache
    m = Z.shape[1]

    dZ_normalized = dZ_tilde * gamma
    dVar = np.sum(dZ_normalized * (Z - mean), axis=1, keepdims=True) * -0.5 * (var + 1e-8)**(-1.5)
    dMean = np.sum(dZ_normalized, axis=1, keepdims=True) * -1 / np.sqrt(var + 1e-8)
    
    dZ = (dZ_normalized / np.sqrt(var + 1e-8)) + (dVar * 2 * (Z - mean) / m) + (dMean / m)
    dGamma = np.sum(dZ_tilde * Z_normalized, axis=1, keepdims=True)
    dBeta = np.sum(dZ_tilde, axis=1, keepdims=True)

    return dZ, dGamma, dBeta

def activation_backward(dA, cache, activation):
    Z = cache
    if activation == "relu":
        dZ = dA * relu_derivative(Z)
    elif activation == "softmax":
        dZ = dA

    return dZ

def backward_propagation_deep(AL, Y, caches, activations):
    grads = {}
    L = len(caches)

    # Convert Y to one-hot encoded matrix
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()

    # Compute gradient of the cost with respect to AL for softmax activation
    dAL = AL - Y_one_hot
    grads['dAL'] = dAL
    
    # Last layer (softmax activation)
    current_cache = caches[L-1]
    linear_cache, batch_cache, activation_cache = current_cache
    dZL_tilde = activation_backward(dAL, activation_cache, "softmax")
    dZL, dGammaL, dBetaL = batch_normalize_backward(dZL_tilde, batch_cache)
    dA_prev, dW, db = linear_backward(dZL, linear_cache)
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = dA_prev, dW, db
    grads[f'dGamma{L}'], grads[f'dBeta{L}'] = dGammaL, dBetaL

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        linear_cache, batch_cache, activation_cache = current_cache
        dZ_tilde = activation_backward(grads[f'dA{l+1}'], activation_cache, "relu")
        dZ, dGamma, dBeta = batch_normalize_backward(dZ_tilde, batch_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        grads[f'dA{l}'], grads[f'dW{l+1}'], grads[f'db{l+1}'] = dA_prev, dW, db
        grads[f'dGamma{l+1}'], grads[f'dBeta{l+1}'] = dGamma, dBeta

    return grads


def update_parameters_deep(parameters, grads, learning_rate):
    L = len(parameters) // 4

    for l in range(1, L+1):
        parameters[f'W{l}'] -= learning_rate * grads[f'dW{l}']
        parameters[f'b{l}'] -= learning_rate * grads[f'db{l}']
        parameters[f'gamma{l}'] -= learning_rate * grads[f'dGamma{l}']
        parameters[f'beta{l}'] -= learning_rate * grads[f'dBeta{l}']

    return parameters

def predict_deep(parameters, X, activations):
    AL, _ = forward_propagation_deep(X, parameters, activations)
    predictions = np.argmax(AL, axis=0)
    return predictions

def nn_deep_model(X_train, Y_train, X_test, Y_test, layer_dims, num_iterations=1000, learning_rate=0.01):
    activations = ["relu"] * (len(layer_dims) - 2) + ["softmax"]
    
    # Initialize parameters
    parameters = initialize_parameters_deep(layer_dims)

    # Train the neural network
    for i in range(num_iterations):
        AL, caches = forward_propagation_deep(X_train, parameters, activations)
        cost = compute_cost(AL, Y_train)
        grads = backward_propagation_deep(AL, Y_train, caches, activations)
        parameters = update_parameters_deep(parameters, grads, learning_rate)

        if i % 1000 == 0:
            print(f'Cost after iteration {i}: {cost}')
            # print(f'Mean activation: {np.mean(AL)}, Std activation: {np.std(AL)}')
            # print(f'Mean dAL: {np.mean(grads["dAL"])}, Std dAL: {np.std(grads["dAL"])}')

    # Make predictions on the training set
    train_predictions = predict_deep(parameters, X_train, activations)

    # Make predictions on the test set
    test_predictions = predict_deep(parameters, X_test, activations)

    # Evaluate the model on the training set
    print("Training Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_train), np.squeeze(train_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_train), np.squeeze(train_predictions)))

    # Evaluate the model on the test set
    print("\nTest Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_test), np.squeeze(test_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_test), np.squeeze(test_predictions)))

In [48]:
np.random.seed(3)
layer_dims = [X_train_nn.shape[0], 10, 8, 6, 4] 

# Train the neural network with batch normalization
nn_deep_model(X_train_nn, y_train_nn, X_test_nn, y_test_nn, layer_dims, num_iterations=9000, learning_rate=0.01)

Cost after iteration 0: 1.5282857458436325
Cost after iteration 1000: 0.6811081076799039
Cost after iteration 2000: 0.6286349260529619
Cost after iteration 3000: 0.5888168209792406
Cost after iteration 4000: 0.5724499696296601
Cost after iteration 5000: 0.5696689153123728
Cost after iteration 6000: 0.5551894111320845
Cost after iteration 7000: 0.556409692882914
Cost after iteration 8000: 0.54785582566557
Training Set Evaluation:
Confusion Matrix:
[[311  84   0   0]
 [ 16 378  15   0]
 [  0 133 176  99]
 [  0   1  30 357]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.79      0.86       395
           1       0.63      0.92      0.75       409
           2       0.80      0.43      0.56       408
           3       0.78      0.92      0.85       388

    accuracy                           0.76      1600
   macro avg       0.79      0.77      0.75      1600
weighted avg       0.79      0.76      0.75      1600


Test Set Eva

Adding batch normalization has helped the model train faster and computationally more efficient compared to before. However, there is still the issue of high bias in the model. 

So we will try to reduce the bias by trying out a different optimization technique - ADAM

### ADAM Optimization

In [51]:
np.random.seed(3)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def batch_normalize(Z):
    mean = np.mean(Z, axis=1, keepdims=True)
    var = np.var(Z, axis=1, keepdims=True)
    Z_normalized = (Z - mean) / np.sqrt(var + 1e-8)
    return Z_normalized, mean, var

def initialize_parameters_deep(layer_dims):
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters[f'W{l}'] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters[f'b{l}'] = np.zeros((layer_dims[l], 1))
        parameters[f'gamma{l}'] = np.ones((layer_dims[l], 1))
        parameters[f'beta{l}'] = np.zeros((layer_dims[l], 1))

    return parameters

def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

def batch_normalize_forward(Z, gamma, beta):
    Z_normalized, mean, var = batch_normalize(Z)
    Z_tilde = gamma * Z_normalized + beta
    cache = (Z, Z_normalized, mean, var, gamma, beta)
    return Z_tilde, cache

def activation_forward(Z, activation):
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)

    cache = Z
    return A, cache

def forward_propagation_deep(X, parameters, activations):
    caches = []
    A = X
    L = len(parameters) // 4  # Considering gamma and beta for each layer

    for l in range(1, L):
        A_prev = A
        Z, linear_cache = linear_forward(A_prev, parameters[f'W{l}'], parameters[f'b{l}'])
        Z_tilde, batch_cache = batch_normalize_forward(Z, parameters[f'gamma{l}'], parameters[f'beta{l}'])
        A, activation_cache = activation_forward(Z_tilde, "relu")
        cache = (linear_cache, batch_cache, activation_cache)
        caches.append(cache)

    # Last layer (softmax activation)
    ZL, linear_cache = linear_forward(A, parameters[f'W{L}'], parameters[f'b{L}'])
    ZL_tilde, batch_cache = batch_normalize_forward(ZL, parameters[f'gamma{L}'], parameters[f'beta{L}'])
    AL, activation_cache = activation_forward(ZL_tilde, "softmax")
    cache = (linear_cache, batch_cache, activation_cache)
    caches.append(cache)

    return AL, caches

def compute_cost(AL, Y):
    m = Y.shape[1]
    epsilon = 1e-8
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()
    cost = -1/m * np.sum(Y_one_hot * np.log(AL+epsilon))
    return cost

def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def batch_normalize_backward(dZ_tilde, cache):
    Z, Z_normalized, mean, var, gamma, beta = cache
    m = Z.shape[1]

    dZ_normalized = dZ_tilde * gamma
    dVar = np.sum(dZ_normalized * (Z - mean), axis=1, keepdims=True) * -0.5 * (var + 1e-8)**(-1.5)
    dMean = np.sum(dZ_normalized, axis=1, keepdims=True) * -1 / np.sqrt(var + 1e-8)
    
    dZ = (dZ_normalized / np.sqrt(var + 1e-8)) + (dVar * 2 * (Z - mean) / m) + (dMean / m)
    dGamma = np.sum(dZ_tilde * Z_normalized, axis=1, keepdims=True)
    dBeta = np.sum(dZ_tilde, axis=1, keepdims=True)

    return dZ, dGamma, dBeta

def activation_backward(dA, cache, activation):
    Z = cache
    if activation == "relu":
        dZ = dA * relu_derivative(Z)
    elif activation == "softmax":
        dZ = dA

    return dZ

def backward_propagation_deep(AL, Y, caches, activations):
    grads = {}
    L = len(caches)

    # Convert Y to one-hot encoded matrix
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()

    # Compute gradient of the cost with respect to AL for softmax activation
    dAL = AL - Y_one_hot
    grads['dAL'] = dAL
    
    # Last layer (softmax activation)
    current_cache = caches[L-1]
    linear_cache, batch_cache, activation_cache = current_cache
    dZL_tilde = activation_backward(dAL, activation_cache, "softmax")
    dZL, dGammaL, dBetaL = batch_normalize_backward(dZL_tilde, batch_cache)
    dA_prev, dW, db = linear_backward(dZL, linear_cache)
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = dA_prev, dW, db
    grads[f'dGamma{L}'], grads[f'dBeta{L}'] = dGammaL, dBetaL

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        linear_cache, batch_cache, activation_cache = current_cache
        dZ_tilde = activation_backward(grads[f'dA{l+1}'], activation_cache, "relu")
        dZ, dGamma, dBeta = batch_normalize_backward(dZ_tilde, batch_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        grads[f'dA{l}'], grads[f'dW{l+1}'], grads[f'db{l+1}'] = dA_prev, dW, db
        grads[f'dGamma{l+1}'], grads[f'dBeta{l+1}'] = dGamma, dBeta

    return grads

def initialize_adam(parameters):
    L = len(parameters) // 4  # Considering gamma and beta for each layer
    v = {}
    s = {}

    for l in range(1, L+1):
        v[f'dW{l}'] = np.zeros_like(parameters[f'W{l}'])
        v[f'db{l}'] = np.zeros_like(parameters[f'b{l}'])
        v[f'dGamma{l}'] = np.zeros_like(parameters[f'gamma{l}'])
        v[f'dBeta{l}'] = np.zeros_like(parameters[f'beta{l}'])

        s[f'dW{l}'] = np.zeros_like(parameters[f'W{l}'])
        s[f'db{l}'] = np.zeros_like(parameters[f'b{l}'])
        s[f'dGamma{l}'] = np.zeros_like(parameters[f'gamma{l}'])
        s[f'dBeta{l}'] = np.zeros_like(parameters[f'beta{l}'])

    return v, s

def update_parameters_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 4  # Considering gamma and beta for each layer

    for l in range(1, L+1):
        v[f'dW{l}'] = beta1 * v[f'dW{l}'] + (1 - beta1) * grads[f'dW{l}']
        v[f'db{l}'] = beta1 * v[f'db{l}'] + (1 - beta1) * grads[f'db{l}']
        v[f'dGamma{l}'] = beta1 * v[f'dGamma{l}'] + (1 - beta1) * grads[f'dGamma{l}']
        v[f'dBeta{l}'] = beta1 * v[f'dBeta{l}'] + (1 - beta1) * grads[f'dBeta{l}']

        s[f'dW{l}'] = beta2 * s[f'dW{l}'] + (1 - beta2) * (grads[f'dW{l}']**2)
        s[f'db{l}'] = beta2 * s[f'db{l}'] + (1 - beta2) * (grads[f'db{l}']**2)
        s[f'dGamma{l}'] = beta2 * s[f'dGamma{l}'] + (1 - beta2) * (grads[f'dGamma{l}']**2)
        s[f'dBeta{l}'] = beta2 * s[f'dBeta{l}'] + (1 - beta2) * (grads[f'dBeta{l}']**2)

        v_corrected_dW = v[f'dW{l}'] / (1 - beta1**t)
        v_corrected_db = v[f'db{l}'] / (1 - beta1**t)
        v_corrected_dGamma = v[f'dGamma{l}'] / (1 - beta1**t)
        v_corrected_dBeta = v[f'dBeta{l}'] / (1 - beta1**t)

        s_corrected_dW = s[f'dW{l}'] / (1 - beta2**t)
        s_corrected_db = s[f'db{l}'] / (1 - beta2**t)
        s_corrected_dGamma = s[f'dGamma{l}'] / (1 - beta2**t)
        s_corrected_dBeta = s[f'dBeta{l}'] / (1 - beta2**t)

        parameters[f'W{l}'] -= learning_rate * v_corrected_dW / (np.sqrt(s_corrected_dW) + epsilon)
        parameters[f'b{l}'] -= learning_rate * v_corrected_db / (np.sqrt(s_corrected_db) + epsilon)
        parameters[f'gamma{l}'] -= learning_rate * v_corrected_dGamma / (np.sqrt(s_corrected_dGamma) + epsilon)
        parameters[f'beta{l}'] -= learning_rate * v_corrected_dBeta / (np.sqrt(s_corrected_dBeta) + epsilon)

    return parameters

def nn_deep_model_adam(X_train, Y_train, X_test, Y_test, layer_dims, num_iterations=1000, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    activations = ["relu"] * (len(layer_dims) - 2) + ["softmax"]
    
    # Initialize parameters
    parameters = initialize_parameters_deep(layer_dims)

    # Initialize Adam variables
    v, s = initialize_adam(parameters)
    t = 0  # Initialize timestep for Adam

    # Train the neural network with Adam
    for i in range(num_iterations):
        AL, caches = forward_propagation_deep(X_train, parameters, activations)
        cost = compute_cost(AL, Y_train)
        grads = backward_propagation_deep(AL, Y_train, caches, activations)

        # Update parameters with Adam
        t += 1
        parameters = update_parameters_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon)

        if i % 1000 == 0:
            print(f'Cost after iteration {i}: {cost}')
            # print(f'Mean activation: {np.mean(AL)}, Std activation: {np.std(AL)}')
            # print(f'Mean dAL: {np.mean(grads["dAL"])}, Std dAL: {np.std(grads["dAL"])}')

    # Make predictions on the training set
    train_predictions = predict_deep(parameters, X_train, activations)

    # Make predictions on the test set
    test_predictions = predict_deep(parameters, X_test, activations)

    # Evaluate the model on the training set
    print("Training Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_train), np.squeeze(train_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_train), np.squeeze(train_predictions)))

    # Evaluate the model on the test set
    print("\nTest Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_test), np.squeeze(test_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_test), np.squeeze(test_predictions)))


In [52]:
np.random.seed(3)
nn_deep_model_adam(X_train_nn, y_train_nn, X_test_nn, y_test_nn, layer_dims, num_iterations=6000, learning_rate=0.009)

Cost after iteration 0: 1.5282857458436325
Cost after iteration 1000: 0.23998376961836484
Cost after iteration 2000: 0.22930793412668607
Cost after iteration 3000: 0.22162675885602443
Cost after iteration 4000: 0.21067306217481374
Cost after iteration 5000: 0.2070925053314733
Training Set Evaluation:
Confusion Matrix:
[[393   1   1   0]
 [ 10 382  17   0]
 [  2  17 351  38]
 [  0   0  47 341]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       395
           1       0.95      0.93      0.94       409
           2       0.84      0.86      0.85       408
           3       0.90      0.88      0.89       388

    accuracy                           0.92      1600
   macro avg       0.92      0.92      0.92      1600
weighted avg       0.92      0.92      0.92      1600


Test Set Evaluation:
Confusion Matrix:
[[92 13  0  0]
 [17 63 11  0]
 [ 0 37 42 13]
 [ 0  5 21 86]]

Classification Report:
              preci

Now we see an improvement in our training set performance which we weren't able to achieve before and there is a considerable reduction in the bias. Now we can shift our focus to the generalization of the model and the reduction of high variance (which is evident from the huge difference between the training set and test set performance)

### L2 Regularization

In [53]:
np.random.seed(3)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def batch_normalize(Z):
    mean = np.mean(Z, axis=1, keepdims=True)
    var = np.var(Z, axis=1, keepdims=True)
    Z_normalized = (Z - mean) / np.sqrt(var + 1e-8)
    return Z_normalized, mean, var

def initialize_parameters_deep(layer_dims):
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters[f'W{l}'] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters[f'b{l}'] = np.zeros((layer_dims[l], 1))
        parameters[f'gamma{l}'] = np.ones((layer_dims[l], 1))
        parameters[f'beta{l}'] = np.zeros((layer_dims[l], 1))

    return parameters

def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

def batch_normalize_forward(Z, gamma, beta):
    Z_normalized, mean, var = batch_normalize(Z)
    Z_tilde = gamma * Z_normalized + beta
    cache = (Z, Z_normalized, mean, var, gamma, beta)
    return Z_tilde, cache

def activation_forward(Z, activation):
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)

    cache = Z
    return A, cache

def forward_propagation_deep(X, parameters, activations):
    caches = []
    A = X
    L = len(parameters) // 4  # Considering gamma and beta for each layer

    for l in range(1, L):
        A_prev = A
        Z, linear_cache = linear_forward(A_prev, parameters[f'W{l}'], parameters[f'b{l}'])
        Z_tilde, batch_cache = batch_normalize_forward(Z, parameters[f'gamma{l}'], parameters[f'beta{l}'])
        A, activation_cache = activation_forward(Z_tilde, "relu")
        cache = (linear_cache, batch_cache, activation_cache)
        caches.append(cache)

    # Last layer (softmax activation)
    ZL, linear_cache = linear_forward(A, parameters[f'W{L}'], parameters[f'b{L}'])
    ZL_tilde, batch_cache = batch_normalize_forward(ZL, parameters[f'gamma{L}'], parameters[f'beta{L}'])
    AL, activation_cache = activation_forward(ZL_tilde, "softmax")
    cache = (linear_cache, batch_cache, activation_cache)
    caches.append(cache)

    return AL, caches

def compute_cost(AL, Y, parameters, lambd=0):
    m = Y.shape[1]
    epsilon = 1e-8
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()
    cross_entropy_cost = -1/m * np.sum(Y_one_hot * np.log(AL+epsilon))
    
    l2_regularization_cost = 0
    L = len(parameters) // 4  # Considering gamma and beta for each layer
    for l in range(1, L+1):
        W = parameters[f'W{l}']
        l2_regularization_cost += np.sum(W**2)

    l2_regularization_cost *= (lambd / (2 * m))
    cost = cross_entropy_cost + l2_regularization_cost

    return cost

def linear_backward(dZ, cache, lambd=0):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1/m * np.dot(dZ, A_prev.T) + (lambd / m) * W
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def backward_propagation_deep(AL, Y, caches, activations, lambd=0):
    grads = {}
    L = len(caches)

    # Convert Y to one-hot encoded matrix
    Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()

    # Compute gradient of the cost with respect to AL for softmax activation
    dAL = AL - Y_one_hot
    grads['dAL'] = dAL
    
    # Last layer (softmax activation)
    current_cache = caches[L-1]
    linear_cache, batch_cache, activation_cache = current_cache
    dZL_tilde = activation_backward(dAL, activation_cache, "softmax")
    dZL, dGammaL, dBetaL = batch_normalize_backward(dZL_tilde, batch_cache)
    dA_prev, dW, db = linear_backward(dZL, linear_cache, lambd)
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = dA_prev, dW, db
    grads[f'dGamma{L}'], grads[f'dBeta{L}'] = dGammaL, dBetaL

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        linear_cache, batch_cache, activation_cache = current_cache
        dZ_tilde = activation_backward(grads[f'dA{l+1}'], activation_cache, "relu")
        dZ, dGamma, dBeta = batch_normalize_backward(dZ_tilde, batch_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
        grads[f'dA{l}'], grads[f'dW{l+1}'], grads[f'db{l+1}'] = dA_prev, dW, db
        grads[f'dGamma{l+1}'], grads[f'dBeta{l+1}'] = dGamma, dBeta

    return grads

def initialize_adam(parameters):
    L = len(parameters) // 4  # Considering gamma and beta for each layer
    v = {}
    s = {}

    for l in range(1, L+1):
        v[f'dW{l}'] = np.zeros_like(parameters[f'W{l}'])
        v[f'db{l}'] = np.zeros_like(parameters[f'b{l}'])
        v[f'dGamma{l}'] = np.zeros_like(parameters[f'gamma{l}'])
        v[f'dBeta{l}'] = np.zeros_like(parameters[f'beta{l}'])

        s[f'dW{l}'] = np.zeros_like(parameters[f'W{l}'])
        s[f'db{l}'] = np.zeros_like(parameters[f'b{l}'])
        s[f'dGamma{l}'] = np.zeros_like(parameters[f'gamma{l}'])
        s[f'dBeta{l}'] = np.zeros_like(parameters[f'beta{l}'])

    return v, s

def update_parameters_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 4  # Considering gamma and beta for each layer

    for l in range(1, L+1):
        v[f'dW{l}'] = beta1 * v[f'dW{l}'] + (1 - beta1) * grads[f'dW{l}']
        v[f'db{l}'] = beta1 * v[f'db{l}'] + (1 - beta1) * grads[f'db{l}']
        v[f'dGamma{l}'] = beta1 * v[f'dGamma{l}'] + (1 - beta1) * grads[f'dGamma{l}']
        v[f'dBeta{l}'] = beta1 * v[f'dBeta{l}'] + (1 - beta1) * grads[f'dBeta{l}']

        s[f'dW{l}'] = beta2 * s[f'dW{l}'] + (1 - beta2) * (grads[f'dW{l}']**2)
        s[f'db{l}'] = beta2 * s[f'db{l}'] + (1 - beta2) * (grads[f'db{l}']**2)
        s[f'dGamma{l}'] = beta2 * s[f'dGamma{l}'] + (1 - beta2) * (grads[f'dGamma{l}']**2)
        s[f'dBeta{l}'] = beta2 * s[f'dBeta{l}'] + (1 - beta2) * (grads[f'dBeta{l}']**2)

        v_corrected_dW = v[f'dW{l}'] / (1 - beta1**t)
        v_corrected_db = v[f'db{l}'] / (1 - beta1**t)
        v_corrected_dGamma = v[f'dGamma{l}'] / (1 - beta1**t)
        v_corrected_dBeta = v[f'dBeta{l}'] / (1 - beta1**t)

        s_corrected_dW = s[f'dW{l}'] / (1 - beta2**t)
        s_corrected_db = s[f'db{l}'] / (1 - beta2**t)
        s_corrected_dGamma = s[f'dGamma{l}'] / (1 - beta2**t)
        s_corrected_dBeta = s[f'dBeta{l}'] / (1 - beta2**t)

        parameters[f'W{l}'] -= learning_rate * v_corrected_dW / (np.sqrt(s_corrected_dW) + epsilon)
        parameters[f'b{l}'] -= learning_rate * v_corrected_db / (np.sqrt(s_corrected_db) + epsilon)
        parameters[f'gamma{l}'] -= learning_rate * v_corrected_dGamma / (np.sqrt(s_corrected_dGamma) + epsilon)
        parameters[f'beta{l}'] -= learning_rate * v_corrected_dBeta / (np.sqrt(s_corrected_dBeta) + epsilon)

    return parameters

def nn_deep_model_adam_with_regularization(X_train, Y_train, X_test, Y_test, layer_dims, num_iterations=1000, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8, lambd=0):
    activations = ["relu"] * (len(layer_dims) - 2) + ["softmax"]
    
    # Initialize parameters
    parameters = initialize_parameters_deep(layer_dims)

    # Initialize Adam variables
    v, s = initialize_adam(parameters)
    t = 0  # Initialize timestep for Adam

    # Train the neural network with Adam and L2 regularization
    for i in range(num_iterations):
        AL, caches = forward_propagation_deep(X_train, parameters, activations)
        cost = compute_cost(AL, Y_train, parameters, lambd)
        grads = backward_propagation_deep(AL, Y_train, caches, activations, lambd)

        # Update parameters with Adam
        t += 1
        parameters = update_parameters_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon)

        if i % 1000 == 0:
            print(f'Cost after iteration {i}: {cost}')
            # print(f'Mean activation: {np.mean(AL)}, Std activation: {np.std(AL)}')
            # print(f'Mean dAL: {np.mean(grads["dAL"])}, Std dAL: {np.std(grads["dAL"])}')

    # Make predictions on the training set
    train_predictions = predict_deep(parameters, X_train, activations)

    # Make predictions on the test set
    test_predictions = predict_deep(parameters, X_test, activations)

    # Evaluate the model on the training set
    print("Training Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_train), np.squeeze(train_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_train), np.squeeze(train_predictions)))

    # Evaluate the model on the test set
    print("\nTest Set Evaluation:")
    print("Confusion Matrix:")
    print(confusion_matrix(np.squeeze(Y_test), np.squeeze(test_predictions)))

    print("\nClassification Report:")
    print(classification_report(np.squeeze(Y_test), np.squeeze(test_predictions)))


In [56]:
np.random.seed(3)
layer_dims = [X_train_nn.shape[0], 10, 8, 6, 4]
nn_deep_model_adam_with_regularization(X_train_nn, y_train_nn, X_test_nn, y_test_nn, layer_dims, num_iterations=8000, learning_rate=0.009, lambd=0.09)

Cost after iteration 0: 1.5299763900664638
Cost after iteration 1000: 0.24564571729442108
Cost after iteration 2000: 0.22642048192681302
Cost after iteration 3000: 0.2234284617550347
Cost after iteration 4000: 0.2218453580607072
Cost after iteration 5000: 0.2174381172857525
Cost after iteration 6000: 0.2118476187689906
Cost after iteration 7000: 0.21463720202722342
Training Set Evaluation:
Confusion Matrix:
[[385  10   0   0]
 [  7 382  19   1]
 [  1  14 354  39]
 [  0   1  43 344]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       395
           1       0.94      0.93      0.94       409
           2       0.85      0.87      0.86       408
           3       0.90      0.89      0.89       388

    accuracy                           0.92      1600
   macro avg       0.92      0.92      0.92      1600
weighted avg       0.92      0.92      0.92      1600


Test Set Evaluation:
Confusion Matrix:
[[91 14  0  0

lambda = 0.01 : test accuracy = 71%
lambda = 0.05 : test accuracy = 74%
