# Training a Deep Neural Network
The following code implements a deep neural network of 2 hidden layers with backpropagation using low-level libraries and compares it with a model generated by Scikit-learn.

## 1 Data Loading & Cleaning
The data set contains credit card debt information about 10,000 customers and whether they defaulted or not.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading the data
df = pd.read_csv('Default.csv')
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [3]:
# Scaling and converting to NumPy arrays
df['default']=df['default'].apply(lambda x: 0 if x=='No' else 1)
df['student']=df['student'].apply(lambda x: 0 if x=='No' else 1)

In [4]:
df.head()

Unnamed: 0,default,student,balance,income
0,0,0,729.526495,44361.625074
1,0,1,817.180407,12106.1347
2,0,0,1073.549164,31767.138947
3,0,0,529.250605,35704.493935
4,0,0,785.655883,38463.495879


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  int64  
 1   student  10000 non-null  int64  
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 312.6 KB


In [6]:
scaler = StandardScaler()
df[['balance','income']] = scaler.fit_transform(df[['balance','income']])
df

Unnamed: 0,default,student,balance,income
0,0,0,-0.218835,0.813187
1,0,1,-0.037616,-1.605496
2,0,0,0.492410,-0.131212
3,0,0,-0.632893,0.164031
4,0,0,-0.102791,0.370915
...,...,...,...,...
9995,0,0,-0.255990,1.460366
9996,0,0,-0.160044,-1.039014
9997,0,0,0.020751,1.883565
9998,0,0,1.516742,0.236363


In [7]:
Y = df['default'].to_numpy().reshape(-1,1)
X = df.drop(columns=['default']).to_numpy()

In [8]:
print("Shape of Y:",Y.shape)
print("Shape of X:",X.shape)

Shape of Y: (10000, 1)
Shape of X: (10000, 3)


In [9]:
X = X.T
Y = Y.T

print("Shape of Y:",Y.shape)
print("Shape of X:",X.shape)

Shape of Y: (1, 10000)
Shape of X: (3, 10000)


## 2 Training a Deep Neural Network Using Scikit-learn
The following code trains a deep neural network of 2 hidden layers with 4 neurons in each hidden layer using scikit-learn.

In [10]:
from sklearn.neural_network import MLPClassifier

In [11]:
m = Y.shape[1]
mlp = MLPClassifier(hidden_layer_sizes=(4,4),activation='tanh',solver='sgd',alpha=0,learning_rate_init=0.01,max_iter=2000,batch_size=m,shuffle=False,momentum=0,verbose=True)
mlp.fit(X.T,Y.T)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 1.12991523
Iteration 2, loss = 1.10932788
Iteration 3, loss = 1.08910561
Iteration 4, loss = 1.06924971
Iteration 5, loss = 1.04976094
Iteration 6, loss = 1.03063957
Iteration 7, loss = 1.01188540
Iteration 8, loss = 0.99349777
Iteration 9, loss = 0.97547556
Iteration 10, loss = 0.95781724
Iteration 11, loss = 0.94052089
Iteration 12, loss = 0.92358417
Iteration 13, loss = 0.90700442
Iteration 14, loss = 0.89077864
Iteration 15, loss = 0.87490349
Iteration 16, loss = 0.85937539
Iteration 17, loss = 0.84419044
Iteration 18, loss = 0.82934454
Iteration 19, loss = 0.81483334
Iteration 20, loss = 0.80065232
Iteration 21, loss = 0.78679674
Iteration 22, loss = 0.77326173
Iteration 23, loss = 0.76004227
Iteration 24, loss = 0.74713322
Iteration 25, loss = 0.73452931
Iteration 26, loss = 0.72222521
Iteration 27, loss = 0.71021550
Iteration 28, loss = 0.69849470
Iteration 29, loss = 0.68705728
Iteration 30, loss = 0.67589768
Iteration 31, loss = 0.66501030
Iteration 32, los

In [12]:
print("\nWeights and biases")
print("W1:",mlp.coefs_[0].T)
print("b1:",mlp.intercepts_[0].reshape(-1,1))
print("W2:",mlp.coefs_[1].T)
print("b2:",mlp.intercepts_[1].reshape(-1,1))
print("W3:",mlp.coefs_[2].T)
print("b3:",mlp.intercepts_[2].reshape(-1,1))


Weights and biases
W1: [[-0.71463019 -0.57655304 -0.06590277]
 [ 0.02376012  0.08793267 -0.022375  ]
 [-0.24629548 -0.09607573 -0.1657701 ]
 [-0.46148914  0.18801803  0.19881984]]
b1: [[-1.13916944]
 [-0.23047559]
 [ 0.4056156 ]
 [-1.17710392]]
W2: [[-0.45038575 -0.18968503 -0.01368623 -0.30938031]
 [-0.59923418 -0.93279314 -0.14422182 -0.93799234]
 [-0.55140271 -0.14558229  0.07554677  0.3972282 ]
 [-0.47213049  0.65578757  0.0098236  -1.21240234]]
b2: [[ 0.92865696]
 [-0.10587238]
 [-0.26617069]
 [-0.17628227]]
W3: [[-1.12575363 -1.14635024 -0.1397987  -1.24219472]]
b3: [[0.45849985]]


## 3 Training a Deep Neural Network Using Backpropagation
The following code implements backpropagation to train a deep neural network of 2 hidden layers with 4 neurons in each hidden layer.

In [13]:
# Initialising the paramaters of the neural network
W = [np.random.rand(4,3),np.random.rand(4,4),np.random.rand(1,4)]
b = [np.zeros((4,1)),np.zeros((4,1)),np.zeros((1,1))]

In [14]:
# Updating parameters using gradient descent
iter = 402
lr = 0.01
loss = np.arange(10,20)
L = 3

for i in np.arange(iter):
    # Forward propagation
    Z = []
    A = []
    for l in np.arange(L):
        Z.append(W[l]@A[l-1]+b[l] if l> 0 else W[l]@X+b[l])
        A.append(1/(1+np.exp(-Z[l])) if l==L-1 else np.tanh(Z[l]))

    # Back propagation
    dZ = [0]*L
    dA = [0]*L
    dW = [0]*L
    db = [0]*L
    
    for l in np.arange(L-1,-1,-1):
        dZ[l] = A[l]-Y if l==L-1 else dA[l]*(1-np.tanh(Z[l])**2)
        dA[l-1] = W[l].T@dZ[l] if l> 0 else 0
        dW[l] = 1/m*dZ[l]@A[l-1].T
        db[l] = 1/m*np.sum(dZ[l],axis=1,keepdims=True)
        W[l] -= lr*dW[l]
        b[l] -= lr*db[l]

    current_loss = -1/m*(Y@np.log(A[L-1]).T+(1-Y)@np.log(1-A[L-1]).T)
    loss = np.append(loss,current_loss)
    loss = np.delete(loss,0)

print("Last iteration:",i+1)
print("Losses:",loss)

print("\nWeights and biases")
print("W1:",W[0])
print("b1:",b[0])
print("W2:",W[1])
print("b2:",b[1])
print("W3:",W[2])
print("b3:",b[2])

Last iteration: 402
Losses: [0.18791801 0.187687   0.18745753 0.18722959 0.18700315 0.18677822
 0.18655477 0.1863328  0.18611229 0.18589323]

Weights and biases
W1: [[ 0.18350402  0.06106043  0.16089564]
 [ 0.14008441  0.00853907  0.90328437]
 [-0.07998509 -0.0903867   0.07160842]
 [-0.01942756  0.36761766  0.66993764]]
b1: [[-0.42209151]
 [-0.15729594]
 [-0.525868  ]
 [-0.25821328]]
W2: [[ 0.60674666  0.48584253  0.8893756   0.82787761]
 [ 0.03799913  0.0919417   0.72888361 -0.03253264]
 [ 0.6760222  -0.04103128  0.19930486  0.08545966]
 [ 0.53991338  0.61439598  0.99447289  0.19597078]]
b2: [[-0.34006464]
 [-0.31962694]
 [-0.37682823]
 [ 0.00467849]]
W3: [[0.81152421 0.57336916 0.6764238  0.04450193]]
b3: [[-0.84748953]]


## 4 Conclusion
Similar values of loss from 2 & 3 for the same number of iterations indicates that the custom gradient descent implementation is correct. The weights and biases are different because the 2 models are randomly initialised during training and the loss function of the shallow neural network has multiple maximia and minima.