In [1]:
import numpy as np
from Neural_Networks import Logistic_NN, NN_2_layer

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# Load the train and test CSV files (paths are relative to the notebook's working directory).
mobile_train = pd.read_csv('mobile_train.csv')
mobile_test = pd.read_csv('mobile_test.csv')
# Preview the first rows of the training frame as the cell output.
mobile_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# Candidate feature frame for the VIF screen: every column except the target 'price_range'.
mobile_train_vif = mobile_train.drop(['price_range'], axis=1)

def calculate_vif(data_frame):
    """
    Build a table of variance inflation factors, one row per column of ``data_frame``.

    Input:
    data_frame - DataFrame whose columns are the candidate features.

    Output:
    DataFrame with columns "Feature" and "VIF", sorted by VIF in descending order.
    """
    design = data_frame.values
    vif_scores = []
    for col_idx in range(data_frame.shape[1]):
        # VIF of column col_idx measured against all the other columns
        vif_scores.append(variance_inflation_factor(design, col_idx))

    table = pd.DataFrame()
    table["Feature"] = data_frame.columns
    table["VIF"] = vif_scores
    return table.sort_values(by='VIF', ascending=False)
    
def drop_high_vif_features(data_frame, threshold=5):
    """
    Repeatedly drop the feature with the largest VIF until no VIF exceeds ``threshold``.

    Inputs:
    data_frame - DataFrame of candidate features (not modified; a pruned copy is returned).
    threshold - largest VIF that is tolerated; defaults to 5.

    Output:
    The DataFrame restricted to the surviving columns. Each removal is reported with print().

    The loop stops as soon as fewer than two columns remain: a VIF compares one
    feature against the others, so there is nothing left to compare, and an empty
    frame would make idxmax() fail.
    """
    while data_frame.shape[1] > 1:
        vif_results = calculate_vif(data_frame)
        # locate the worst offender once and reuse the row label for both lookups
        worst_idx = vif_results['VIF'].idxmax()
        max_vif_feature = vif_results.loc[worst_idx, 'Feature']
        max_vif_value = vif_results.loc[worst_idx, 'VIF']

        if max_vif_value > threshold:
            print(f"Dropping feature '{max_vif_feature}' with VIF {max_vif_value}")
            data_frame = data_frame.drop(columns=max_vif_feature)
        else:
            break
    return data_frame
# Replace the candidate frame with its VIF-pruned version (default threshold of 5);
# the dropped features are listed in the cell output.
mobile_train_vif = drop_high_vif_features(mobile_train_vif)

Dropping feature 'mobile_wt' with VIF 12.972548425819065
Dropping feature 'px_width' with VIF 11.470014131904488
Dropping feature 'sc_h' with VIF 11.086593845458365
Dropping feature 'battery_power' with VIF 7.543843177190293
Dropping feature 'pc' with VIF 6.050059878559392
Dropping feature 'three_g' with VIF 5.930418164840767


In [5]:
# Features are the VIF-pruned columns; the target is the 'price_range' column.
X_vif = mobile_train_vif
y_vif = mobile_train['price_range']
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
X_train_vif, X_test_vif, y_train_vif, y_test_vif = train_test_split(X_vif, y_vif, test_size=0.2, random_state=42)
# Standardize the inputs: the scaler is fitted on the training split only and the
# same fitted statistics are then applied to the held-out split.
scaler = StandardScaler()
X_train_vif_scaled = scaler.fit_transform(X_train_vif)
X_test_vif_scaled = scaler.transform(X_test_vif)

In [6]:
# Transpose the inputs to shape (n, m), where n is the number of features and m the
# number of examples, and turn the labels into row vectors of shape (1, m).
# reshape(1, -1) lets NumPy infer m, so this cell keeps working when the dataset
# size or the train/test ratio changes.
X_train_nn, X_test_nn = X_train_vif_scaled.T, X_test_vif_scaled.T
y_train_nn, y_test_nn = y_train_vif.to_numpy().reshape(1, -1), y_test_vif.to_numpy().reshape(1, -1)

In [7]:
# Logistic_NN comes from the local Neural_Networks module.
# NOTE(review): the positional arguments are presumably (num_classes, learning_rate, num_iters),
# by analogy with the keywords passed to NN_2_layer below - confirm against the module.
model1 = Logistic_NN(4, 0.01, 1000)

In [8]:
# Train on the (features, examples) inputs and (1, examples) labels. The cell output shows
# the logged cost and the returned tuple of two arrays (presumably weights and biases - confirm).
model1.fit(X_train_nn,y_train_nn)

cost after 0 iterations: 1.3927288487196372
cost after 200 iterations: 1.1071201404733622
cost after 400 iterations: 0.9818802462425121
cost after 600 iterations: 0.9105988122116826
cost after 800 iterations: 0.8622213190840751


(array([[-0.03778215,  0.01316373, -0.02107592,  0.02475468],
        [ 0.04654256, -0.04050454,  0.0413891 , -0.01604318],
        [-0.00195   ,  0.02253812, -0.04105334,  0.01153104],
        [-0.04915216,  0.04581861,  0.02736361, -0.0140979 ],
        [-0.01800262,  0.01562074, -0.04441215,  0.04191897],
        [-0.02983307,  0.0108382 , -0.09916502,  0.10562712],
        [-0.04985427,  0.06953424, -0.05319474,  0.01093517],
        [ 0.01748953, -0.08883164,  0.0560741 ,  0.01203026],
        [-0.3062876 ,  0.01495908,  0.04980938,  0.2567906 ],
        [-1.3917064 , -0.48041048,  0.50123622,  1.37353398],
        [-0.01277522, -0.0155625 ,  0.00651936,  0.04153273],
        [-0.05317597,  0.06113959, -0.04668044,  0.02492847],
        [ 0.0263453 ,  0.02799084, -0.0491576 , -0.01046376],
        [-0.03136397,  0.01893887, -0.02703963, -0.0115669 ]]),
 array([[-0.19440471],
        [ 0.20194154],
        [ 0.19731197],
        [-0.2048488 ]]))

In [9]:
# Predict on the held-out split, then score the predictions against the true labels;
# the accuracy is displayed as the cell output.
predictions = model1.predict(X_test_nn)
model1.accuracy(predictions, y_test_nn)

0.72

In [11]:
# Seed NumPy's global RNG before building the model.
# NOTE(review): presumably this makes NN_2_layer's random initialisation repeatable -
# confirm where the class actually draws its weights (constructor vs. fit).
np.random.seed(12)
# Two-layer network with 4 hidden units and 4 output classes.
trial = NN_2_layer(num_classes=4, num_hidden=4, learning_rate = 0.009, num_iters = 10000)

In [12]:
# Train the two-layer network on the same arrays as model1; the cost log is the cell output.
trial.fit(X_train_nn, y_train_nn)

cost after 0 iterations: 1.9452863185979419
cost after 1000 iterations: 1.0245489233062972
cost after 2000 iterations: 0.7720318080265752
cost after 3000 iterations: 0.646927920893106
cost after 4000 iterations: 0.5725058776055323
cost after 5000 iterations: 0.5292718885597846
cost after 6000 iterations: 0.5031585969257354
cost after 7000 iterations: 0.48599274300321726
cost after 8000 iterations: 0.473966609566506
cost after 9000 iterations: 0.4648335181707388


In [13]:
# Unlike model1.predict, this predict call also receives the labels; the cell output
# shows that it prints the accuracy.
pred = trial.predict(X_test_nn, y_test_nn)

Accuracy: 0.7975


In [14]:
class neural_network():

    def __init__(
        layer_dims, 
        num_iterations=1000, 
        learning_rate=0.01, 
        beta1=0.9, 
        beta2=0.999, 
        epsilon=1e-8
    ):
        self.layer_dims = layer_dims
        self.num_iters = num_iterations
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.parameters = {}
        self.grads = {}

    def tanh(self, x):
        """Hyperbolic-tangent activation, applied element-wise through np.tanh."""
        activated = np.tanh(x)
        return activated

    def tanh_derivative(self, x):
        """Element-wise derivative of tanh evaluated at x, i.e. 1 - tanh(x)**2."""
        squashed = np.tanh(x)
        return 1 - squashed**2
    
    def relu(self, x):
        """Rectified linear unit: element-wise maximum of 0 and x."""
        rectified = np.maximum(0, x)
        return rectified
    
    def relu_derivative(self, x):
        """Element-wise ReLU slope: 1 where x is strictly positive, 0 everywhere else (including x == 0)."""
        is_positive = x > 0
        return np.where(is_positive, 1, 0)
    
    def softmax(self, z):
        """
        Column-wise softmax: every column of z (axis 0) is mapped to values that sum to 1.
        The column maximum is subtracted before exponentiating for numerical stability.
        """
        column_peak = np.max(z, axis=0, keepdims=True)
        shifted_exp = np.exp(z - column_peak)
        column_total = np.sum(shifted_exp, axis=0, keepdims=True)
        return shifted_exp / column_total

    def batch_normalize(self, Z):
        """
        Normalize each row of Z to zero mean and (approximately) unit variance.

        Input:
        Z - matrix of one layer with dimensions (n_l, m): n_l units in layer l, m examples

        Output:
        (Z_normalized, mean, var) - Z_normalized has the shape of Z; mean and var have
        shape (n_l, 1) and are taken across the m examples (axis 1). 1e-8 is added to the
        variance before the square root so a constant row does not divide by zero.
        """
        mu = np.mean(Z, axis=1, keepdims=True)
        sigma_sq = np.var(Z, axis=1, keepdims=True)
        centered = Z - mu
        spread = np.sqrt(sigma_sq + 1e-8)
        return centered / spread, mu, sigma_sq

    def linear_forward(self, A, W, b):
        """
        Affine step of one layer: Z = W . A + b.

        Inputs:
        A - activations of the previous layer, (units in layer l-1, number of examples)
        W - weights of the current layer, (units in layer l, units in layer l-1)
        b - bias of the current layer, (units in layer l, 1), broadcast over the examples

        Outputs:
        Z and the tuple (A, W, b), which linear_backward unpacks during back propagation.
        """
        pre_activation = np.dot(W, A) + b
        return pre_activation, (A, W, b)

    def batch_normalize_forward(self, Z, gamma, beta):
        """
        Helper function to calculate the Z tilde to be fed to the activation function before feeding to the next layer

        Inputs:
        Z - The Z matrix computed from linear_forward function
        gamma - the parameter gamma associated with normalization. Dimension - (number of units in current layer, 1)
        beta - the parameter associated with normalization. Dimension - (number of units in current layer, 1)

        Outputs:
        Z_tilde - the scaled version of Z_normalized obtained from batch_normalize
        """
        Z_normalized, mean, var = self.batch_normalize(Z)
        Z_tilde = gamma * Z_normalized + beta # this is element wise multiplication 
        cache = (Z, Z_normalized, mean, var, gamma, beta)
        return Z_tilde, cache

    def activation_forward(self, Z, activation):
        """
        Helper function to compute the activation of Z_tilde obtained from previous function.
        Applied the activation based on the choice.
        """
        if activation == "relu":
            A = self.relu(Z)
        elif activation =="tanh":
            A = self.tanh(Z)
        elif activation == "softmax":
            A = self.softmax(Z)
    
        cache = Z
        return A, cache

    def initialize_parameters_deep(self):
        """
        Initializing the weights using xavier's initialization(for ReLU activation) 
        for all the layers according the layer dimension info given by layer_dims

        Dimensions of parameters used in forward propogation:
        Weights - (number of units in current layer, number of units in prev layer)
        b - (number of units in current layer, 1)

        Dimensions of parameters used in batch normalization:
        gamma - (number of units in current layer, 1)
        beta - (number of units in current layer, 1)
        """
        L = len(self.layer_dims)
    
        for l in range(1, L):
            # following the logic that the dimension of the weights of any layer will be 
            # (number of units in current layer, number of units in prev layer)
            self.parameters[f'W{l}'] = np.random.randn(self.layer_dims[l], self.layer_dims[l-1]) * np.sqrt(2 / self.layer_dims[l-1])

            # b, gamma and beta parameters all will will have a dimension that depends on the number of units in the current layer
            # given by (number of units in current layer, 1)
            self.parameters[f'b{l}'] = np.zeros((self.layer_dims[l], 1))
            self.parameters[f'gamma{l}'] = np.ones((self.layer_dims[l], 1))
            self.parameters[f'beta{l}'] = np.zeros((self.layer_dims[l], 1))
    
        return

    def forward_propagation_deep(self, X, activations):
        """

        Inputs:
        Activations - this is a list of size (number_layers - 1) L which basically tells about the activation to use for each layer
        for eg. ['relu', 'relu', 'softmax'] means that the first 2 hidden layers will have relu activation and the
        output layer will have softmax activation
        """
        caches = []
        A = X
        L = len(self.parameters) // 4  # There are 4 parameters for each layer W,b,gamma,beta
    
        for l in range(1, L):
            # A_prev gets updated in each iteration as A keeps getting updated in each iteration
            A_prev = A
            
            # the linear_forward function returns Z, (A_prev, W[l], b[l]) stored in cache
            Z, linear_cache = self.linear_forward(A_prev, self.parameters[f'W{l}'], self.parameters[f'b{l}'])
            
            # the batch_normalize_forward returns Z_tilde, (Z, Z_normalized, mean, var, gamma, beta) stored in cache
            Z_tilde, batch_cache = self.batch_normalize_forward(Z, self.parameters[f'gamma{l}'], self.parameters[f'beta{l}'])
            
            # the activation_forward returns A, (Z) stored in cache
            A, activation_cache = self.activation_forward(Z_tilde, "relu")

            # storing all types of cache to one variable
            cache = (linear_cache, batch_cache, activation_cache)

            # appending that to the caches list. caches will contain the 3 types of cache for all the layers
            # the ith item in caches list will have all the cache pertaining to the ith layer in the network
            caches.append(cache)
    
        # Last layer (softmax activation)
        ZL, linear_cache = self.linear_forward(A, self.parameters[f'W{L}'], self.parameters[f'b{L}'])
        ZL_tilde, batch_cache = self.batch_normalize_forward(ZL, self.parameters[f'gamma{L}'], self.parameters[f'beta{L}'])
        AL, activation_cache = self.activation_forward(ZL_tilde, "softmax")
        cache = (linear_cache, batch_cache, activation_cache)
        caches.append(cache)
    
        return AL, caches

    def compute_cost(self, AL, Y):
        """
        Helper function to compute the cost function that compares the activations of the final layer and the target labels.
        Assumes that the target labels Y has a dimension (m,) where m denotes the number of examples
        """
        Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()
        cost = -1/self.m * np.sum(Y_one_hot * np.log(AL+self.epsilon))
        return cost

    def activation_backward(self, dA, cache, activation):
        """
        Helper function to calculate the gradients of Z_tilde because we get activations of any layer from Z_tilde
        So in backward propogation we get dZ_tilde from dA

        Inputs:
        dA - derivative of the cost function with respect to the activation of that layer 
        so for the last layer L it'll be dcost/dAL

        Output:
        dZ - which is actually dZ_tilde. we get the derivative of the cost function with respect to 
        Z_tilde of that layer. so for the last layer L it'll be dcost/dZL_tilde = (dcost/dAL)*(dAL/dZL_tilde) by chain rule
        """
        Z = cache
        if activation == "relu":
            dZ = dA * self.relu_derivative(Z)
        elif activation == "tanh":
            dZ = dA * self.tanh_derivative(Z)
        elif activation == "softmax":
            dZ = dA
        return dZ

    def batch_normalize_backward(self, dZ_tilde, cache):
        """
        Helper function to compute the gradients of Z, gamma, beta because in forward propogation we get Z_tilde from Z, gamma, beta
        so in backward propogation we get dZ, dGamma, dBeta from dZ_tilde

        Inputs:
        dZ_tilde - derivative of the cost function with respect to Z_tilde of that layer. 
        so for the last layer it will be dcost/dZL_tilde

        Output:
        dZ - derivative of the cost function with respect to Z of that layer.
        so for last layer it will be dcost/dZL = (dcost/dZL_tilde)*(dZL_tilde/dZL)
        """
        Z, Z_normalized, mean, var, gamma, beta = cache
    
        dZ_normalized = dZ_tilde * gamma # dZL_tilde/dZL = gamma 
        dVar = np.sum(dZ_normalized * (Z - mean), axis=1, keepdims=True) * -0.5 * (var + 1e-8)**(-1.5)
        dMean = np.sum(dZ_normalized, axis=1, keepdims=True) * -1 / np.sqrt(var + 1e-8)
        
        dZ = (dZ_normalized / np.sqrt(var + 1e-8)) + (dVar * 2 * (Z - mean) / self.m) + (dMean / self.m)
        dGamma = np.sum(dZ_tilde * Z_normalized, axis=1, keepdims=True)
        dBeta = np.sum(dZ_tilde, axis=1, keepdims=True)
    
        return dZ, dGamma, dBeta

    def linear_backward(self, dZ, cache):
        """
        Helper function to compute the gradients of A_prev, W, b because in forward propogation we get Z from A_prev, W, b
        so in backward propogation we get dA_prev, dW, db from dZ

        Inputs:
        dZ - derivative of the cost function with respect to Z of that layer. 
        so for the last layer it will be dcost/dZL

        Output:
        dA_prev, dW, db - derivative of the cost function with respect to W, b of that layer and A of previous layer.
        so for last layer it will be dcost/dWL = (dcost/dZL)*(dZL/dWL) and so on for all 3 of them
        """
        A_prev, W, b = cache
        dW = 1/self.m * np.dot(dZ, A_prev.T)
        db = 1/self.m * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)
    
        return dA_prev, dW, db

    def backward_propagation_deep(self, AL, Y, caches, activations):
        L = len(caches)
    
        # Convert Y to one-hot encoded matrix
        Y_one_hot = np.eye(AL.shape[0])[Y.astype(int)].T.squeeze()
    
        # Compute gradient of the cost with respect to AL for softmax activation
        dAL = AL - Y_one_hot
        self.grads['dAL'] = dAL
        
        # Last layer (softmax activation)
        current_cache = caches[L-1]
        linear_cache, batch_cache, activation_cache = current_cache
        dZL_tilde = self.activation_backward(dAL, activation_cache, "softmax")
        dZL, dGammaL, dBetaL = self.batch_normalize_backward(dZL_tilde, batch_cache)
        dA_prev, dW, db = self.linear_backward(dZL, linear_cache)
        self.grads[f'dA{L-1}'], self.grads[f'dW{L}'], self.grads[f'db{L}'] = dA_prev, dW, db
        self.grads[f'dGamma{L}'], self.grads[f'dBeta{L}'] = dGammaL, dBetaL
    
        # Loop from l=L-2 to l=0
        for l in reversed(range(L-1)):
            current_cache = caches[l]
            linear_cache, batch_cache, activation_cache = current_cache
            dZ_tilde = self.activation_backward(grads[f'dA{l+1}'], activation_cache, "relu")
            dZ, dGamma, dBeta = self.batch_normalize_backward(dZ_tilde, batch_cache)
            dA_prev, dW, db = self.linear_backward(dZ, linear_cache)
            self.grads[f'dA{l}'], self.grads[f'dW{l+1}'], self.grads[f'db{l+1}'] = dA_prev, dW, db
            self.grads[f'dGamma{l+1}'], self.grads[f'dBeta{l+1}'] = dGamma, dBeta
    
        return 