# Logistic Regression with Gradient Descent, Python Implementation

Complete the code where indicated to implement logistic regression and gradient descent. Recall that the cost function in logistic regression is

$$ J(W) = \frac{1}{n} \left[ -Y \log\left(h\left( X \right) \right) - \left( 1 - Y\right) \log \left( 1 - h\left( X \right) \right) \right]$$

and the gradient of the cost is a vector of the same length as $W$ defined as follows:

$$ \frac{\partial J(W)}{\partial W} =\nabla_W = \frac{1}{n} \left( h \left( X \right) - Y \right) X $$

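Note that while this gradient looks identical to the linear regression gradient, the formula is actually different because linear and logistic regression have different definitions of the hypothesis function $h(X)$: linear regression uses $h(X) = XW$, whereas logistic regression uses $h(X) = \sigma(XW)$ with $\sigma(z) = \frac{1}{1 + e^{-z}}$.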


In [1]:
import numpy as np
import pandas as pd

# Load the Titanic data, keeping the label ('Survived') plus the five features
# used by the model. Rows with a missing value in any of them are dropped.
df = pd.read_csv('titanic.csv')
data = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].dropna()

# Encode the categorical "Sex" column numerically: male -> 1, female -> 0.
data["Sex"] = data["Sex"].map({"male": 1, "female": 0})

# Convert to a float array explicitly. Without the dtype the array would be of
# object dtype (the "Sex" column started out as strings), which makes every
# later NumPy operation fall back to slow per-element Python arithmetic.
data = np.array(data, dtype=float)
X, Y = data[:, 1:], data[:, 0]

# Scale every column of X into [0, 1].
# NOTE: the formula is (max - x) / (max - min), i.e. the mirror image of the
# usual min-max scaling (the column maximum maps to 0 and the minimum to 1).
# It is kept as-is because the learned weights depend on it.
col_max = X.max(axis=0)
col_min = X.min(axis=0)
X[:] = (col_max - X) / (col_max - col_min)

# break into train/test, with 80% training and 20% test (no shuffling: the
# first 80% of the rows are used for training)
split = int(0.8 * data.shape[0])

X_train, X_test = X[:split], X[split:]
Y_train, Y_test = Y[:split], Y[split:]

# Add intercept term (a leading column of ones) to X_train and X_test
X_train = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1)
X_test = np.concatenate([np.ones((X_test.shape[0], 1)), X_test], axis=1)

# ================================================================

In [2]:
def sigmoid(z):
    """
    Compute the sigmoid (logistic) function 1 / (1 + exp(-z)) element-wise.

    The implementation never evaluates exp() on a positive argument, so it
    does not overflow (and does not emit overflow warnings) for inputs of
    large magnitude.

    Parameters
    ----------
    z : array_like
        Scalar, vector or matrix. It is converted to a float array.

    Returns
    -------
    g : float or ndarray
        The sigmoid of each value of z. g has the same shape as z; a scalar
        input yields a scalar result.
    """
    z = np.asarray(z, dtype=float)

    # exp(-|z|) lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z))

    # For z >= 0: 1 / (1 + exp(-z)).
    # For z <  0: exp(z) / (1 + exp(z)), the same value written without exp(-z).
    g = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with an empty tuple turns a 0-d result into a scalar and
    # returns arrays of higher rank unchanged.
    return g[()]

In [3]:
def logreg_costFunction(W, X, Y):
    """
    Compute the logistic regression cost (mean cross-entropy).

    Parameters
    ----------
    W : m parameters for logistic regression.

    X : The input dataset of shape (n,m) where n is the total number
        of data points and m is the number of features. We assume the
        intercept has already been added to the input.

    Y : Vector of labels for the input with n elements.

    Returns
    -------
    J : float
        The computed value for the cost function,
        J = 1/n * sum(-Y*log(h) - (1-Y)*log(1-h)) with h = sigmoid(X W).
    """
    # Convert explicitly so that object-dtype inputs are handled as floats.
    W = np.asarray(W, dtype=float)
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    n = Y.size  # number of training examples

    z = np.dot(X, W)

    # With h = 1 / (1 + exp(-z)):
    #   -log(h)     = log(1 + exp(-z)) = logaddexp(0, -z)
    #   -log(1 - h) = log(1 + exp(z))  = logaddexp(0,  z)
    # Working with logaddexp avoids log(0) when the sigmoid saturates to
    # exactly 0 or 1, which would otherwise turn the cost into inf/nan.
    per_example = Y * np.logaddexp(0, -z) + (1 - Y) * np.logaddexp(0, z)

    return per_example.sum() / n

In [4]:
def logreg_GradFunction(W, X, Y):
    """
    Compute gradient for logistic regression.

    Parameters
    ----------
    W : m parameters for logistic regression.

    X : The input dataset of shape (n,m) where n is the total number
        of data points and m is the number of features. We assume the
        intercept has already been added to the input.

    Y : Vector of labels for the input with n elements.

    Returns
    -------
    grad : A vector with m values which is the gradient of the cost
        function with respect to W, at the current values of W:
        grad = 1/n * X^T (sigmoid(X W) - Y).
    """
    # Convert explicitly so that object-dtype inputs are handled as floats.
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    n = Y.size  # number of training examples

    # Prediction error for every example at once: h(X) - Y.
    residual = sigmoid(np.dot(X, W)) - Y

    # X^T . residual sums residual_i * X[i, j] over all examples i for each
    # feature j, which is what an explicit double loop over (i, j) computes.
    grad = np.dot(X.T, residual) / n
    return grad

In [5]:
def logreg_gradient_descent(X, Y, W_in, cost_function, gradient_function, alpha, num_iters):
    """
    Performs batch gradient descent to learn W. Updates W by taking
    num_iters gradient steps with learning rate alpha.

    Args:
      X                   : Data, n examples with m features
      Y                   : n target values
      W_in                : m initial model parameters
      cost_function       : function to compute cost, called as
                            cost_function(W, X, Y)
      gradient_function   : function to compute the gradient, called as
                            gradient_function(W, X, Y)
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent

    Returns:
      J                : Cost evaluated at the final parameters
      W                : Updated values of parameters
    """
    W = W_in

    for _ in range(num_iters):
        # Rebinding (rather than `W -= ...`) leaves the caller's W_in untouched.
        W = W - alpha * gradient_function(W, X, Y)

    # Only the final cost is returned, so evaluate it once after the loop.
    # This also keeps J defined when num_iters is 0.
    J = cost_function(W, X, Y)

    return J, W

In [6]:
# Starting point for the optimiser: every parameter (intercept included) is -40
initial_W = np.full(X_train.shape[1], -40.0)

# Gradient descent hyper-parameters
iterations = 20000
alpha = 0.02

# Train on the training split with the functions defined above, then report
# the final weights W, the final cost J and the accuracy on the test split.
J, W, acc = 0, 0, 0

# NOTE: to print correctly, W must be of shape (5,), J must be scalar float
J, W = logreg_gradient_descent(
    X_train,
    Y_train,
    initial_W,
    logreg_costFunction,
    logreg_GradFunction,
    alpha,
    iterations,
)

# Predicted class = predicted probability rounded to 0 or 1
test_probabilities = sigmoid(np.dot(X_test, W))
y_pred = np.round(test_probabilities)

# Accuracy = share of test examples whose predicted class equals the label
correct = np.sum(y_pred == Y_test)
acc = correct / len(Y_test)

report = [
    ('Please copy the folowing result line to Question 3 "(sumW = )"', np.round(np.sum(W), 2)),
    ('Please copy the folowing result line to Question 3 "(J = )"', np.round(J, 2)),
    ('Please copy the folowing result line to Question 3 "(Accuracy = )"', np.round(acc, 2)),
]
for heading, value in report:
    print(heading)
    print(value)

Please copy the folowing result line to Question 3 "(sumW = )"
0.4
Please copy the folowing result line to Question 3 "(J = )"
0.56
Please copy the folowing result line to Question 3 "(Accuracy = )"
0.78
