<a href="https://colab.research.google.com/github/casalazarb/logistic_regression/blob/main/logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [139]:
import numpy as np
import pandas as pd
#to load csv file to google colab
from google.colab import drive

In [140]:
# Mount Google Drive at /content/gdrive; the CSV read below lives under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [141]:
# Load the loan data set from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/loan.csv')

In [142]:
# Print the frame to inspect its columns (default, rate, log_income, fico).
print(df)

    default    rate  log_income  fico
0         1  0.1496   10.714418   667
1         1  0.1114   11.002100   722
2         1  0.1343   11.884489   682
3         1  0.1059   10.433822   687
4         1  0.1501   12.269047   677
5         1  0.0964   11.225243   772
6         1  0.1280   10.373491   682
7         1  0.1122   10.596635   712
8         1  0.0964   11.472103   737
9         1  0.1154    9.615805   672
10        1  0.1501   11.472103   702
11        1  0.1217   11.156251   672
12        1  0.1375   11.038110   677
13        1  0.1249   10.571317   667
14        1  0.1028   10.727663   687
15        0  0.1189   11.350407   737
16        0  0.1071   11.082143   707
17        0  0.1357   10.373491   682
18        0  0.1008   11.350407   712
19        0  0.1426   11.299732   667
20        0  0.0788   11.904968   727
21        0  0.1134   11.407565   682
22        0  0.1221   10.203592   707
23        0  0.1347   10.434116   677
24        0  0.1324   11.835009   662
25        0 

In [143]:
# Explanatory variables stacked as rows: one row per variable, one column per example
X_train = np.vstack((df['rate'], df['log_income'], df['fico']))
print(X_train.shape)

(3, 30)


In [144]:
# Labels copied into a single row so they line up with the columns of X_train
Y_train = np.array(df['default'])[np.newaxis, :]
print(Y_train.shape)

(1, 30)


In [145]:
#sigmoid function

def sigmoid(x_dot_thetas):
    """
    Logistic (sigmoid) function 1 / (1 + exp(-z)), evaluated element-wise.

    Arguments:
    x_dot_thetas: scalar or numpy array of linear scores, i.e. the dot product of
                  the variables of each example with the parameters

    Return:
    sigmoid(x_dot_thetas), same shape as the input, every entry between 0 and 1
    """

    # 1 / (1 + exp(-z)) is rewritten as exp(-log(1 + exp(-z))).
    # np.logaddexp(0, -z) evaluates log(1 + exp(-z)) without forming exp(-z),
    # so very negative scores return 0.0 instead of overflowing inside np.exp.
    return np.exp(-np.logaddexp(0, -x_dot_thetas))

In [146]:
def initialize_with_zeroes(n_param):
    """
    Build the starting point for gradient descent: every parameter set to zero.

    Argument:
    n_param: number of slope parameters (the constant term is not counted)

    Returns:
    thetas: zero column vector with n_param rows and one column
    theta_0: constant term, initialized to 0
    """

    return np.zeros(shape=(n_param, 1)), 0


In [147]:
def propagate(thetas, theta_0, X, Y):
    """
    Compute the logistic-regression cost and its gradients at the given parameters.

    Arguments:
    thetas: slope parameters of the regression, column vector (number of variables, 1)
    theta_0: constant term of the regression
    X: matrix with the observations used to train the algorithm, one column per
       example: shape (number of variables, n)
    Y: label vector (0/1) for a binary classification, one column per example

    Return:
    grads: dictionary with
        "d_thetas": derivative of J (cost function) with respect to thetas
        "d_theta_0": derivative of J with respect to theta_0
    cost: value of the cost function (average cross-entropy) given the parameters
          and the examples; this is the quantity gradient descent minimizes
    """

    #n number of observations: the examples are the columns of X
    n = X.shape[1]

    #linear score of every example
    z = np.dot(np.transpose(thetas), X) + theta_0

    #p is the probability that an example belongs to category or label 1
    p = sigmoid(z)

    # Cross-entropy written in terms of the scores instead of p:
    #   -log(p)     = log(1 + exp(-z)) = logaddexp(0, -z)
    #   -log(1 - p) = log(1 + exp(z))  = logaddexp(0, z)
    # Taking np.log of p directly gives -inf once p saturates to 0 or 1, and the
    # product 0 * -inf then turns the whole cost into nan.
    cost = 1/n * np.sum(Y * np.logaddexp(0, -z) + (1-Y) * np.logaddexp(0, z))

    #derivatives of the cost function with respect to the parameters
    d_thetas = 1/n * np.dot(X,np.transpose(p-Y))
    d_theta_0 = 1/n * np.sum(p-Y)

    cost = np.squeeze(cost)

    grads = {"d_thetas": d_thetas,
             "d_theta_0": d_theta_0}

    return grads, cost

In [148]:

def optimize(thetas, theta_0, X, Y, num_iterations, learning_rate, print_cost = False):
    """
    This function optimizes thetas and theta_0 by running a gradient descent algorithm

    Arguments:
    thetas: initial value of the slope parameters; the array passed in is not modified
    theta_0: initial value of the constant term of the regression
    X: matrix with the observations used to train the algorithm, passed unchanged to propagate
    Y: label vector for a binary classification, passed unchanged to propagate
    num_iterations: iterations for the optimization loop
    learning_rate: learning rate of the gradient descent update rule
    print_cost: True to print the cost every 100 iterations

    Returns:
    params: dictionary containing the fitted parameters "thetas" and the constant term "theta_0"
    grads: dictionary containing the gradients "d_thetas" and "d_theta_0" computed in the
           last iteration (i.e. before the final update); both are None when num_iterations is 0
    costs: list of the costs recorded every 100 iterations, hopefully it must decrease consistently
    """

    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array (and fail outright on an integer array).
    thetas = np.array(thetas, dtype=float)

    costs = []

    # Defined up front so that a run with zero iterations still returns cleanly
    d_thetas = None
    d_theta_0 = None

    for iteration in range(num_iterations):

        grads, cost = propagate(thetas, theta_0, X, Y)

        # Retrieve derivatives from grads
        d_thetas = grads["d_thetas"]
        d_theta_0 = grads["d_theta_0"]

        #gradient descent update
        thetas -= learning_rate * d_thetas
        theta_0 -= learning_rate * d_theta_0

        # Record (and optionally print) the cost every 100 iterations
        if iteration % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration {}: {}".format(iteration, cost))

    params = {"thetas": thetas,
              "theta_0": theta_0}

    grads = {"d_thetas": d_thetas,
             "d_theta_0": d_theta_0}

    return params, grads, costs

In [149]:
def predict(thetas, theta_0, X):
    '''
    Score the examples in X with the learned logistic regression parameters
    (thetas, theta_0) and assign each one to category 0 or 1.

    Arguments:
    thetas: array of parameters, reshaped internally to one column with X.shape[0] rows
    theta_0: the constant of the regression
    X: data matrix with explicative variables, one column per example

    Returns:
    p: probability of each example belonging to category 1
    Y_pred: predicted category, 1.0 where the probability is above 0.5 and 0.0 otherwise
    '''

    n_variables = X.shape[0]
    column_thetas = thetas.reshape(n_variables, 1)

    #probability of each example belonging to category 1
    scores = np.dot(column_thetas.T, X) + theta_0
    p = sigmoid(scores)

    #threshold the first (only) row of probabilities at 0.5; ties go to category 0
    Y_pred = np.where(p[:1] > 0.5, 1.0, 0.0)

    return p, Y_pred

In [150]:
def logistic_regression(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """
    Integrates all the parts of the logistic regression model:
    initialize the parameters, fit them by gradient descent, predict and report accuracy.

    Arguments:
    X_train: training set of the explicative variables, one row per variable and one column per example
    Y_train: training set of the labels, one column per example
    X_test: testing set X, same layout as X_train
    Y_test: testing set Y, same layout as Y_train
    num_iterations: for the optimization algorithm
    learning_rate: to update gradient descent
    print_cost: True to print the cost during the optimization, hopefully decreasing

    Returns:
    output: dictionary containing information about the model:
        "costs", "thetas", "theta_0", "learning_rate", "num_iterations",
        the predicted categories "Y_prediction_train" / "Y_prediction_test" and
        the predicted probabilities "Y_probability_train" / "Y_probability_test".
    """

    # initialize parameters with zeros: one slope per row of X_train
    thetas, theta_0 = initialize_with_zeroes(X_train.shape[0])

    # Gradient descent
    parameters, grads, costs = optimize(thetas, theta_0, X_train, Y_train, num_iterations, learning_rate, print_cost)

    #store the parameters of the model
    thetas = parameters["thetas"]
    theta_0 = parameters["theta_0"]

    # predict returns (probabilities, categories); one call per data set gives both
    Y_predprob_train, Y_predgroup_train = predict(thetas, theta_0, X_train)
    Y_predprob_test, Y_predgroup_test = predict(thetas, theta_0, X_test)

    # Print train/test accuracy: share of examples whose predicted category equals the label
    print("train accuracy: {} %".format((1 - np.mean(np.abs(Y_predgroup_train - Y_train))) * 100))
    print("test accuracy: {} %".format((1 - np.mean(np.abs(Y_predgroup_test - Y_test))) * 100))


    output = {"costs": costs,
              "thetas" : thetas,
              "theta_0" : theta_0,
              "learning_rate" : learning_rate,
              "num_iterations": num_iterations,
              "Y_prediction_train": Y_predgroup_train,
              "Y_prediction_test": Y_predgroup_test,
              "Y_probability_train": Y_predprob_train,
              "Y_probability_test": Y_predprob_test}

    return output

In [None]:
# Fit the model on the training data. The same arrays are also passed as the "test" set,
# so the printed test accuracy repeats the train accuracy and is not an out-of-sample estimate.
output = logistic_regression(X_train, Y_train, X_train, Y_train, num_iterations = 10, learning_rate = 0.07)
print(output)