##  Model: Multiple Linear Regression
$$ f_{\mathbf{w},b}(\mathbf{x}) =  w_0x_0 + w_1x_1 +... + w_{n-1}x_{n-1} + b $$
$$ f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b $$ 

## Cost Function: Mean Squared Error (MSE)
$$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2 $$


## Batch Gradient Descent 
$$\begin{align*}
&\text{repeat until convergence:} \; \lbrace \\
&  \; \; \;w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \; & \text{for j := 0..n-1} \\ 
&  \; \; \;  \; \;b = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\
&\rbrace \text{simultaneous updates}
\end{align*}$$

$$
\begin{align}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})
\end{align}
$$

In [1]:
import numpy as np

In [2]:
def func_x(X, w, b):
    """
    Predicts the target of every training example, one row at a time.

    Loop-based implementation of f_wb(x) = w . x + b.

    Args:
        X (ndarray): Shape (m, n) Input features, m training examples with n features each.
        w (ndarray): Shape (n,) Model weights, one per feature.
        b (scalar): Bias term added to each weighted sum.

    Returns:
        f_x (ndarray): Shape (m,) Predicted value for each training example.
    """
    # One Prediction Slot per Training Example
    predictions = np.zeros(X.shape[0])

    # Fill Each Slot with the Weighted Sum of the Row's Features Plus the Bias
    for row_idx, features in enumerate(X):
        predictions[row_idx] = np.dot(w, features) + b

    return predictions

In [3]:
""" Usage: Multiple Linear Regression """

def func_x_vec(X, w, b):
    """
    Predicts the target of every training example with a single matrix-vector product.

    Args:
        X (ndarray): Shape (m, n) Input features, m training examples with n features each.
        w (ndarray): Shape (n,) Model weights, one per feature.
        b (scalar): Bias term added to every weighted sum.

    Returns:
        ndarray: Shape (m,) Predicted value for each training example.
    """
    # Weighted Sum of the Features for All Examples at Once
    weighted_sums = np.dot(X, w)

    # Shift Every Prediction by the Bias
    return weighted_sums + b

In [4]:
def compute_cost(X, y, w, b):
    """
    Computes the mean squared error cost J(w,b) for linear regression.
    Loop-based: accumulates the squared error one example at a time.

    Args:
        X (ndarray): Shape (m, n) Data, m examples with n features
        y (ndarray): Shape (m,) Target values
        w (ndarray): Shape (n,) Model weights
        b (scalar): Model bias

    Returns
        J (float): Sum of squared prediction errors divided by (2 * m)
    """
    # Number of Training Examples
    m = X.shape[0]

    # Running Sum of Squared Errors
    squared_error_sum = 0

    for i, x_i in enumerate(X):
        # Error of Example i (Predicted Value - Target Value)
        residual = (np.dot(w, x_i) + b) - y[i]
        squared_error_sum += residual**2

    # Halved Mean of the Squared Errors
    return squared_error_sum / (2 * m)

In [5]:
""" Usage: Multiple Linear Regression """

def compute_cost_vec(X, y, w, b):
    """
    Computes the mean squared error cost J(w,b) for linear regression.
    Vectorized.

    Args:
        X (ndarray): Shape (m, n) Data, m examples with n features
        y (ndarray): Shape (m,) Target values
        w (ndarray): Shape (n,) Model weights
        b (scalar): Model bias

    Returns
        J (float): Sum of squared prediction errors divided by (2 * m)
    """
    # Number of Training Examples
    m = X.shape[0]

    # Errors for All Examples at Once (Predicted Value - Target Value)
    residuals = (np.dot(X, w) + b) - y

    # Sum of Squared Errors, then Halved Mean
    squared_error_sum = np.sum(np.square(residuals))
    return squared_error_sum / (2 * m)

In [6]:
def compute_gradients(X, y, w, b):
    """
    Computes the gradients of the MSE cost for linear regression.
    Loop-based over the training examples.

    Args:
      X (ndarray (m,n)): Data, m Examples with n Features
      y (ndarray (m,)) : Target Values
      w (ndarray (n,)) : Model Parameters
      b (scalar)       : Model Parameter

    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
    """
    m, n = X.shape

    # Accumulators: One Entry per Weight, Plus One for the Bias
    grad_w = np.zeros(n)
    grad_b = 0

    for i, x_i in enumerate(X):
        # Error of Example i (Predicted Value - Target Value)
        residual = (np.dot(w, x_i) + b) - y[i]

        # Every Weight's Gradient Grows by Error * Its Feature Value
        grad_w += residual * x_i

        # The Bias Gradient Grows by the Error Itself
        grad_b += residual

    # Average over the Training Examples
    grad_w /= m
    grad_b /= m

    return grad_w, grad_b

In [7]:
""" Usage: Multiple Linear Regression """

def compute_gradients_vec(X, y, w, b):
    """
    Computes the gradients of the MSE cost for linear regression.
    Vectorized.

    Args:
      X (ndarray (m,n)): Data, m Examples with n Features
      y (ndarray (m,)) : Target Values
      w (ndarray (n,)) : Model Parameters
      b (scalar)       : Model Parameter

    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
    """
    m, _ = X.shape

    # Errors for All Examples at Once (Predicted Value - Target Value)
    residuals = (np.dot(X, w) + b) - y

    # X.T @ residuals Sums (Error * Feature Value) over the Examples for Each Feature
    grad_w = np.dot(X.T, residuals) / m

    # The Bias Gradient is the Mean Error
    grad_b = np.sum(residuals) / m

    return grad_w, grad_b

In [8]:
def gradient_descent(X, y, w_init, b_init, alpha, num_iters, compute_cost, compute_gradients):
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking
    num_iters gradient steps with learning rate alpha.

    Args:
      X (ndarray (m,n))     : Data, m examples with n features
      y (ndarray (m,))      : target values
      w_init (ndarray (n,)) : initial model parameters
      b_init (scalar)       : initial model parameter
      alpha (float)         : Learning rate
      num_iters (int)       : number of iterations to run gradient descent
      compute_cost          : function called as compute_cost(X, y, w, b), returns the cost
      compute_gradients     : function called as compute_gradients(X, y, w, b); must return
                              the gradients in the order (dj_dw, dj_db)

    Returns:
      w (ndarray (n,)) : Updated values of parameters
      b (scalar)       : Updated value of parameter
      J_hist (list)    : Cost after each of the first 100000 iterations
      p_hist (list)    : Parameters [w, b] after each of the first 100000 iterations
    """
    # Imported Locally: math.ceil is Needed for the Progress Report Below and
    # the Function Must Not Depend on the Caller Having Imported math
    import math

    w = w_init
    b = b_init

    # Store Cost (J) and Parameters (w,b)
    J_hist = []
    p_hist = []

    # Progress is Printed About Ten Times over the Whole Run
    report_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        # Calculate the Partial Derivatives wrt. the Parameters
        dj_dw, dj_db = compute_gradients(X, y, w, b)

        # Update the Parameters (New Objects are Created, the Inputs are Not Modified in Place)
        w = w - (alpha * dj_dw)
        b = b - (alpha * dj_db)

        # Save Cost J at Each Iteration Less than 100000 to Prevent Resource Exhaustion
        if i < 100000:
            cost = compute_cost(X, y, w, b)
            J_hist.append(cost)
            p_hist.append([w, b])

        # Print the Most Recently Stored Cost
        if i % report_every == 0:
            print(f"Iteration {i:4}: Cost {J_hist[-1]:0.2e} w:{w} b:{b:0.5e}")

    return w, b, J_hist, p_hist

## Model: L2 Regularized (Ridge) Linear Regression
$$ f_{\mathbf{w},b}(\mathbf{x}) =  w_0x_0 + w_1x_1 +... + w_{n-1}x_{n-1} + b $$
$$ f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b $$ 

## Cost Function: Regularized Mean Squared Error (MSE)
$$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2  + \frac{\lambda}{2m}  \sum_{j=0}^{n-1} w_j^2 $$

## Regularized Batch Gradient Descent
$$\begin{align*}
&\text{repeat until convergence:} \; \lbrace \\
&  \; \; \;w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \; & \text{for j := 0..n-1} \\ 
&  \; \; \;  \; \;b = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\
&\rbrace \text{simultaneous updates}
\end{align*}$$

$$\frac{\partial J(\mathbf{w},b)}{\partial w_j} = \left( \frac{1}{m}  \sum_{i=0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) x_j^{(i)} \right) + \frac{\lambda}{m} w_j  \quad\, \mbox{for $j=0...(n-1)$}$$
$$\frac{\partial J(\mathbf{w},b)}{\partial b} = \frac{1}{m}  \sum_{i=0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})$$

In [9]:
def compute_cost_linear_reg(X, y, w, b, lambda_=1):
    """
    Computes the L2-Regularized MSE Cost over All Examples.
    Loop-based.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns:
      total_cost (scalar): MSE cost plus the L2 penalty on w (b is not penalized)
    """
    m = X.shape[0]

    # Data Term: Halved Mean of the Squared Errors
    squared_error_sum = 0.
    for i, x_i in enumerate(X):
        residual = (np.dot(x_i, w) + b) - y[i]
        squared_error_sum = squared_error_sum + residual**2
    data_cost = squared_error_sum / (2 * m)

    # Penalty Term: Sum of Squared Weights Scaled by lambda / (2m)
    weight_sq_sum = 0
    for w_j in w:
        weight_sq_sum += w_j**2
    penalty = weight_sq_sum * (lambda_ / (2 * m))

    return data_cost + penalty

In [10]:
""" Usage: Multiple Linear Regression """

def compute_cost_linear_reg_vec(X, y, w, b, lambda_=1):
    """
    Computes the L2-Regularized MSE Cost over All Examples.
    Vectorized.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns:
      total_cost (scalar): MSE cost plus the L2 penalty on w (b is not penalized)
    """
    # Number of Training Examples
    m = X.shape[0]

    # Errors for All Examples at Once (Predicted Value - Target Value)
    residuals = (np.dot(X, w) + b) - y

    # Data Term: Halved Mean of the Squared Errors
    data_cost = np.sum(np.square(residuals)) / (2 * m)

    # Penalty Term: Only the Weights are Penalized, Not the Bias
    penalty = np.sum(w**2) * (lambda_ / (2 * m))

    return data_cost + penalty

In [11]:
def compute_gradient_linear_reg(X, y, w, b, lambda_):
    """
    Computes the Gradients for L2-Regularized Linear Regression.
    Loop-based.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization

    Returns (in this order, bias gradient first):
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, n = X.shape
    grad_w = np.zeros((n,))
    grad_b = 0.

    for i, x_i in enumerate(X):
        # Error of Example i (Predicted Value - Target Value)
        residual = (np.dot(x_i, w) + b) - y[i]
        grad_w = grad_w + residual * x_i
        grad_b = grad_b + residual

    # Average over the Training Examples
    grad_w = grad_w / m
    grad_b = grad_b / m

    # Add the Derivative of the L2 Penalty to Each Weight Gradient (Bias is Not Penalized)
    reg_scale = lambda_ / m
    for j, w_j in enumerate(w):
        grad_w[j] += reg_scale * w_j

    return grad_b, grad_w

In [12]:
""" Usage: Multiple Linear Regression """

def compute_gradient_linear_reg_vec(X, y, w, b, lambda_):
    """
    Computes the Gradients for L2-Regularized Linear Regression.
    Vectorized.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization

    Returns (in this order, bias gradient first):
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, _ = X.shape

    # Errors for All Examples at Once (Predicted Value - Target Value)
    residuals = (np.dot(X, w) + b) - y

    # Data Gradient Plus the L2 Penalty Derivative, Both Divided by m
    grad_w = (np.dot(X.T, residuals) + lambda_ * w) / m

    # The Bias is Not Regularized
    grad_b = np.sum(residuals) / m

    return grad_b, grad_w

## Model: Logistic Regression
#### g = Sigmoid Function (Logistic Function)
$$
\begin{align}
  f_{\mathbf{w},b}(\mathbf{x^{(i)}}) &= g(z^{(i)}) \\
  z^{(i)} &= \mathbf{w} \cdot \mathbf{x}^{(i)}+ b \\
  g(z^{(i)}) &= \frac{1}{1+e^{-z^{(i)}}}
\end{align}
$$
  
$$f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = P(y=1 \mid \mathbf{x}; \mathbf{w}, b)$$
If $f_{\mathbf{w},b}(x) \geq 0.5$, Predict $y=1$

If $f_{\mathbf{w},b}(x) < 0.5$, Predict $y=0$

#### Decision Boundary: $z = \mathbf{w} \cdot \mathbf{x} + b = 0$

  If $\mathbf{w} \cdot \mathbf{x} + b \geq 0$, the Model Predicts $y=1$
  
  If $\mathbf{w} \cdot \mathbf{x} + b < 0$, the Model Predicts $y=0$
 
## Cost Function:  Log-Loss (Binary Cross-Entropy Loss)
$$J(\mathbf{w},b) = \frac{1}{m}  \sum_{i=0}^{m-1} \left[ -y^{(i)} \log\left(f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) - \left( 1 - y^{(i)}\right) \log \left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) \right]
$$

## Batch Gradient Descent
$$\begin{align*}
&\text{repeat until convergence:} \; \lbrace \\
&  \; \; \;w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \; & \text{for j := 0..n-1} \\ 
&  \; \; \;  \; \;b = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\
&\rbrace \text{simultaneous updates}
\end{align*}$$

$$\begin{align*}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})
\end{align*}$$

In [13]:
def sigmoid(z):
    """
    Applies the logistic (sigmoid) function 1 / (1 + e^(-z)) element-wise.

    Args:
        z (ndarray): A scalar or a numpy array of any size.

    Returns:
        g (ndarray): sigmoid(z), with the same shape as z
    """
    exp_neg_z = np.exp(-z)

    return 1 / (1 + exp_neg_z)

In [14]:
def compute_cost(X, y, w, b):
    """
    Computes the Log Loss (Binary Cross-Entropy) Cost for Logistic Regression.
    Loop-based; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      cost (scalar): mean log loss over the m examples
    """
    m = X.shape[0]

    total_loss = 0.0
    for i, x_i in enumerate(X):
        # Predicted Probability of y = 1 for Example i
        prob_i = sigmoid(np.dot(x_i, w) + b)

        # Log Loss of Example i
        total_loss += -y[i] * np.log(prob_i) - (1 - y[i]) * np.log(1 - prob_i)

    return total_loss / m

In [15]:
""" Usage: Multiple Features """

def compute_cost_vec(X, y, w, b):
    """
    Computes the Log Loss (Binary Cross-Entropy) Cost for Logistic Regression.
    Vectorized; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      cost (scalar): mean log loss over the m examples
    """
    m = X.shape[0]

    # Predicted Probability of y = 1 for Every Example
    probs = sigmoid(np.dot(X, w) + b)

    # Per-Example Log-Likelihood; the Cost is Its Negated Mean
    log_likelihood = y * np.log(probs) + (1 - y) * np.log(1 - probs)

    return -np.sum(log_likelihood) / m

In [16]:
def compute_gradient_logistic_reg(X, y, w, b, lambda_):
    """
    Computes the Gradients for (Unregularized) Logistic Regression.
    Loop-based; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Accepted but not used by this version (no regularization term is added)
    Returns (in this order, bias gradient first)
      dj_db (scalar)            : The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray Shape (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, n = X.shape
    grad_w = np.zeros((n,))
    grad_b = 0.0

    for i, x_i in enumerate(X):
        # Error of Example i (Predicted Probability - Target Value)
        residual = sigmoid(np.dot(x_i, w) + b) - y[i]
        grad_w = grad_w + residual * x_i
        grad_b = grad_b + residual

    # Average over the Training Examples
    grad_w = grad_w / m
    grad_b = grad_b / m

    return grad_b, grad_w

In [17]:
""" Usage: Multiple Features """

def compute_gradient_logistic_reg_vec(X, y, w, b, lambda_):
    """
    Computes the Gradients for (Unregularized) Logistic Regression.
    Vectorized; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Accepted but not used by this version (no regularization term is added)
    Returns (in this order, bias gradient first)
      dj_db (scalar)            : The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray Shape (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, _ = X.shape

    # Predicted Probability of y = 1 for Every Example
    probs = sigmoid(np.dot(X, w) + b)

    # Errors for All Examples at Once (Predicted Probability - Target Value)
    residuals = probs - y

    grad_w = np.dot(X.T, residuals) / m
    grad_b = np.sum(residuals) / m

    return grad_b, grad_w

## Model: L2 Regularized (Ridge) Logistic Regression
#### g = Sigmoid Function (Logistic Function)
$$
\begin{align}
  f_{\mathbf{w},b}(\mathbf{x^{(i)}}) &= g(z^{(i)}) \\
  z^{(i)} &= \mathbf{w} \cdot \mathbf{x}^{(i)}+ b \\
  g(z^{(i)}) &= \frac{1}{1+e^{-z^{(i)}}}
\end{align}
$$
  
$$f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = P(y=1 \mid \mathbf{x}; \mathbf{w}, b)$$
If $f_{\mathbf{w},b}(x) \geq 0.5$, Predict $y=1$

If $f_{\mathbf{w},b}(x) < 0.5$, Predict $y=0$

#### Decision Boundary: $z = \mathbf{w} \cdot \mathbf{x} + b = 0$

  If $\mathbf{w} \cdot \mathbf{x} + b \geq 0$, the Model Predicts $y=1$
  
  If $\mathbf{w} \cdot \mathbf{x} + b < 0$, the Model Predicts $y=0$

## Cost Function:  Regularized Log-Loss (Regularized Binary Cross-Entropy Loss)
$$J(\mathbf{w},b) = \frac{1}{m}  \sum_{i=0}^{m-1} \left[ -y^{(i)} \log\left(f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) - \left( 1 - y^{(i)}\right) \log \left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) \right] + \frac{\lambda}{2m}  \sum_{j=0}^{n-1} w_j^2$$

## Regularized Batch Gradient Descent 
$$\begin{align*}
&\text{repeat until convergence:} \; \lbrace \\
&  \; \; \;w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \; & \text{for j := 0..n-1} \\ 
&  \; \; \;  \; \;b = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \\
&\rbrace \text{simultaneous updates}
\end{align*}$$

$$\frac{\partial J(\mathbf{w},b)}{\partial w_j} = \left( \frac{1}{m}  \sum_{i=0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) x_j^{(i)} \right) + \frac{\lambda}{m} w_j  \quad\, \mbox{for $j=0...(n-1)$}$$
$$\frac{\partial J(\mathbf{w},b)}{\partial b} = \frac{1}{m}  \sum_{i=0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})$$

In [18]:
def compute_cost_logistic_reg(X, y, w, b, lambda_=1):
    """
    Computes the L2-Regularized Log-Loss Cost over All Examples.
    Loop-based.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns:
      total_cost (scalar): mean log loss plus the L2 penalty on w (b is not penalized)
    """
    m, n = X.shape

    # Data Term: Mean Log Loss
    total_loss = 0.
    for i, x_i in enumerate(X):
        logit_i = np.dot(x_i, w) + b
        # Sigmoid Computed Inline: Predicted Probability of y = 1
        prob_i = 1 / (1 + np.exp(-logit_i))
        total_loss += -y[i] * np.log(prob_i) - (1 - y[i]) * np.log(1 - prob_i)
    data_cost = total_loss / m

    # Penalty Term: Sum of the First n Squared Weights Scaled by lambda / (2m)
    weight_sq_sum = 0
    for j in range(n):
        weight_sq_sum += w[j]**2
    penalty = (lambda_ / (2 * m)) * weight_sq_sum

    return data_cost + penalty

In [19]:
""" Usage: Multiple Features """

def compute_cost_logistic_reg_vec(X, y, w, b, lambda_=1):
    """
    Computes the L2-Regularized Log-Loss Cost over All Examples.
    Vectorized.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns:
      total_cost (scalar): mean log loss plus the L2 penalty on w (b is not penalized)
    """
    m, _ = X.shape

    # Sigmoid Computed Inline: Predicted Probability of y = 1 for Every Example
    logits = np.dot(X, w) + b
    probs = 1 / (1 + np.exp(-logits))

    # Data Term: Negated Mean of the Per-Example Log-Likelihood
    log_likelihood = y * np.log(probs) + (1 - y) * np.log(1 - probs)
    data_cost = -np.sum(log_likelihood) / m

    # Penalty Term: Sum of Squared Weights Scaled by lambda / (2m)
    penalty = np.sum(w**2) * (lambda_ / (2 * m))

    return data_cost + penalty

In [20]:
def compute_gradient_logistic_reg(X, y, w, b, lambda_):
    """
    Computes the Gradients for L2-Regularized Logistic Regression.
    Loop-based; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns (in this order, bias gradient first)
      dj_db (scalar)            : The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray Shape (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, n = X.shape
    grad_w = np.zeros((n,))
    grad_b = 0.0

    for i, x_i in enumerate(X):
        # Error of Example i (Predicted Probability - Target Value)
        residual = sigmoid(np.dot(x_i, w) + b) - y[i]
        grad_w = grad_w + residual * x_i
        grad_b = grad_b + residual

    # Average over the Training Examples
    grad_w = grad_w / m
    grad_b = grad_b / m

    # Add the Derivative of the L2 Penalty to Each Weight Gradient (Bias is Not Penalized)
    reg_scale = lambda_ / m
    for j in range(n):
        grad_w[j] += reg_scale * w[j]

    return grad_b, grad_w

In [21]:
""" Usage: Multiple Features """

def compute_gradient_logistic_reg_vec(X, y, w, b, lambda_):
    """
    Computes the Gradients for L2-Regularized Logistic Regression.
    Vectorized; uses the sigmoid function defined in this file.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
      lambda_ (scalar) : Controls amount of regularization
    Returns (in this order, bias gradient first)
      dj_db (scalar)            : The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray Shape (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, _ = X.shape

    # Predicted Probability of y = 1 for Every Example
    probs = sigmoid(np.dot(X, w) + b)

    # Errors for All Examples at Once (Predicted Probability - Target Value)
    residuals = probs - y

    # Data Gradient Plus the L2 Penalty Derivative, Both Divided by m
    grad_w = (np.dot(X.T, residuals) + lambda_ * w) / m

    # The Bias is Not Regularized
    grad_b = np.sum(residuals) / m

    return grad_b, grad_w

## Z-Score Normalization ## 

$$x^{(i)}_j = \dfrac{x^{(i)}_j - \mu_j}{\sigma_j}$$ 

$$
\begin{align}
\mu_j &= \frac{1}{m} \sum_{i=0}^{m-1} x^{(i)}_j \\
\sigma^2_j &= \frac{1}{m} \sum_{i=0}^{m-1} (x^{(i)}_j - \mu_j)^2  
\end{align}
$$

In [22]:
def zscore_normalization(X):
    """
    Computes X, Z-Score Normalized by column.
    After Z-Score Normalization, every non-constant feature has a mean of 0 and a
    standard deviation of 1. A constant feature (standard deviation 0) is mapped to
    all zeros instead of NaN/inf, because it is divided by 1 rather than by 0.

    Args:
      X (ndarray (m,n))     : input data, m examples, n features

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature (0 for a constant
                              feature; the value returned is the true standard
                              deviation, not the divisor used internally)
    """
    # Find the Mean of Each Column/Feature
    # mu Will Have a Shape of (n,)
    mu = np.mean(X, axis=0)

    # Find the Standard Deviation of Each Column/Feature
    # sigma Will Have a Shape of (n,)
    sigma = np.std(X, axis=0)

    # A Zero Standard Deviation Would Cause a Division by Zero, so Divide Those Columns by 1
    safe_sigma = np.where(sigma == 0, 1.0, sigma)

    # Element-Wise, Subtract mu for That Column from Each Example, then Divide by std for That Column
    X_norm = (X - mu) / safe_sigma

    return X_norm, mu, sigma

## Model: K-Means Algorithm
The K-means algorithm is a method to automatically cluster similar
data points together. 

* You are given a training set $\{x^{(1)}, ..., x^{(m)}\}$, and you want to group the data into a few cohesive “clusters”. 


* K-means is an iterative procedure that
     * Starts by guessing the initial centroids, and then 
     * Refines this guess by 
         * Repeatedly assigning examples to their closest centroids, and then 
         * Recomputing the centroids based on the assignments.
         

* In pseudocode, the K-means algorithm is as follows:

    ``` python
    # Initialize centroids
    # K is the number of clusters
    centroids = kMeans_init_centroids(X, K)
    
    for iter in range(iterations):
        # Cluster assignment step: 
        # Assign each data point to the closest centroid. 
        # idx[i] corresponds to the index of the centroid 
        # assigned to example i
        idx = find_closest_centroids(X, centroids)

        # Move centroid step: 
        # Compute means based on centroid assignments
        centroids = compute_centroids(X, idx, K)
    ```
    
    
* The inner-loop of the algorithm repeatedly carries out two steps: 
    1. Assigning each training example $x^{(i)}$ to its closest centroid, and
    2. Recomputing the mean of each centroid using the points assigned to it. 
    
    
* The $K$-means algorithm will always converge to some final set of means for the centroids. 

* However, the converged solution may not always be ideal and depends on the initial setting of the centroids.
    * Therefore, in practice the K-means algorithm is usually run a few times with different random initializations. 
    * One way to choose between these different solutions from different random initializations is to choose the one with the lowest cost function value (distortion).

#### Finding Closest Centroids

In the “cluster assignment” phase of the K-means algorithm, the
algorithm assigns every training example $x^{(i)}$ to its closest
centroid, given the current positions of centroids. 

* This function takes the data matrix `X` and the locations of all
centroids inside `centroids` 
* It should output a one-dimensional array `idx` (with one element per training example, i.e. per row of `X`) that holds the index of the closest centroid (a value in $\{0,...,K-1\}$, where $K$ is the total number of centroids) for every training example. *(Note: The index range 0 to K-1 differs slightly from what is shown in the lectures (i.e. 1 to K) because Python indices start at 0 instead of 1.)*
* Specifically, for every example $x^{(i)}$ we set
$$c^{(i)} := j \quad \mathrm{that \; minimizes} \quad ||x^{(i)} - \mu_j||^2,$$
where 
 * $c^{(i)}$ is the index of the centroid that is closest to $x^{(i)}$ (corresponds to `idx[i]` in the starter code), and 
 * $\mu_j$ is the position (value) of the $j$’th centroid. (stored in `centroids` in the starter code)
 * $||x^{(i)} - \mu_j||$ is the L2-norm

#### Computing Centroid Means

Given assignments of every point to a centroid, the second phase of the
algorithm recomputes, for each centroid, the mean of the points that
were assigned to it.

* Specifically, for every centroid $\mu_k$ we set
$$\mu_k = \frac{1}{|C_k|} \sum_{i \in C_k} x^{(i)}$$ 

    where 
    * $C_k$ is the set of examples that are assigned to centroid $k$
    * $|C_k|$ is the number of examples in the set $C_k$


* Concretely, if two examples, say $x^{(3)}$ and $x^{(5)}$, are assigned to centroid $k=2$,
then you should update $\mu_2 = \frac{1}{2}(x^{(3)}+x^{(5)})$.

In [23]:
def find_closest_centroids(X, centroids):
    """
    Assigns every example to the centroid with the smallest squared L2 distance.

    Args:
        X (ndarray): (m, n) Input values
        centroids (ndarray): (K, n) centroids

    Returns:
        idx (array_like): (m,) index (0..K-1) of the closest centroid for each example;
                          on a tie the lowest centroid index is chosen

    """
    # Number of Centroids and Number of Training Examples
    K = centroids.shape[0]
    m = X.shape[0]

    idx = np.zeros(m, dtype=int)

    for i, example in enumerate(X):
        # Squared L2 Distance from Example i to Each of the K Centroids
        sq_distances = np.array(
            [np.sum(np.square(example - centroids[j])) for j in range(K)],
            dtype=float,
        )

        # The Closest Centroid is the One with the Smallest Squared Distance
        idx[i] = np.argmin(sq_distances)

    return idx

In [24]:
def compute_centroids(X, idx, K):
    """
    Returns the new centroids by computing the means of the
    data points assigned to each centroid.

    Note: a centroid with no assigned examples is divided by a count of zero,
    so its row in the result is NaN.

    Args:
        X (ndarray):   (m, n) Data points
        idx (ndarray): (m,) Array containing index of closest centroid for each
                       example in X. Concretely, idx[i] contains the index of
                       the centroid closest to example i
        K (int):       number of centroids

    Returns:
        centroids (ndarray): (K, n) New centroids computed
    """
    m, n = X.shape
    centroids = np.zeros((K, n))

    for k in range(K):
        # How Many Examples are Assigned to Centroid k
        member_count = np.sum(idx == k)

        # Sum the Examples Assigned to Centroid k, in Dataset Order
        member_sum = np.zeros(n)
        for member in (X[i] for i in range(m) if idx[i] == k):
            member_sum += member

        # The New Centroid is the Mean of Its Members
        centroids[k] = member_sum / member_count

    return centroids

In [25]:
def run_kMeans(X, initial_centroids, max_iters=10):
    """
    Runs the K-Means Algorithm on Data Matrix X, Where Each Row of X
    is a Single Example.

    Alternates max_iters times between assigning examples to their closest
    centroid (find_closest_centroids) and recomputing the centroids
    (compute_centroids), printing a progress line before each iteration.

    Args:
        X (ndarray):                 (m, n) Data points
        initial_centroids (ndarray): (K, n) Starting centroids
        max_iters (int):             number of iterations to run

    Returns:
        centroids (ndarray): centroids after the last iteration
        idx (ndarray):       (m,) centroid assignments from the last iteration
                             (an all-zero array if no iteration ran)
    """
    # Number of Examples and Number of Clusters
    m, _ = X.shape
    K = initial_centroids.shape[0]

    # Starting State, Returned Unchanged When max_iters is 0
    idx = np.zeros(m)
    centroids = initial_centroids

    final_iter = max_iters - 1
    for iteration in range(max_iters):
        # Output Progress
        print("K-Means Iteration %d/%d" % (iteration, final_iter))

        # Assignment Step: Closest Centroid for Each Example
        idx = find_closest_centroids(X, centroids)

        # Update Step: New Centroids from the Current Memberships
        centroids = compute_centroids(X, idx, K)

    return centroids, idx

In [26]:
def kMeans_init_centroids(X, K):
    """
    Picks K distinct examples of X at random to serve as the initial
    centroids for K-Means.

    Args:
        X (ndarray): Data points
        K (int):     number of centroids/clusters

    Returns:
        centroids (ndarray): Initialized centroids (K rows of X)
    """
    # Shuffle the Example Indices
    shuffled_indices = np.random.permutation(X.shape[0])

    # The First K Shuffled Indices Select the Centroids
    chosen = shuffled_indices[:K]

    return X[chosen]

In [27]:
# Implementation
''' 
# Set K and max_iters
K = 3
max_iters = 10

# Set Initial Centroids by Picking Random Examples from the Dataset
initial_centroids = kMeans_init_centroids(X, K)

# Run K-Means
centroids, idx = run_kMeans(X, initial_centroids, max_iters)
'''

' \n# Set K and max_iters\nK = 3\nmax_iters = 10\n\n# Set Initial Centroids by Picking Random Examples from the Dataset\ninitial_centroids = kMeans_init_centroids(X, K)\n\n# Run K-Means\ncentroids, idx = run_kMeans(X, initial_centroids, max_iters)\n'