# Logistic Regression From Scratch

In [278]:
from typing import Any

import numpy as np
from numpy.typing import NDArray

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [279]:
# Synthetic classification dataset: 1000 samples with 10 features each.
raw_x, raw_y = make_classification(
    n_samples=1000,
    n_features=10,
    random_state=3442,
)

In [280]:
# Display the dimensions of the feature matrix and the label vector.
raw_x.shape, raw_y.shape

((1000, 10), (1000,))

In [281]:
# Peek at the first five feature rows.
raw_x[:5]

array([[-0.56573764, -1.12659397, -1.58488824,  0.047902  ,  0.60861322,
        -1.23263319, -0.36952168, -0.5890378 ,  0.12181396, -1.57697136],
       [ 0.00678233, -0.84560635, -0.41794588, -1.02328482, -0.51747583,
         0.1463818 ,  0.78425481,  0.07079281,  0.0927115 ,  0.24143793],
       [ 0.75739895,  1.21942221,  1.37968123, -0.3908939 , -0.88958722,
         0.86788148, -0.26789949, -0.40195265,  1.06290301,  1.08675515],
       [-1.14172981,  1.48598207,  0.70375216,  0.62227436,  1.39281664,
        -0.29987348,  0.58863519, -0.83229884,  0.40897603, -0.48098127],
       [ 1.84484214, -1.11423938, -0.12115165, -1.31060452, -0.4074719 ,
         0.78941927, -0.59384031, -0.86682628,  0.70957145,  1.11146838]])

In [282]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    raw_x,
    raw_y,
    test_size=0.2,
    random_state=3453,
)
(x_train.shape, y_train.shape), (x_test.shape, y_test.shape)

(((800, 10), (800,)), ((200, 10), (200,)))

In [283]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model computes ``sigmoid(x @ weight + bias)`` and labels a sample as
    class 1 when that probability is strictly greater than 0.5.
    """

    def __init__(self, learning_rate: float = 0.001, epochs: int = 500) -> None:
        """Store the hyper-parameters; ``fit`` creates the model parameters.

        Args:
            learning_rate: Step size applied to every gradient update.
            epochs: Number of full passes over the training data.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        # Both parameters stay None until fit() has been called.
        self.bias = None
        self.weight = None

    @staticmethod
    def linear_equation(
        x: NDArray[np.float64], w: NDArray[np.float64], b: float
    ) -> NDArray[np.float64]:
        """Return the logits ``x @ w + b``, one value per row of ``x``.

        Args:
            x: Feature matrix with shape (n_samples, n_features).
            w: Weight vector with shape (n_features,).
            b: Scalar bias added to every logit.

        Raises:
            ValueError: If the column count of ``x`` differs from ``len(w)``.
        """
        if x.shape[1] != w.shape[0]:
            raise ValueError("X and W are mismatched column count")

        return np.dot(x, w) + b

    @staticmethod
    def sigmoid(z: NDArray[np.float64]) -> NDArray[np.float64]:
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Accepts a scalar or an array; a scalar input yields a scalar output.
        """
        z = np.asarray(z, dtype=np.float64)
        # exp(-|z|) always lies in (0, 1], so it cannot overflow. The two
        # branches are algebraically the same function, each written in the
        # form that only needs this safe exponential.
        exp_neg_abs = np.exp(-np.abs(z))
        result = np.where(
            z >= 0,
            1.0 / (1.0 + exp_neg_abs),
            exp_neg_abs / (1.0 + exp_neg_abs),
        )
        # Indexing with () unwraps a 0-d array into a scalar and leaves
        # arrays of higher rank untouched.
        return result[()]

    @staticmethod
    def bce(y_true: NDArray, y_pred: NDArray) -> float:
        """Return the mean binary cross-entropy (log loss).

        Equation:
            loss = -1/n . sum(y_i . log(p_i) + (1 - y_i) . log(1 - p_i))

        Args:
            y_true: Ground-truth labels (0 or 1).
            y_pred: Predicted probabilities of class 1.
        """
        eps = 1e-15
        y_true = np.asarray(y_true, dtype=np.float64)
        # Clip so that a probability of exactly 0 or 1 never reaches log(0).
        y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)

        log_likelihood = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
        return float(-np.mean(log_likelihood))

    @staticmethod
    def get_dw(
        y_true: NDArray[np.int_], y_prob: NDArray[np.float64], x: NDArray[np.float64]
    ) -> NDArray[np.float64]:
        """Return the partial derivative of the loss with respect to each weight.

        Equation:
            dw_j = 1/n . sum((y_prob_i - y_true_i) . x_ij)

        Returns:
            Gradient vector with shape (n_features,).
        """
        x = np.asarray(x, dtype=np.float64)
        error = np.asarray(y_prob, dtype=np.float64) - np.asarray(y_true, dtype=np.float64)
        # A single matrix product replaces the per-feature Python loop; the
        # gradient is kept at full precision (no intermediate rounding).
        return x.T @ error / x.shape[0]

    @staticmethod
    def get_db(y_true: NDArray[np.int_], y_prob: NDArray[np.float64]) -> float:
        """Return the partial derivative of the loss with respect to the bias.

        Equation:
            db = 1/n . sum(y_prob_i - y_true_i)
        """
        error = np.asarray(y_prob, dtype=np.float64) - np.asarray(y_true, dtype=np.float64)
        return np.mean(error)

    def update_params(
        self, y_prob: NDArray, y_true: NDArray, x: NDArray
    ) -> tuple[NDArray[np.float64], float]:
        """Compute one gradient-descent step without mutating the model.

        Args:
            y_prob: Predicted probabilities for the rows of ``x``.
            y_true: Ground-truth labels for the rows of ``x``.
            x: Feature matrix with shape (n_samples, n_features).

        Returns:
            tuple : (weight, bias) after the step.
        """
        weight_step = self.learning_rate * self.get_dw(y_prob=y_prob, y_true=y_true, x=x)
        bias_step = self.learning_rate * self.get_db(y_prob=y_prob, y_true=y_true)
        return (self.weight - weight_step, self.bias - bias_step)

    def fit(self, x: NDArray[np.float64], y: NDArray[np.int16]) -> None:
        """Train the model; ``weight`` and ``bias`` are overwritten.

        Args:
            x: (ndarray) : Feature matrix with shape (n_samples, n_features)
            y: (ndarray) : Target vector with shape (n_samples,)

        Raises:
            ValueError : if x and y have mismatched length
        """
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y)
        if x.shape[0] != y.shape[0]:
            raise ValueError(
                f"x and y have mismatched shape x: {x.shape[0]} y: {y.shape[0]}"
            )

        # Small random weights drawn from NumPy's global RNG, so that
        # np.random.seed() controls reproducibility.
        self.weight = np.random.randn(x.shape[1]) * .01
        self.bias = 0.0

        for _ in range(self.epochs):
            logits = self.linear_equation(x=x, w=self.weight, b=self.bias)
            prob = self.sigmoid(logits)

            self.weight, self.bias = self.update_params(x=x, y_prob=prob, y_true=y)
        print("training completed")

    def predict(self, x: np.ndarray) -> NDArray[np.int_]:
        """Return the predicted class label (0 or 1) for every row of ``x``.

        Raises:
            AttributeError: If the model has not been fitted yet.
            ValueError: If the column count of ``x`` differs from the number
                of fitted weights.
        """
        if self.weight is None or self.bias is None:
            # An unfitted model previously failed with an AttributeError on
            # None; the same exception type is kept with a clearer message.
            raise AttributeError("model is not fitted yet; call fit() before predict()")

        x = np.asarray(x, dtype=np.float64)
        pred_logits = self.linear_equation(x=x, w=self.weight, b=self.bias)
        pred_prob = self.sigmoid(pred_logits)

        # Strictly greater than 0.5 maps to class 1; exactly 0.5 maps to class 0.
        return (pred_prob > .5).astype(int)

In [284]:
# Train with the default hyper-parameters (learning_rate=0.001, epochs=500).
model = LogisticRegression()

model.fit(x=x_train, y=y_train)

training completed


In [285]:
# Inspect the learned weight vector (one coefficient per feature).
model.weight

array([ 0.00186504,  0.21134891,  0.13814837,  0.00939806, -0.01274474,
       -0.00626272, -0.00135388,  0.00040708,  0.01708384,  0.01101925])

In [286]:
# Inspect the learned bias term.
model.bias

np.float64(0.004710960800480845)

In [287]:
# Predict class labels for the held-out test set and display them.
y_pred = model.predict(x=x_test)
y_pred

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0])

In [288]:
# Ground-truth test labels, shown for visual comparison with y_pred.
y_test

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0])

In [289]:
# Flatten the confusion matrix into its four counts and print them.
# NOTE(review): the tn, fp, fn, tp unpacking order assumes a 2x2 matrix for
# labels {0, 1} laid out as scikit-learn documents it — confirm.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"TN : {tn}\nFP : {fp}\nFN : {fn}\nTP : {tp}")

TN : 103
FP : 3
FN : 12
TP : 82


In [290]:
# Accuracy on the held-out test set, expressed as a percentage.
test_acc = 100 * accuracy_score(y_test, y_pred)
print(f"Test Accuracy is {test_acc}")

Test Accuracy is 92.5


In [291]:
# Accuracy on the training set, as a percentage, for comparison with the
# test accuracy.
y_pred_train = model.predict(x=x_train)
train_acc = 100 * accuracy_score(y_train, y_pred_train)
print(f"Train Accuracy is {train_acc}")

Train Accuracy is 90.875
