In this assignment, you will implement a logistic regression model from scratch for binary classification.

Libraries

In [158]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import math


from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc, log_loss
import matplotlib.pyplot as plt


from sklearn.datasets import load_wine

Pulling Data Set In

In [159]:
# loading wine data in from sklearn datasets
wine = load_wine()

# converting data into pandas df, only including feature columns
df = pd.DataFrame(wine.data, columns=wine.feature_names)

Data Prep

In [160]:
# creating target variable
df['target'] = wine.target

# filtering to only include classes 0 and 1 for binary classification
df_binary = df[df['target'].isin([0, 1])]

df_binary.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [161]:
# getting columns names in list
columns=wine.feature_names

# feature values
X = df_binary[columns].values

# target value
y = df_binary["target"].values

In [162]:
# scaling our features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [163]:
# initiaing list to store performance metrics
performance_metrics = []

Base Class

In [164]:
class Model(ABC):
    @abstractmethod
    def fit(self, X, y):
        pass
    @abstractmethod
    def predict(self, X):
        pass
    @abstractmethod
    def cost_function(self, y_true, y_pred, eps=1e-15):
        pass

log regression

In [165]:
class LogisticRegressionManual(Model):
    def __init__(self, lr, epochs, fit_intercept=True):
        # Initialize coefficients (weights for features) as None
        self.coef_ = None
        # Initialize intercept (bias term) to 0
        self.intercept_ = 0.0
        # Whether to include an intercept in predictions
        self.fit_intercept = fit_intercept
        # Learning rate for gradient descent
        self.lr = lr
        # Number of iterations for training
        self.epochs = epochs
    
    def sigmoid(self, z):
        # Sigmoid function to map any real value to (0, 1) if y > .5, 1 else 0
        # z is linear combination of features and weights
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        #vectorize
        X = np.array(X)
        y = np.array(y)

        # Number of samples and features
        n_samples, n_features = X.shape

        # Initialize coefficients (weights) to zeros
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0.0

        # Gradient descent for specified number of epochs
        for _ in range(self.epochs):
            # Linear model: weighted sum of inputs plus intercept
            # z = X * weights + intercept
            linear_model = np.dot(X, self.coef_) + self.intercept_
            # Apply sigmoid to convert linear scores to predicted probabilities
            y_predicted = self.sigmoid(linear_model)

            # Compute gradients for weights and intercept -- 1/m * X.T * (y_pred - y)
            # greadient wrt to weights
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            # gradient wrt to intercept
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Update weights and intercept using gradients (opposite dir of gradient) and learning rate
            self.coef_ -= self.lr * dw
            self.intercept_ -= self.lr * db
    
    def predict_proba(self, X):
        # predict probabilities using learned coefficients and intercept, returns 0<val<1
        linear_model = np.dot(X, self.coef_) + self.intercept_
        return self.sigmoid(linear_model)

    def predict(self, X, threshold=.5):
        # probabilities to get predicted classes 0 or 1 based on threshold
        # if prob > threshold, class 1 else class 0
        # default threshold is .5
        y_predicted_probs = self.predict_proba(X)
        return np.where(y_predicted_probs >= threshold, 1, 0)
    
    def cost_function(self, y_true, y_pred, eps=1e-15):
        n = len(y_true)
        total_loss = 0.0
        for yt, yp in zip(y_true, y_pred):
            # Clamp predicted probability to avoid log(0)
            yp = min(max(yp, eps), 1 - eps)

            # Correct log loss formula
            total_loss += yt * math.log(yp) + (1 - yt) * math.log(1 - yp)

        return -total_loss / n
     

applying logistric regression class

In [166]:
# Logistic Regression
log_reg = LogisticRegressionManual(
    lr = .05, 
    epochs = 1000, 
    fit_intercept=True
)

#starting timer
start = time.time()

#fitting model
log_reg.fit(X_scaled, y)

#ending timer
end = time.time()

#predicting target variable
y_pred_gd = log_reg.predict(X_scaled)

y_pred_probabilites = log_reg.predict_proba(X_scaled)

#appending perf metrics for model intno perf metric list
performance_metrics.append(
    {
        "Model": "Logistic Regression",
        "Intercept": log_reg.intercept_,
        "Coefficients": log_reg.coef_,
        "Fit Time": end - start,
        "Log Loss": log_reg.cost_function(y, y_pred_probabilites)
    }
)

Sklearn

In [167]:
# scikit-learn logistic regression 
clf = LogisticRegression(fit_intercept=True, random_state=0, max_iter=1000)

# starting timer 
start = time.time()

# fitting model (use training set if you have one)
clf.fit(X_scaled, y)

# ending timer after fit is complete
end = time.time()

# predicting target var (on test set)
y_pred_clf = clf.predict(X_scaled)

# probabilities of each class
y_pred_prob_clf = clf.predict_proba(X_scaled)

# appending perf metrics
performance_metrics.append(
    {
        "Model": "Scikit-learn",
        "Intercept": clf.intercept_.tolist(),
        "Coefficients": clf.coef_.flatten().tolist(),
        "Log Loss": log_loss(y, y_pred_prob_clf),
        "Fit Time": end - start,
    }
)


In [168]:
print(performance_metrics)

[{'Model': 'Logistic Regression', 'Intercept': np.float64(0.23321491706700076), 'Coefficients': array([-1.67277596, -0.52356314, -0.94515328,  1.24233698, -0.28758118,
       -0.10632093, -0.35759491,  0.18441816,  0.20693603, -0.99012793,
        0.14825953, -0.59843979, -1.84743389]), 'Fit Time': 0.016946792602539062, 'Log Loss': np.float64(0.031524299828198994)}, {'Model': 'Scikit-learn', 'Intercept': [0.22595648396817414], 'Coefficients': [-1.5404136220653013, -0.49434902209952236, -0.9703864243666552, 1.2410124970878535, -0.23761342576419145, -0.03573489385890136, -0.32915200014943036, 0.1743875553449435, 0.18672440464621057, -0.7999574680304715, 0.15093769285273284, -0.6275202021200499, -1.8145746167337014], 'Log Loss': 0.03422152101737459, 'Fit Time': 0.013071775436401367}]


In [169]:
import pandas as pd

# convert list of dicts into DataFrame
perf_table = pd.DataFrame(performance_metrics)

# show table
print(perf_table)

# if you want a nicer display in Jupyter
from IPython.display import display
display(perf_table)


                 Model              Intercept  \
0  Logistic Regression               0.233215   
1         Scikit-learn  [0.22595648396817414]   

                                        Coefficients  Fit Time  Log Loss  
0  [-1.6727759642951585, -0.5235631440656101, -0....  0.016947  0.031524  
1  [-1.5404136220653013, -0.49434902209952236, -0...  0.013072  0.034222  


Unnamed: 0,Model,Intercept,Coefficients,Fit Time,Log Loss
0,Logistic Regression,0.233215,"[-1.6727759642951585, -0.5235631440656101, -0....",0.016947,0.031524
1,Scikit-learn,[0.22595648396817414],"[-1.5404136220653013, -0.49434902209952236, -0...",0.013072,0.034222


Analyzing and Comparing Performance

In [170]:
#  Confusion matrix, Accuracy,  Precision, Recall, and ROC curve

In [171]:
# Confusion matrix
cm_manual = confusion_matrix(y, y_pred_gd)
cm_sklearn = confusion_matrix(y, y_pred_clf)

# Accuracy
acc_manual = accuracy_score(y, y_pred_gd)
acc_sklearn = accuracy_score(y, y_pred_clf)

# Precision
prec_manual = precision_score(y, y_pred_gd)
prec_sklearn = precision_score(y, y_pred_clf)

# Recall
recall_manual = recall_score(y, y_pred_gd)
recall_sklearn = recall_score(y, y_pred_clf)


In [172]:
# ROC curve values
fpr_manual, tpr_manual, _ = roc_curve(y, y_pred_probabilites)
fpr_sklearn, tpr_sklearn, _ = roc_curve(y, y_pred_prob_clf)

# AUC (area under curve)
roc_auc_manual = auc(fpr_manual, tpr_manual)
roc_auc_sklearn = auc(fpr_sklearn, tpr_sklearn)

# Plot
plt.figure(figsize=(6,6))
plt.plot(fpr_manual, tpr_manual, label=f'Manual (AUC = {roc_auc_manual:.3f})')
plt.plot(fpr_sklearn, tpr_sklearn, label=f'Scikit (AUC = {roc_auc_sklearn:.3f})')
plt.plot([0,1], [0,1], 'k--')  # baseline
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


ValueError: y should be a 1d array, got an array of shape (130, 2) instead.

In [None]:
print("Manual Logistic Regression")
print("Confusion Matrix:\n", cm_manual)
print(f"Accuracy: {acc_manual:.3f}")
print(f"Precision: {prec_manual:.3f}")
print(f"Recall: {recall_manual:.3f}")

print("\nScikit-learn Logistic Regression")
print("Confusion Matrix:\n", cm_sklearn)
print(f"Accuracy: {acc_sklearn:.3f}")
print(f"Precision: {prec_sklearn:.3f}")
print(f"Recall: {recall_sklearn:.3f}")

