## Two different approaches to Logistic Regression. 
1. ready-to-use scikit-learn model 
2. Built-in one from scratch

In [81]:
# Essential libraries for the project
import numpy as np 
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, precision_score, accuracy_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [67]:
# Load the dataset and perform some basic data preprocessing
data = load_breast_cancer()

df = pd.DataFrame(data.data, columns=data.feature_names)

print(f"Total number of samples: {df.shape[0]}\t Number of Features: {df.shape[1]}")

# convert every feature's dtype from float64 to float32 for more performant memory useage
df = df.astype({col: 'float32' for col in df.select_dtypes(include='float').columns})

Total number of samples: 569	 Number of Features: 30


In [68]:
# separating more important columns as features

X = df[['mean radius', 'mean texture', 'mean smoothness']]
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X,y, train_size=0.8, random_state=42,
)

In [69]:
# First is it done via the predefined and ready-to-use LogisticRegression model from scikit-learn lib
# Then the report is gathered and will eventually compared to out model which is going to be from scratch

model = LogisticRegression()
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [70]:
y_pred_sk = model.predict(X_test)
classification_rep  = classification_report(y_test, y_pred_sk)
precision_scr = precision_score(y_test, y_pred_sk)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92        43
           1       0.94      0.96      0.95        71

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114



In [71]:
# This part is devoted to building a model from scratch.

# logistic (sigmoid) function is used to scale the output of 
# wTx to an interval of [0, 1]. It is a clever choice due to its differentiability
def sigmoid(z):
    return 1 / (1 + np.exp(-z))



# Binary cross-entropy loss
def compute_loss(X, y, w):
    z = X @ w
    predictions = sigmoid(z)
    loss = -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
    return loss


# Compute gradient of the loss weights
def compute_gradient(w, X, y):
    z = X @ w
    predictions = sigmoid(z)
    errors = predictions - y
    gradient = X.T @ errors / len(y)
    return gradient

# Evaluate validation accuracy
def validation_accuracy(w, X_val, y_val):
    probabilities = sigmoid(X_val @ w)
    predictions = (probabilities > 0.5).astype(int)
    accuracy = np.mean(predictions == y_val)
    return accuracy

def gradient_descent(X, y, X_test, y_test, learning_rate = 0.01, n_iters=5000,tolerance=1e-6):
    # Initialize weights
    w = np.zeros(X.shape[1])
    weight_history = [w.copy()]

    loss_history = [compute_loss(X, y, w)]
    validation_history = [validation_accuracy(w, X_test, y_test)]

    for step in range(1, n_iters + 1):
        grad = compute_gradient(w, X, y)
        w -= learning_rate * grad

        # store loss each iteration
        loss = compute_loss(X, y, w)
        loss_history.append(loss)

        # store accuracy on validation data
        validation = validation_accuracy(w, X_test, y_test)
        validation_history.append(validation)

        # store weights every 10 step
        if step % 10 == 0:
            weight_history.append(w.copy())
        
        # Check for convergence
        if np.abs(loss_history[-2] - loss_history[-1]) < tolerance:
            print(f'Converged at step {step}')
            break
        
        # Log progress every 100 steps
        if step % 100 == 0:
            print(f'Step {step}: Loss = {loss:.4f}, Validation Accuracy = {validation:.4f}')
    
    return w, loss_history, validation_history, weight_history


In [75]:
w , loss_history , validation_history, weights_history = gradient_descent(
    X_train, y_train, X_test, y_test
    )

model_accuracy = validation_accuracy(w, X_test, y_test)


Step 100: Loss = 0.5483, Validation Accuracy = 0.8947
Step 200: Loss = 0.4676, Validation Accuracy = 0.8947
Step 300: Loss = 0.4170, Validation Accuracy = 0.8860
Step 400: Loss = 0.3824, Validation Accuracy = 0.8860
Step 500: Loss = 0.3572, Validation Accuracy = 0.8947
Step 600: Loss = 0.3379, Validation Accuracy = 0.8947
Step 700: Loss = 0.3227, Validation Accuracy = 0.9123
Step 800: Loss = 0.3103, Validation Accuracy = 0.9123
Step 900: Loss = 0.3000, Validation Accuracy = 0.9123
Step 1000: Loss = 0.2913, Validation Accuracy = 0.9123
Step 1100: Loss = 0.2839, Validation Accuracy = 0.9123
Step 1200: Loss = 0.2774, Validation Accuracy = 0.9123
Step 1300: Loss = 0.2718, Validation Accuracy = 0.9123
Step 1400: Loss = 0.2668, Validation Accuracy = 0.9123
Step 1500: Loss = 0.2624, Validation Accuracy = 0.9123
Step 1600: Loss = 0.2584, Validation Accuracy = 0.9123
Step 1700: Loss = 0.2548, Validation Accuracy = 0.9123
Step 1800: Loss = 0.2515, Validation Accuracy = 0.9123
Step 1900: Loss = 0

### Final comparison between sklearn's Logistic model and the one we built from scratch

In [80]:
builtin_model_classification_report = model_accuracy
sklearn_model_classification_report = accuracy_score(y_test, y_pred_sk)

print(f"""Accuracy of scikit-learn logistic regression model is: {sklearn_model_classification_report}\n 
Accuracy of built-in logistic regression model is: {builtin_model_classification_report}
      """)

Accuracy of scikit-learn logistic regression model is: 0.9385964912280702
 
Accuracy of built-in logistic regression model is: 0.9298245614035088
      
