In [12]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target  # 1 = malignant, 0 = benign



In [13]:
from sklearn.model_selection import train_test_split

# Split dataset: 60% train, 20% validation, 20% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Report dataset sizes
print(f"Train set size: {len(y_train)} (Positive: {sum(y_train)}, Negative: {len(y_train)-sum(y_train)})")
print(f"Validation set size: {len(y_val)} (Positive: {sum(y_val)}, Negative: {len(y_val)-sum(y_val)})")
print(f"Test set size: {len(y_test)} (Positive: {sum(y_test)}, Negative: {len(y_test)-sum(y_test)})")


Train set size: 341 (Positive: 214, Negative: 127)
Validation set size: 114 (Positive: 72, Negative: 42)
Test set size: 114 (Positive: 71, Negative: 43)


In [9]:
class LogisticRegressionSGD:
    def __init__(self, learning_rate=0.01, batch_size=32, max_iters=1000):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iters = max_iters
        self.weights = None  # Model parameters

    def sigmoid(self, z):
        #z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def compute_loss(self, y_true, y_pred):
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def compute_gradient(self, X_batch, y_batch):
        N = X_batch.shape[0]
        y_pred = self.sigmoid(X_batch @ self.weights)
        return (1 / N) * X_batch.T @ (y_pred - y_batch)

    def fit(self, X, y):
        N, D = X.shape
        self.weights = np.random.randn(D) * 0.01  # Initialize weights with Gaussian distribution

        for _ in range(self.max_iters):
            indices = np.random.choice(N, self.batch_size, replace=False)
            X_batch, y_batch = X[indices], y[indices]

            gradient = self.compute_gradient(X_batch, y_batch)
            self.weights -= self.learning_rate * gradient

    def predict_proba(self, X):
        return self.sigmoid(X @ self.weights)

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)


In [10]:
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [16, 32, 64]

best_model = None
best_accuracy = 0

for lr in learning_rates:
    for batch_size in batch_sizes:
        print(f"Training model with learning_rate={lr}, batch_size={batch_size}")
        
        model = LogisticRegressionSGD(learning_rate=lr, batch_size=batch_size, max_iters=1000)
        model.fit(X_train, y_train)

        y_val_pred = model.predict(X_val)
        accuracy = np.mean(y_val_pred == y_val)

        print(f"Validation Accuracy: {accuracy:.4f}")
        
        # Save the best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model


Training model with learning_rate=0.001, batch_size=16
Validation Accuracy: 0.8860
Training model with learning_rate=0.001, batch_size=32
Validation Accuracy: 0.7368
Training model with learning_rate=0.001, batch_size=64
Validation Accuracy: 0.7982
Training model with learning_rate=0.01, batch_size=16
Validation Accuracy: 0.8772
Training model with learning_rate=0.01, batch_size=32
Validation Accuracy: 0.8772
Training model with learning_rate=0.01, batch_size=64
Validation Accuracy: 0.9123
Training model with learning_rate=0.1, batch_size=16
Validation Accuracy: 0.8947
Training model with learning_rate=0.1, batch_size=32
Validation Accuracy: 0.8860
Training model with learning_rate=0.1, batch_size=64
Validation Accuracy: 0.9123


  return 1 / (1 + np.exp(-z))


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test model on the test set
y_test_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"\nTest Set Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")



Test Set Performance:
Accuracy: 0.9123
Precision: 0.9067
Recall: 0.9577
F1-score: 0.9315


  return 1 / (1 + np.exp(-z))


Summary: 
The logistic regression model demonstrated strong classification performance on the test set, achieving an accuracy of 91.23%, indicating overall reliability. With a precision of 90.67%, the model correctly identified malignant cases most of the time, though it occasionally misclassified benign cases as malignant (false positives). The recall of 95.77% shows that the model is highly sensitive, successfully detecting the majority of actual malignant cases, which is crucial in medical diagnosis. The F1-score of 93.15% confirms a well-balanced trade-off between precision and recall. While the model performs well, its slightly lower precision suggests that it could be improved by adjusting the decision threshold, applying regularization, or fine-tuning hyperparameters to reduce false positives. Additionally, the overflow encountered by the sigmoid function should be fixed by limiting z or using a numerically stable sigmoid. 