In [203]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the raw table from disk.
df = pd.read_csv("data/Cancer_data.csv")

# Number of missing entries per column. Kept for interactive inspection, e.g.
# `missing_values[missing_values > 0]` lists only the columns with gaps.
missing_values = df.isna().sum()

# Encode the diagnosis label as a binary integer: benign "B" -> 0, malignant "M" -> 1.
# Any other label is mapped to NaN by `Series.map`.
df["diagnosis"] = df["diagnosis"].map({"B": 0, "M": 1})

# Target vector, taken before the label column is removed from the feature table.
y = np.array(df["diagnosis"])

# Keep only the feature columns: drop the target itself plus the two columns
# that are not used as predictors ("id" and "Unnamed: 32").
# NOTE(review): "Unnamed: 32" is presumably an empty artefact column of the CSV — confirm.
df = df.drop(columns=["Unnamed: 32", "diagnosis", "id"])

# Feature matrix as a plain NumPy array (one row per sample).
X = np.array(df)

# NOTE: Might be worth evening out class imbalances, currently it is 357 to 212


In [205]:
class BaseLogisticRegression:
    """Binary logistic regression fitted with full-batch gradient steps.

    A column of ones is prepended to the inputs, so ``weights[0]`` plays the
    role of the intercept. Training starts from all-zero weights and performs
    ``num_iterations`` updates, each moving the weights by ``learning_rate``
    times the gradient of the mean log-likelihood.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size of each weight update.
    num_iterations : int, default=1000
        Number of full-batch updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features + 1,) or None
        Learned coefficients (intercept first); ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def sigmoid(self, x):
        """Numerically stable logistic function ``1 / (1 + exp(-x))``.

        Only ``exp(-|x|)`` is ever evaluated, so large-magnitude inputs cannot
        overflow (the naive form overflows for very negative ``x``).
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1]
        out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # `[()]` turns a 0-d result back into a NumPy scalar and is a no-op
        # for arrays, so scalar inputs still yield scalar outputs.
        return out[()]

    def fit(self, X, y):
        """Learn the weights from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary targets encoded as 0 / 1.

        Returns
        -------
        self : BaseLogisticRegression
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = self._add_intercept(X)
        # Flatten so a column-vector target cannot broadcast `y - p` to (n, n).
        y = np.asarray(y, dtype=float).ravel()

        num_samples, num_features = X.shape
        if y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but y has {y.shape[0]}."
            )

        self.weights = np.zeros(num_features)
        for _ in range(self.num_iterations):
            p = self.sigmoid(X @ self.weights)
            # Gradient of the mean log-likelihood; adding it (ascent) is the
            # same as descending the mean log-loss.
            gradient = X.T @ (y - p) / num_samples
            self.weights = self.weights + self.learning_rate * gradient
        return self

    def predict(self, X, threshold=0.5):
        """Classify samples by thresholding the predicted probability.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        threshold : float, default=0.5
            A sample is labelled positive when its probability is strictly
            greater than this value.

        Returns
        -------
        numpy.ndarray of bool, shape (n_samples,)
            ``True`` for the positive class, ``False`` otherwise.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.weights is None:
            raise ValueError(
                "This BaseLogisticRegression instance is not fitted yet; "
                "call fit() before predict()."
            )
        X = self._add_intercept(X)
        return self.sigmoid(X @ self.weights) > threshold


In [None]:
def get_kfold_accuracy(K, model, X, y, random_state=None, stratified=False):
    """Estimate a classifier's accuracy with shuffled K-fold cross-validation.

    For every fold the model is re-fitted on the training part and its
    misclassification rate is measured on the held-out part; the per-fold
    rates are then averaged with equal weight.

    Parameters
    ----------
    K : int
        Number of folds.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is re-fitted on every fold.
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.
    random_state : int, RandomState instance or None, default=None
        Forwarded to the splitter. Pass an integer to obtain the same folds on
        every call, e.g. to compare several models on identical splits.
    stratified : bool, default=False
        If True, use ``StratifiedKFold`` so each fold keeps the class
        proportions of ``y``; otherwise use plain ``KFold``.

    Returns
    -------
    accuracy : float
        ``1 - error``.
    error : float
        Mean misclassification rate over the K folds.
    """
    # Accept any array-like (lists, DataFrames, ...) for positional indexing.
    X = np.asarray(X)
    y = np.asarray(y)

    splitter_cls = (
        model_selection.StratifiedKFold if stratified else model_selection.KFold
    )
    cv = splitter_cls(n_splits=K, shuffle=True, random_state=random_state)

    fold_errors = np.zeros(K)
    # `y` is passed to split() because the stratified splitter needs it.
    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        model.fit(X[train_index], y[train_index])
        y_est = np.asarray(model.predict(X[test_index]))
        # Fraction of held-out samples whose prediction differs from the label.
        fold_errors[fold] = np.mean(y_est != y[test_index])

    error = fold_errors.mean()
    accuracy = 1 - error
    return accuracy, error

In [211]:
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# which can hide real problems (presumably overflow warnings from np.exp in the
# hand-written model and/or solver warnings from scikit-learn — confirm, then
# narrow the filter to those categories).
warnings.filterwarnings("ignore")

# The cross-validation folds are shuffled with NumPy's global RNG (the KFold
# splitter is created without a fixed random_state). Re-seeding with the same
# value before each evaluation makes both models see identical folds and makes
# the printed numbers reproducible across runs.
CV_SEED = 0

np.random.seed(CV_SEED)
accuracy1, error1 = get_kfold_accuracy(10, BaseLogisticRegression(num_iterations=1000), X, y)

np.random.seed(CV_SEED)
accuracy2, error2 = get_kfold_accuracy(10, LogisticRegression(max_iter=1000), X, y)

# Hand-written model first, scikit-learn reference second.
print(accuracy1)
print(error1)
print(accuracy2)
print(error2)


0.9069235588972431
0.0930764411027569
0.9525375939849624
0.04746240601503759
