In [258]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from scipy import stats

In [None]:
df = pd.read_csv("data/Cancer_data.csv")

missing_values = df.isna().sum()
# print(f"Missing values:\n{missing_values[missing_values > 0]}")  # Only prints columns with missing values

# Transform feature from string to binary. Benign is now '0', and Malignant is '1'.
df["diagnosis"] = df["diagnosis"].map({"B": 0, "M": 1})

# print(df.iloc[0])

# Select target attribute
y = df["diagnosis"]

# Drop useless features
df = df.drop(["Unnamed: 32", "diagnosis", "id"], axis=1)
# print(df.columns)

X = np.array(df)
y = np.array(y)

# NOTE: Might be worth evening out class imbalances, currently it is 357 to 212
# NOTE: Consider other model performance measurements than simply missclassification rate and accuracy
# NOTE: Possible models: Logistic Regression (Done), KNN(Done), SVM(Not done), NN(Not done), Random Forests(Not done)

In [305]:
class BaseLogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations=num_iterations
        self.weights = None

    def sigmoid(self, x):
        return 1 / (1+np.exp(-x))

    def fit(self, X, y):
        X = np.hstack([np.ones([X.shape[0], 1]), X])

        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        for _ in range(self.num_iterations):
            p = self.sigmoid(X @ self.weights)
            lossGradient = X.T @ (y - p) / num_samples
            self.weights = self.weights + self.learning_rate * (lossGradient)

    def predict(self, X, threshold=0.5):
        X = np.hstack([np.ones([X.shape[0], 1]), X])

        linear_combination = X @ self.weights
        activated = self.sigmoid(linear_combination)
        prediction = activated > threshold
        return prediction
    
class BaseKNN:
    def __init__(self, X, y, K=5, distanceMeasure="Euclidean"):
        self.K = K
        self.X = X
        self.y = y
        self.distanceMeasure = distanceMeasure

    def fit(self, X, y):
        self.X = X
        self.y = y

    def getDistance(self, x1, x2):
        if self.distanceMeasure == "Euclidean":
            # distance = np.sqrt(np.sum((x1 - x2)**2))
            distance = np.linalg.norm(x1 - x2)
        return distance

    def get_K_NN(self, x):
        distances = np.array([self.getDistance(row, x) for row in self.X])
        K_neighbours_indices = np.argsort(distances)[:self.K]
        NN_classes = self.y[K_neighbours_indices]
        
        return NN_classes
    
    def predict(self, X):
        predictions = [stats.mode(self.get_K_NN(x), keepdims=False).mode for x in X]
        return np.array(predictions)
        # K_NN = self.get_K_NN(x)
        # # print(K_NN.shape)
        # return stats.mode(K_NN)

In [306]:
def get_kfold_accuracy(K, model, X, y):
    CV = model_selection.KFold(n_splits=K, shuffle=True)

    model_errors = np.zeros(K)

    i = 0
    for train_index, test_index in CV.split(X):
        # extract training and test set for current CV fold
        X_train, y_train = X[train_index, :], y[train_index]
        X_test, y_test = X[test_index, :], y[test_index]

        
        model.fit(X_train, y_train)

        y_est = model.predict(X_test)
        

        missclass_rate = np.sum(y_est != y_test) / len(y_test)

        model_errors[i] = missclass_rate
        i += 1

    error = model_errors.mean()
    accuracy = 1 - error
    return accuracy, error

In [307]:
import warnings 
warnings.filterwarnings("ignore")
# Compare base Logistic Regression to task Logistic Regression through K-fold cross validation
base_accuracy, base_error = get_kfold_accuracy(10, BaseLogisticRegression(num_iterations=1000), X, y)
nonbase_accuracy, nonbase_error = get_kfold_accuracy(10, LogisticRegression(max_iter=1000), X, y)

print(f"Base model accuracy: {base_accuracy}\nBase model error: {base_error}\n")
print(f"Nonbase model accuracy: {nonbase_accuracy}\nNonbase model error: {nonbase_error}\n")


Base model accuracy: 0.9102756892230577
Base model error: 0.08972431077694235

Nonbase model accuracy: 0.9526315789473684
Nonbase model error: 0.047368421052631574



In [308]:
# Compare base KNN to task KNN through K-fold cross validation
base_accuracy, base_error = get_kfold_accuracy(10, BaseKNN(X, y, 5), X, y)
nonbase_accuracy, nonbase_error = get_kfold_accuracy(10, KNeighborsClassifier(n_neighbors=5), X, y)

print(f"Base model accuracy: {base_accuracy}\nBase model error: {base_error}\n")
print(f"Nonbase model accuracy: {nonbase_accuracy}\nNonbase model error: {nonbase_error}\n")

Base model accuracy: 0.9314849624060151
Base model error: 0.06851503759398496

Nonbase model accuracy: 0.9278195488721804
Nonbase model error: 0.07218045112781954

