In [1]:
# Methods to be used repeatedly
def Verify(expression: bool, message: str):
    """Raise a generic ``Exception`` carrying ``message`` when ``expression`` is falsy.

    Returns ``None`` without doing anything when the check passes.
    """
    if expression:
        return
    raise Exception(message)
    
def report_missing_features(X):
    """Print, for every row of ``X`` that has missing values, how many are missing.

    ``X`` must offer the pandas ``isna()`` interface. Rows without any missing
    value are left out of the printed report. Nothing is returned.
    """
    missing_per_row = X.isna().sum(axis=1)
    rows_with_gaps = missing_per_row[missing_per_row > 0]
    print("Entry index | Number of missing features")
    print(rows_with_gaps)



In [2]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np


### Step 1: obtain the data and remove rows with missing features. The cleaned data is stored in two objects of type `dataset`.
class dataset:
    """Plain container bundling a feature matrix, its targets and the feature names."""

    def __init__(self, features : np.ndarray, targets : np.ndarray, var_info : np.ndarray):
        # The three arrays are stored by reference; nothing is copied or validated here.
        self.features, self.targets, self.var_info = features, targets, var_info

# Fetch the dataset used for age-group prediction (UCI repository id 887).
national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset = fetch_ucirepo(id=887) 

X = national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset.data.features
missing_values = X.isna().sum().sum()
# Report whenever at least one value is missing; a `> 1` threshold would
# silently skip the report for a dataset with exactly one missing value.
if missing_values > 0:
    print("Report of missing features for age prediction:")
    report_missing_features(X)
print("Number of missing values in the age prediction dataset: " + str(missing_values))
# Remember which rows are incomplete so the matching targets can be dropped too.
nan_indices = X[X.isna().any(axis=1)].index
X = X.dropna().to_numpy()

y = national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset.data.targets
y = y.drop(index=nan_indices)
y = y.to_numpy()
# Only the two known class labels are accepted before encoding them as 0/1.
if not np.all((y == "Adult") | (y == "Senior")):
    raise ValueError("Array contains an entry that is not 'Adult' or 'Senior'")
y = np.where(y == "Senior", 1, 0)  # "Senior" -> 1 (positive), "Adult" -> 0

Verify(len(X) == len(y), "Features and targets are different lengths.")
var_info = np.array(["Respondent Gender", "Activity", "BMI", "Blood Glucose", "Diabetic", "Oral", "Blood Insulin"])
ageDataset = dataset(X, y, var_info)






# Fetch the dataset used for breast cancer detection (UCI repository id 15).
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)  

X = breast_cancer_wisconsin_original.data.features
missing_values = X.isna().sum().sum()
# Report whenever at least one value is missing; a `> 1` threshold would
# silently skip the report for a dataset with exactly one missing value.
if missing_values > 0:
    print("Report of missing features for breast cancer detection:")
    report_missing_features(X)
print("Number of missing values in the breast cancer prediction dataset: " + str(missing_values))
# Remember which rows are incomplete so the matching targets can be dropped too.
nan_indices = X[X.isna().any(axis=1)].index
X = X.dropna().to_numpy()

y = breast_cancer_wisconsin_original.data.targets
y = y.drop(index=nan_indices)
y = y.to_numpy()
# Only the two known class codes are accepted before encoding them as 0/1.
if not np.all((y == 4) | (y == 2)):
    raise ValueError("Array contains an entry that is not 4 or 2")
y = np.where(y == 4, 1, 0)  # code 4 -> 1 (positive), code 2 -> 0


Verify(len(X) == len(y), "Features and targets are different lengths.")
var_info = np.array(["Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses"])
breastDataset = dataset(X,y,var_info)

print("\n\n Datasets cleaned.")

Number of missing values in the age prediction dataset: 0
Report of missing features for breast cancer detection:
Entry index | Number of missing features
23     1
40     1
139    1
145    1
158    1
164    1
235    1
249    1
275    1
292    1
294    1
297    1
315    1
321    1
411    1
617    1
dtype: int64
Number of missing values in the breast cancer prediction dataset: 16


 Datasets cleaned.


In [3]:
### Step 2: compute the mean of each feature separately for the positive and the negative class
### The means are computed with NumPy and then tabulated with pandas
import matplotlib.pyplot as plt

# Age dataset: per-feature mean within each class and the squared gap between them
targets = ageDataset.targets.squeeze()
x = ageDataset.features.copy()
positive_x, negative_x = x[targets == 1], x[targets == 0]
means_positive = positive_x.mean(axis=0)
means_negative = negative_x.mean(axis=0)
squared_difference = np.power(means_positive - means_negative, 2)

# One table row per feature, labelled with the feature names
mean_values_df = pd.DataFrame(
    {
        'Mean Positive': means_positive,
        'Mean Negative': means_negative,
        'Squared Difference': squared_difference,
    },
    index=ageDataset.var_info,
)
print(mean_values_df)


# Breast cancer dataset: per-feature mean within each class and the squared gap between them
targets = breastDataset.targets.squeeze()
x = breastDataset.features.copy()
positive_x, negative_x = x[targets == 1], x[targets == 0]
means_positive = positive_x.mean(axis=0)
means_negative = negative_x.mean(axis=0)
squared_difference = np.power(means_positive - means_negative, 2)

# One table row per feature, labelled with the feature names
mean_values_df = pd.DataFrame(
    {
        'Mean Positive': means_positive,
        'Mean Negative': means_negative,
        'Squared Difference': squared_difference,
    },
    index=breastDataset.var_info,
)
print(mean_values_df)

                   Mean Positive  Mean Negative  Squared Difference
Respondent Gender       1.508242       1.512017            0.000014
Activity                1.909341       1.806165            0.010645
BMI                    27.886264      27.968286            0.006728
Blood Glucose         104.329670      98.644723           32.318625
Diabetic                2.027473       2.014107            0.000179
Oral                  141.208791     109.990596          974.575736
Blood Insulin          10.405247      12.106661            2.894810
                             Mean Positive  Mean Negative  Squared Difference
Clump Thickness                   7.188285       2.963964           17.844884
Uniformity of Cell Size           6.577406       1.306306           27.784490
Uniformity of Cell Shape          6.560669       1.414414           26.483941
Marginal Adhesion                 5.585774       1.346847           17.968504
Single Epithelial Cell Size       5.326360       2.108108         

In [5]:
### Step 3 is the implementation of the KNN model
class KNN:
    """K-nearest-neighbours classifier for binary (0/1) targets.

    Parameters
    ----------
    distanceFunctor : callable
        Called as ``distanceFunctor(input_vector, feature_matrix)``; must return
        one distance per row of ``feature_matrix``.
    k : int
        Number of neighbours consulted for each prediction.
    weighted : bool
        When true, each neighbour's vote is weighted by the inverse of its
        distance to the query; otherwise every neighbour counts equally.
    normalize : bool
        When true, features are min-max scaled with the training minima/maxima.
    """

    # Added to a distance before inverting it so an exact match cannot divide by zero.
    _EPSILON = 1e-10

    def __init__(self, distanceFunctor, k=1, weighted=True, normalize=True):
        self.f = distanceFunctor
        self.k = k
        # Store the caller's choices rather than a constant, so that
        # weighted=False / normalize=False actually take effect.
        self.weighted = weighted
        self.normalize = normalize

    def _scale(self, values):
        """Min-max scale ``values`` with the minima/maxima recorded by ``fit``."""
        ranges = self.maxs - self.mins
        # A constant feature has range 0; dividing by 1 instead maps it to 0
        # rather than producing NaN/inf that would poison every distance.
        safeRanges = np.where(ranges == 0, 1, ranges)
        return (values - self.mins)/safeRanges

    def fit(self, features : np.ndarray, targets : np.ndarray):
        """Memorise the training set (scaled when ``normalize`` is enabled)."""
        if self.normalize:
            self.mins = np.min(features, axis = 0)
            self.maxs = np.max(features, axis = 0)
            self.features = self._scale(features)
        else:
            self.features = features
        self.targets = targets

    # Returns a single label score -> only works for binary classification (0,1).
    # The returned value is the (optionally distance-weighted) share of the
    # k nearest neighbours whose label is 1, i.e. an estimate of P(label == 1).
    def predict(self, input:np.ndarray):
        """Score one sample; ``input`` must hold exactly one value per feature.

        Raises ``Exception`` (through ``Verify``) when the input size does not
        match the training data, when ``k`` is below 1, or when the model holds
        no training samples.
        """
        Verify(input.size == self.features.shape[1], "Improper size of input vector during inferencing.")
        Verify(self.k >= 1, "k must be at least 1.")
        queryVector = self._scale(input) if self.normalize else input
        # One distance per training row.
        distances = self.f(queryVector, self.features)
        Verify(len(distances) > 0, "Model has no training samples.")

        # argpartition(kth) needs kth < n, so select with kth = count - 1 and
        # never ask for more neighbours than there are training rows.
        neighbourCount = min(self.k, len(distances))
        smallestIndices = np.argpartition(distances, neighbourCount - 1)[:neighbourCount]

        # Weighted average of the neighbours' labels.
        weightedVotes = 0.0
        sumOfWeights = 0.0
        for index in smallestIndices:
            currentWeight = 1.0
            if self.weighted:
                # Reuse the distance already computed for this neighbour.
                currentWeight = 1/(distances[index] + self._EPSILON)
            sumOfWeights += currentWeight
            weightedVotes += currentWeight * self.targets[index]
        return weightedVotes/sumOfWeights

                
# Our functor for computing the distance between an input vector and every training row
class EuclideanDistance:
    """Callable returning the Euclidean distance from a vector to each matrix row."""

    def __call__(self, input_vector : np.ndarray, feature_matrix : np.ndarray):
        # Broadcasting subtracts the vector from every row in one vectorized step.
        offsets = feature_matrix - input_vector
        squared_lengths = np.sum(offsets**2, axis=1)
        return np.sqrt(squared_lengths)
    

def evaluate_acc(trueLabels, predictedLabels):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened first, so a column vector of shape ``(n, 1)`` and
    a flat vector of shape ``(n,)`` are compared entry by entry instead of being
    broadcast into an ``(n, n)`` comparison, and plain lists are compared
    element-wise.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of entries.

    Returns ``nan`` when both inputs are empty (accuracy is undefined).
    """
    flatTrue = np.asarray(trueLabels).ravel()
    flatPredicted = np.asarray(predictedLabels).ravel()
    if flatTrue.size != flatPredicted.size:
        raise ValueError("trueLabels and predictedLabels must contain the same number of entries.")
    if flatTrue.size == 0:
        return float("nan")
    correctPredictions = np.sum(flatTrue == flatPredicted)
    return correctPredictions/flatTrue.size

def SplitTrainTest(x:np.ndarray, y:np.ndarray, splitRatio = 0.8, seed = 420):
    """Shuffle ``x`` and ``y`` together and split them into train and test parts.

    Parameters
    ----------
    x, y : np.ndarray
        Features and targets; they must have the same number of rows.
    splitRatio : float
        Fraction of rows (between 0 and 1) placed in the training part.
    seed : int or None
        Seed passed to ``np.random.seed`` before shuffling. Note that this
        reseeds NumPy's global random state. The default reproduces the split
        this function has always produced; pass ``None`` to leave the global
        state untouched and obtain a non-reproducible shuffle.

    Returns
    -------
    x_train, y_train, x_test, y_test

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different row counts or ``splitRatio`` is
        outside ``[0, 1]``.
    """
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same number of rows.")
    if not 0 <= splitRatio <= 1:
        raise ValueError("splitRatio must be between 0 and 1.")

    if seed is not None:
        np.random.seed(seed)
    indices = np.arange(x.shape[0])
    np.random.shuffle(indices)

    # Index both arrays with the same permutation so rows stay paired.
    x_shuffled = x[indices]
    y_shuffled = y[indices]
    train_size = int(x.shape[0] * splitRatio)

    x_train, x_test = x_shuffled[:train_size], x_shuffled[train_size:]
    y_train, y_test = y_shuffled[:train_size], y_shuffled[train_size:]
    return x_train, y_train, x_test, y_test


def TestKNN(threshold = 0.5):
    """Smoke-test the KNN model on the breast cancer dataset.

    Splits the data once with ``SplitTrainTest`` and, for every k from 1 to 20,
    fits a ``KNN`` using ``EuclideanDistance``, scores each test row, turns the
    scores into 0/1 predictions with ``threshold`` and prints the accuracy.
    """
    x_train, y_train, x_test, y_test = SplitTrainTest(breastDataset.features, breastDataset.targets)

    for k in range(1, 21):
        model = KNN(EuclideanDistance(), k=k)
        model.fit(x_train, y_train)
        # Each test row is handed to predict as a single-row 2-D array.
        y_pred_prob = np.array([model.predict(row.reshape(1, -1)) for row in x_test])
        # Scores strictly above the threshold become class 1, everything else class 0.
        y_pred = np.where(y_pred_prob > threshold, 1, 0)
        accuracy = evaluate_acc(y_test, y_pred)
        print(f"KNN accuracy for k = {k}: {accuracy*100:.2f}%")

TestKNN()







Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 1078, in _pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch
  File "_pydevd_bundle/pydevd_cython.pyx", line 297, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "/Users/zacharygurberg/opt/anaconda3/envs/GrITPythonEnv/lib/python3.9/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 1976, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
  File "/Users/zacharygurberg/opt/anaconda3/envs/GrITPythonEnv/lib/python3.9/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2011, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


KeyboardInterrupt: 