In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial.distance as sp_dist
import seaborn

In [2]:
# Default canvas for every figure: 60 x 30 (matplotlib measures figsize in inches).
plt.rcParams.update({"figure.figsize": (60, 30)})

# Seaborn theme: scale fonts by 2.5 first, then switch to the white-grid style.
# These calls stay after the rcParams change, in their original order.
seaborn.set(font_scale=2.5)
seaborn.set_style("whitegrid")

In [3]:
# Number of nearest neighbours that vote on each prediction
# (passed to knn() as num_neighbors further down).
K = 5

In [4]:
# Seed for the row shuffle so a fresh "Restart & Run All" yields the same frame.
SEED = 42

# Location of the raw data, unchanged from the original cell.
# TODO: make this relative to a configurable data directory instead of a home-dir path.
CANCER_CSV = "~/Documents/github/classification-exercises/k_nearest_neighbors/BreastCancer.csv"

# Column names imposed on the file; header=0 below discards the file's own header row.
CANCER_COLUMNS = [
    "id", "diag", "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
]

cancer = (
    pd.read_csv(CANCER_CSV, header=0, names=CANCER_COLUMNS)
    .sample(frac=1, random_state=SEED)      # shuffle all rows, reproducibly
    .dropna(axis=0)                         # discard rows with any missing value
    .drop(columns=["id"])                   # the identifier is not used as a feature
    .replace({"diag": {"M": 1, "B": 0}})    # encode the label: "M" -> 1, "B" -> 0
    # replace() may leave the column as object dtype (its implicit downcast is
    # deprecated in recent pandas); knn() feeds the labels to np.bincount, which
    # needs integers, so make the dtype explicit.
    .astype({"diag": "int64"})
)

# Feature matrix (every column except the label) and label vector.
X = cancer.drop(columns=["diag"]).to_numpy()
y = cancer["diag"].to_numpy()

# (n_rows, n_features); reused by the train/test split cell.
dim_ = X.shape
print(f"Array shape: {dim_}")

Array shape: (567, 10)


In [5]:
# Fraction of rows assigned to the training split.
pct_train = 0.80

# Dedicated seeded generator so the split is identical on every fresh run.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

n_rows = X.shape[0]
n_train = round(pct_train * n_rows)

# A single permutation of the row positions gives two disjoint index sets in
# O(n), instead of testing `i not in idx_train` (a linear scan) for every row.
shuffled_rows = split_rng.permutation(n_rows)
idx_train = shuffled_rows[:n_train]
# Sorted so the test rows keep their order of appearance in X, as before.
idx_test = np.sort(shuffled_rows[n_train:])

Xtrain = X[idx_train]
Xtest = X[idx_test]
ytrain = y[idx_train]
ytest = y[idx_test]

In [6]:
# Label vectors to summarise: the full data set and each split.
list_targets = [("Complete", y), ("Train", ytrain), ("Test", ytest)]

for name_, array_ in list_targets:
    # Distinct label values and how often each occurs.
    labels, counts = np.unique(array_, return_counts=True)

    # Build the frame column-wise so labels and counts each keep their own
    # dtype (stacking them into one 2-D block and transposing forces a common one).
    df_counts = pd.DataFrame({"Target": labels, "Count": counts})
    df_counts["Percent"] = df_counts["Count"] / df_counts["Count"].sum()
    print("{0} target value count:\n{1}\n".format(name_, df_counts))

Complete target value count:
   Target  Count  Percent
0       0    357  0.62963
1       1    210  0.37037

Train target value count:
   Target  Count   Percent
0       0    287  0.632159
1       1    167  0.367841

Test target value count:
   Target  Count   Percent
0       0     70  0.619469
1       1     43  0.380531



In [7]:
def knn(X, y, Xtest, num_neighbors):
    """
    Classify each row of ``Xtest`` by majority vote among its nearest training rows.

    Distances are Euclidean (Minkowski with ``p=2``), computed between every
    test row and every training row.

    Parameters
    ----------
    X : 2d-array, shape (n_train, n_features)
        the training feature matrix
    y : 1d array-like, length n_train
        the training labels; any values ``np.unique`` can sort
        (integers, strings, ...)
    Xtest : 2d-array, shape (n_test, n_features)
        the rows for which predictions are sought
    num_neighbors : integer
        the number of neighbors used to generate the predictions, must be >= 1;
        values larger than n_train simply use every training row

    Returns
    -------
    array
        the predicted labels, one per row of Xtest, with the dtype of ``y``.
        A tied vote goes to the smallest label in sorted order.

    Raises
    ------
    ValueError
        if ``num_neighbors`` is smaller than 1, if ``y`` is not one-dimensional
        with one label per training row, or if there are no training rows.
    """
    # A non-positive k would otherwise slice "all but the last |k|" neighbours
    # (negative) or nothing at all (zero) instead of failing loudly.
    if num_neighbors < 1:
        raise ValueError(
            "num_neighbors must be >= 1, got {0!r}".format(num_neighbors))

    y = np.asarray(y)
    dists = sp_dist.cdist(Xtest, X, "minkowski", p=2)

    n_train = dists.shape[1]
    if n_train == 0:
        raise ValueError("X must contain at least one training row")
    if y.ndim != 1 or y.shape[0] != n_train:
        raise ValueError(
            "y must be 1-d with one label per row of X "
            "(expected length {0}, got shape {1})".format(n_train, y.shape))

    # Encode labels as 0..n_classes-1 so the vote works for any label type,
    # not only the non-negative integers np.bincount accepts directly.
    classes, codes = np.unique(y, return_inverse=True)

    # Row-wise sort of all distances at once; the stable sort breaks distance
    # ties in favour of the lower training index.
    nearest = np.argsort(dists, axis=1, kind="stable")[:, :num_neighbors]

    # argmax returns the first maximum, i.e. the smallest label on a tied vote.
    winners = [
        np.bincount(neighbor_codes, minlength=classes.size).argmax()
        for neighbor_codes in codes[nearest]
    ]
    return classes[np.asarray(winners, dtype=np.intp)]

In [8]:
# Predict a label for every row of the test split, using the training split
# as the reference set and K neighbours per vote.
ypred = knn(X=Xtrain, y=ytrain, Xtest=Xtest, num_neighbors=K)

In [9]:
# Boolean masks over the test rows: actual vs. predicted label, for 1 and for 0.
actual_pos, actual_neg = (ytest == 1), (ytest == 0)
pred_pos, pred_neg = (ypred == 1), (ypred == 0)

# Each cell of the confusion matrix is the number of rows where both masks hold.
tp = int((actual_pos & pred_pos).sum())
fp = int((actual_neg & pred_pos).sum())
fn = int((actual_pos & pred_neg).sum())
tn = int((actual_neg & pred_neg).sum())

# Same text as before, spelled line by line so the blank-looking lines
# (four spaces) and the trailing space after the first row stay visible.
confusion_report = (
    "\n"
    "    Confusion Matrix\n"
    "    \n"
    "    Rows: Actuals, Columns: Predictions\n"
    "    \n"
    f"    [[tn:{tn}, fp:{fp}], \n"
    f"     [fn:{fn}, tp:{tp}]]\n"
    "    "
)
print(confusion_report)


    Confusion Matrix
    
    Rows: Actuals, Columns: Predictions
    
    [[tn:68, fp:2], 
     [fn:5, tp:38]]
    


In [10]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Total number of scored test rows.
n_scored = tp + fp + fn + tn

# Unrounded accuracy, computed once; the error rate is its complement.
accuracy = _ratio(tp + tn, n_scored)

# Metric name -> unrounded score, in display order. A metric whose
# denominator is zero (e.g. no actual negatives for FPR) is reported as NaN
# instead of raising ZeroDivisionError.
metric_scores = {
    "Accuracy": accuracy,
    "ErrorRate": 1 - accuracy,
    "F1": _ratio(2 * tp, (2 * tp) + fp + fn),
    "FPR": _ratio(fp, fp + tn),
    "FNR": _ratio(fn, fn + tp),
}

metrics = pd.DataFrame({
    "Metric": list(metric_scores),
    "Score": [round(score, 2) for score in metric_scores.values()],
})
metrics

Unnamed: 0,Metric,Score
0,Accurary,0.94
1,ErrorRate,0.06
2,F1,0.92
3,FPR,0.03
4,FNR,0.12
