In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import random

import matplotlib.pyplot as plt

# Part 1

In [None]:
# Read the CSV, rename the `num` column to `disease`, and cap every
# `disease` value at 1 (values <= 1 are kept as they are).
df = (
    pd.read_csv('cleveland.csv')
    .rename(columns={'num': 'disease'})
    .assign(disease=lambda frame: frame['disease'].apply(lambda value: min(value, 1)))
)

In [None]:
# Add a z-scored companion column `<name>_s` for each listed column:
# subtract the column mean, then divide by the column standard deviation.
for col in ('age', 'trestbps', 'chol', 'thalach', 'fbs'):
    centred = df[col] - df[col].mean()
    df[col + '_s'] = centred / df[col].std()

In [None]:
def get_scores(k, n=100, random_state=None):
    """Score a k-nearest-neighbour majority vote on randomly sampled patients.

    The neighbour index is fitted on every row of the notebook-level ``df``
    using the four standardised feature columns. ``n`` rows are then sampled
    from ``df`` and each is classified by a majority vote over the
    ``disease`` labels of its nearest rows. The sampled patients are part of
    the fitted data, so a patient's own row is removed from its neighbour
    set (when present) before the vote.

    Parameters
    ----------
    k : int
        ``n_neighbors`` passed to ``NearestNeighbors``.
    n : int, default 100
        Number of patients sampled from ``df`` for scoring.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass an int for a repeatable draw.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, support)`` as returned by
        ``precision_recall_fscore_support`` with ``labels=[1]``.
    """
    feature_cols = ['age_s', 'trestbps_s', 'chol_s', 'thalach_s']

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    fit = nn.fit(df[feature_cols].values)

    patients = df.sample(n, random_state=random_state)
    patientsY = patients['disease'].values

    # `indices` holds row *positions* in df, one row of k positions per patient.
    _, indices = fit.kneighbors(patients[feature_cols].values)

    labels = df['disease']
    y_pred = []
    for patient_label, neighbour_pos in zip(patients.index, indices):
        # Positional lookup keeps df's index labels, so the patient's own row
        # can be dropped by label; errors='ignore' covers the case where the
        # patient is not among its own neighbours.
        votes = labels.iloc[neighbour_pos].drop(patient_label, errors='ignore')

        healthy = (votes == 0).sum()
        sick = (votes == 1).sum()
        # A tie is resolved in favour of class 1.
        y_pred.append(0 if healthy > sick else 1)

    return precision_recall_fscore_support(patientsY, y_pred, labels=[1])

# Evaluate get_scores for every k in 2..49.
kvals = range(2, 50)

# get_scores returns four one-element arrays; keep element 0 of each so that
# every k contributes one (precision, recall, f1, support) tuple.
per_k = [tuple(metric[0] for metric in get_scores(k)) for k in kvals]

# Transpose: scores[0..3] are the precision, recall, F1 and support series.
scores = list(zip(*per_k))
f1_series = scores[2]

plt.plot(kvals, f1_series)
plt.xlabel("K-Value")
plt.ylabel("F1 Score")
plt.savefig("kvalue.pdf")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated hold-out evaluation of the k-NN majority-vote classifier:
# 100 random 90/10 train/test splits, scored on class 1.
k = 18
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = df[['age_s', 'trestbps_s', 'chol_s', 'thalach_s']].values
y = df[['disease']].values

precision, recall, f1 = [], [], []

for x in range(100):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    fit = nn.fit(X_train)

    # One row per held-out patient: positions (into X_train) of its k nearest
    # training rows.
    distances, indices = fit.kneighbors(X_test)

    # Labels of those neighbours, one row of k labels per held-out patient.
    # The held-out patient is never part of the training split, so there is
    # no self-match to discard: all k neighbours take part in the vote.
    nbrs = y_train.ravel()[indices]
    healthy = (nbrs == 0).sum(axis=1)
    sick = (nbrs == 1).sum(axis=1)

    # A tie is resolved in favour of class 1.
    y_pred = np.where(healthy > sick, 0, 1)

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, labels=[1])

    # With labels=[1] each metric is a one-element array; store plain floats
    # so the lists and the mean print as numbers rather than arrays.
    precision.append(float(p[0]))
    recall.append(float(r[0]))
    f1.append(float(f[0]))

print("Precision Scores:")
print(precision)
print("Recall Scores:")
print(recall)
print("F1 Scores:")
print(f1)
meanF1 = sum(f1) / len(f1)
print("Mean F1 Score: " + str(meanF1))