### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random

import matplotlib.pyplot as plt

### Load the CSV data

In [None]:
# Read the comma-separated data file from the current working directory.
df = pd.read_csv('cleveland.csv')

# The label column arrives as 'num'; call it 'disease' and cap every value
# above 1 at 1, so that 1, 2, 3 and 4 all collapse to a single label (1).
df = df.rename(columns={'num': 'disease'})
df['disease'] = df['disease'].clip(upper=1)
df

### Standardizing Attributes

In [3]:


# Z-score age, resting blood pressure, cholesterol and fasting blood sugar:
# subtract the column mean, divide by the column's std(), and store the
# result next to the raw values in a new '<column>_s' column.
for col in ('age', 'trestbps', 'chol', 'fbs'):
    df[f'{col}_s'] = (df[col] - df[col].mean()) / df[col].std()
# To inspect one of the new columns in a notebook: display(df.age_s)


### Find good attributes

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from numpy import array
from sklearn.model_selection import KFold


# k-NN on four standardized attributes. First create a nearest-neighbors
# object that returns the 6 closest training rows (Euclidean distance).
# NOTE(review): with an even k a 3-3 vote is possible; mode()[0] below then
# takes the first mode returned, which appears to be the smaller label (0).
# Consider an odd k if that bias matters.
nn = NearestNeighbors(n_neighbors=6, metric='euclidean', algorithm='auto')

age = 'age_s'
trestbps = 'trestbps_s'
chol = 'chol_s'
fbs = 'fbs_s'

# Feature matrix (one row per patient) and the matching label column.
# y keeps shape (n, 1) because it is selected with a list of column names.
X = df[[age, trestbps, chol, fbs]].values
y = df[['disease']].values


# Hold out a random 25% of the patients to test on. No random_state is
# given, so the split (and therefore the scores) change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fitting builds an index data structure under the hood for query performance.
fit = nn.fit(X_train)
# Find the k nearest training neighbors of every held-out patient.
distances, indices = fit.kneighbors(X_test)


# Majority vote: for each test patient, look up the labels of its neighbors
# (ravel turns [[0],[1],[1]] into [0,1,1]) and keep the most common label.
y_pred = [
    pd.Series(y_train[neighbor_rows].ravel()).mode()[0]
    for neighbor_rows in indices
]

# Score the predictions for the positive class (label 1). Remember:
#    precision = tp/(tp+fp)  ("sloppiness")
#    recall    = tp/(tp+fn)  ("What percentage did we find?")
#    f-score - a balance between precision and recall
#    support - number of positive labels
p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[1])
print(f'precision={p}, recall={r}, f-score={f}, support={s}')



### Implement k-fold cross-validation

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neighbors import NearestNeighbors


# Create a function for prediction and evaluation
def predict_and_evaluate(X, y, k_values, n_splits=10, random_state=None):
    """Score majority-vote k-NN for several values of k with K-fold CV.

    The folds are generated once and reused for every ``k``, so differences
    between the reported scores come from ``k`` and not from each ``k``
    being evaluated on a different random partition of the data.

    Args:
        X: Feature matrix with one row per sample (anything
            ``np.asarray`` accepts).
        y: Labels, either shape ``(n,)`` or a single column ``(n, 1)``.
        k_values: Iterable of neighbor counts to evaluate.
        n_splits: Number of cross-validation folds.
        random_state: Optional seed forwarded to ``KFold`` to make the
            shuffled folds reproducible. ``None`` keeps them random.

    Returns:
        A list with one ``(k, precision, recall, f_score, support)`` tuple
        per entry of ``k_values``. The four metrics are exactly what
        ``precision_recall_fscore_support(..., labels=[1])`` returns,
        computed over the pooled predictions of all folds.

    Raises:
        ValueError: If ``X`` and ``y`` do not describe the same number of
            samples.
    """
    X = np.asarray(X)
    # Accept both (n,) and (n, 1) label arrays by working on a flat view.
    y = np.asarray(y).ravel()
    if len(X) != len(y):
        raise ValueError(
            f'X has {len(X)} samples but y has {len(y)} labels'
        )

    # Materialize the folds once: with shuffle=True and no seed, every call
    # to split() would otherwise produce a different partition.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(kf.split(X))

    results = []
    for k in k_values:
        y_pred = []
        y_true = []

        for train_index, test_index in folds:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Create and fit the nearest neighbors object for this fold
            nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
            nn.fit(X_train)

            # Find the k nearest training neighbors of every test sample
            _, indices = nn.kneighbors(X_test)

            # Majority vote over the neighbors' labels. On a tie, mode()
            # returns several values and the first one is used.
            y_pred.extend(
                pd.Series(y_train[neighbor_rows]).mode()[0]
                for neighbor_rows in indices
            )
            y_true.extend(y_test)

        # Calculate precision, recall, and F1 scores for the positive class
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred, labels=[1])
        results.append((k, p, r, f, s))

    return results

# Main execution: evaluate k-NN on the four standardized attributes.
age, trestbps, chol, fbs = 'age_s', 'trestbps_s', 'chol_s', 'fbs_s'
feature_columns = [age, trestbps, chol, fbs]

# Feature matrix and single-column label array taken from the data frame.
X = df[feature_columns].values
y = df[['disease']].values

# Try every neighbor count from 1 through 20.
k_values = range(1, 21)
results = predict_and_evaluate(X, y, k_values)

# Report one line of metrics per k.
for k, p, r, f, s in results:
    print(f'k={k}, precision={p}, recall={r}, f-score={f}, support={s}')
