In [151]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import math
import numpy as np

def choose_k(dataframe):
    rows = dataframe.shape[0]
    k = int(math.sqrt(rows))
    if k % 2 == 0:
        k+=1
    return k

df = pd.read_csv('ov4-breast-cancer.csv')

# Wash. Replace rows with values '?' with the mean of the current column
df.replace('?', np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')
column_means = df.mean()
df.fillna(column_means, inplace=True)    
    
y = df['classes']
X = df.drop(columns=['classes'], axis=1)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Choose k
k = choose_k(y_test)

# Train
knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')

knn.fit(X_train_scaled, y_train)

# Predict
predicted_values = knn.predict(X_test_scaled)

# Assess 
cm = confusion_matrix(y_test, predicted_values)
print(cm)

# My k value was 11 initially. After experimenting with other k-values I found out that there wasn't
# any big difference between the results.

[[98  3]
 [ 3 36]]
