In [101]:

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
# Bring in the helper that installs a process-wide warning filter
from warnings import simplefilter
# Suppress every FutureWarning raised from here on
simplefilter(category=FutureWarning, action='ignore')

In [102]:

# Location of the breast-cancer CSV file
file_path = "D:/Stevens/KDD/breast-cancer-wisconsin.csv"

# Parse the file with header=None, so the file's own header text is kept as
# row 0 and pandas assigns integer column labels.
raw_table = pd.read_csv(file_path, header=None)

# '?' marks a missing entry: convert it to NaN, then keep only complete rows.
data = raw_table.replace('?', np.nan).dropna()

# Display the cleaned table
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Sample,F1,F2,F3,F4,F5,F6,F7,F8,F9,Class
1,1000025,5,1,1,1,2,1,3,1,1,2
2,1002945,5,4,4,5,7,10,3,2,1,2
3,1015425,3,1,1,1,2,2,3,1,1,2
4,1016277,6,8,8,1,3,4,3,7,1,2
...,...,...,...,...,...,...,...,...,...,...,...
695,776715,3,1,1,1,3,2,1,1,1,2
696,841769,2,1,1,1,2,1,1,1,1,2
697,888820,5,10,10,3,7,3,8,10,2,4
698,897471,4,8,6,4,3,4,10,6,1,4


In [103]:

# Set the first row as column names and drop it from the data.
# The header text sits in row 0 because the CSV was read with header=None.
# The guard keeps this cell safe to re-run: without it, a second execution
# would promote the first real sample to a header and silently discard it.
if 'Class' not in data.columns:
    data.columns = data.iloc[0]
    # Drop the first row (which contains the column names)
    data = data[1:]

# Features are every column between the first (sample ID) and the last (label).
# They were parsed as text because the header line shared their columns, so
# convert them to floats explicitly rather than relying on the estimator to
# coerce an object-dtype frame.
features = data.iloc[:, 1:-1].astype(float)

# Labels are passed through exactly as read, so later cells that refer to the
# label values keep working unchanged.
labels = data['Class']

# Split data into training and testing data (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)





In [104]:
# Fit one k-NN classifier per neighbourhood size on the training split
# (fit() returns the fitted estimator, so the result can be bound directly).
knn3, knn5, knn10 = (
    KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train) for k in (3, 5, 10)
)

# Predict the held-out samples with each fitted model
y_pred3, y_pred5, y_pred10 = (
    model.predict(X_test) for model in (knn3, knn5, knn10)
)

# Report the test-set accuracy of every model
for heading, predicted in (
    ("Accuracy for k=3:", y_pred3),
    ("Accuracy for k=5:", y_pred5),
    ("Accuracy for k=10:", y_pred10),
):
    print(heading, accuracy_score(y_test, predicted))


Accuracy for k=3: 0.9560975609756097
Accuracy for k=5: 0.9560975609756097
Accuracy for k=10: 0.9512195121951219


In [105]:
# Show the per-class precision / recall / F1 table for each model
for heading, predicted in (
    ("Classification report for k=3:\n", y_pred3),
    ("Classification report for k=5:\n", y_pred5),
    ("Classification report for k=10:\n", y_pred10),
):
    print(heading, classification_report(y_test, predicted))



Classification report for k=3:
               precision    recall  f1-score   support

           2       0.95      0.98      0.97       127
           4       0.97      0.91      0.94        78

    accuracy                           0.96       205
   macro avg       0.96      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

Classification report for k=5:
               precision    recall  f1-score   support

           2       0.95      0.98      0.97       127
           4       0.97      0.91      0.94        78

    accuracy                           0.96       205
   macro avg       0.96      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

Classification report for k=10:
               precision    recall  f1-score   support

           2       0.94      0.98      0.96       127
           4       0.97      0.90      0.93        78

    accuracy                           0.95       205
   macro avg       0.96      0.94      0.95       205
weighted avg       0.95      0.95      0.95       205

In [106]:

# Error rate is the complement of accuracy: the share of test samples
# that each model misclassified.
error_rate_k3, error_rate_k5, error_rate_k10 = (
    1 - accuracy_score(y_test, predicted)
    for predicted in (y_pred3, y_pred5, y_pred10)
)

for heading, rate in (
    ("Error Rate for k=3:", error_rate_k3),
    ("Error Rate for k=5:", error_rate_k5),
    ("Error Rate for k=10:", error_rate_k10),
):
    print(heading, rate)



Error Rate for k=3: 0.04390243902439028
Error Rate for k=5: 0.04390243902439028
Error Rate for k=10: 0.04878048780487809


In [107]:
# Build the confusion matrix of true vs. predicted labels for each model
confusion_matrix_k3, confusion_matrix_k5, confusion_matrix_k10 = [
    confusion_matrix(y_test, predicted)
    for predicted in (y_pred3, y_pred5, y_pred10)
]

headings = (
    "Confusion Matrix for k=3:\n",
    "Confusion Matrix for k=5:\n",
    "Confusion Matrix for k=10:\n",
)
matrices = (confusion_matrix_k3, confusion_matrix_k5, confusion_matrix_k10)
for heading, matrix in zip(headings, matrices):
    print(heading, matrix)

Confusion Matrix for k=3:
 [[125   2]
 [  7  71]]
Confusion Matrix for k=5:
 [[125   2]
 [  7  71]]
Confusion Matrix for k=10:
 [[125   2]
 [  8  70]]
