In [2]:
# To write a Python 2/3 compatible codebase, the first step is to add this line to the top of each module
from __future__ import division, print_function, unicode_literals

# The simplest possible classifier is the nearest neighbor: given a new observation X_test, find in the training set (i.e.
# the data used to train the estimator) the observation(s) with the closest feature vector.
# This script illustrates the usage of kNN. 

# Import necessary libraries and specify that graphs should be plotted inline.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
import pandas as pd




url="http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
c=pd.read_csv(url,header=None)
c
# Import iris data set
# See 'ISOM 672_Decision Trees' for a description of this data set


X = c.iloc[:,2:]  # we only take the first two features in order to easily visualize the results. 
                      # We could avoid this ugly slicing by using a two-dim dataset
y = c.iloc[:,1]
print(y.head())

0    M
1    M
2    M
3    M
4    M
Name: 1, dtype: object


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

In [4]:
print('Labels counts in class M in y_train:', np.bincount(y_train=='M'))
print('Labels counts in class B in y_train:', np.count_nonzero(y_train=='B'))
print('Labels counts in class M in y_test:', np.count_nonzero(y_test=='M'))
print('Labels counts in class B in y_test:', np.count_nonzero(y_test=='B'))
                                                              

Labels counts in class M in y_train: [250 148]
Labels counts in class B in y_train: 250
Labels counts in class M in y_test: 64
Labels counts in class B in y_test: 107


In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [6]:
from sklearn import neighbors, datasets

knn = neighbors.KNeighborsClassifier(n_neighbors=3, 
                           p=2, 
                           metric='minkowski') #The default metric is minkowski, which is a generalization of the Euclidean distance
                                               # with p=2 is equivalent to the standard Euclidean distance.
knn = knn.fit(X_train_std, y_train)            # with p=1 is equivalent to the Mahattan distance.

In [7]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [8]:
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, classification_report

# Estimate the predicted values by applying the kNN algorithm
y_pred = knn.predict(X_test_std)
y_pred_insample = knn.predict(X_train_std)

# Accuracy
print('Accuracy (out-of-sample): %.2f' % accuracy_score(y_test, y_pred))
print('Accuracy (in-sample): %.2f' % accuracy_score(y_train, y_pred_insample))

# F1 score
print('F1 score (out-of-sample): ', f1_score(y_test, y_pred, average='macro'))
print('F1 score (in-sample)    : ', f1_score(y_train, y_pred_insample, average='macro'))

# Kappa score
print('Kappa score (out-of-sample): ', cohen_kappa_score(y_test, y_pred))
print('Kappa score (in-sample)    : ', cohen_kappa_score(y_train, y_pred_insample))

# Build a text report showing the main classification metrics (out-of-sample performance)
print(classification_report(y_test, y_pred, target_names=["M","B"]))

Accuracy (out-of-sample): 0.96
Accuracy (in-sample): 0.99
F1 score (out-of-sample):  0.9555629802873371
F1 score (in-sample)    :  0.9864973978653675
Kappa score (out-of-sample):  0.9112083673318003
Kappa score (in-sample)    :  0.9729964447580536
             precision    recall  f1-score   support

          M       0.95      0.99      0.97       107
          B       0.98      0.91      0.94        64

avg / total       0.96      0.96      0.96       171



In [11]:
from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix
Confusion_Matrix = pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["True benign","True malignant"],index=["Predicted benign","Predicted malignant"])
Confusion_Matrix

Unnamed: 0,True benign,True malignant
Predicted benign,106,1
Predicted malignant,6,58
