In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
breast_cancer = pd.read_csv("data/breast-cancer-wisconsin.data", header=None)

In [3]:
breast_cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
breast_cancer.columns = ["id_number", "clump_thickness", "unif_cell_size", "unif_cell_shape", 
                         "marg_adhesion", "single_epith_cell_size","bare_nuclei", "bland_chromatin",
                         "normal_nucleoli", "mitoses", "class"]

In [5]:
breast_cancer.head()

Unnamed: 0,id_number,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id_number                 699 non-null int64
clump_thickness           699 non-null int64
unif_cell_size            699 non-null int64
unif_cell_shape           699 non-null int64
marg_adhesion             699 non-null int64
single_epith_cell_size    699 non-null int64
bare_nuclei               699 non-null object
bland_chromatin           699 non-null int64
normal_nucleoli           699 non-null int64
mitoses                   699 non-null int64
class                     699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [7]:
breast_cancer.bare_nuclei.value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

In [8]:
breast_cancer[breast_cancer.bare_nuclei=='?']

Unnamed: 0,id_number,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [9]:
breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.replace("?", np.NaN)

In [10]:
breast_cancer.bare_nuclei.value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

In [11]:
breast_cancer.bare_nuclei.mode()[0]

'1'

In [12]:
breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.fillna(breast_cancer.bare_nuclei.mode()[0])

In [13]:
breast_cancer["class"].value_counts()

2    458
4    241
Name: class, dtype: int64

In [14]:
breast_cancer["cancer_ind"] = 0
breast_cancer.loc[breast_cancer["class"] == 4, "cancer_ind"] = 1

In [15]:
breast_cancer.cancer_ind.value_counts()

0    458
1    241
Name: cancer_ind, dtype: int64

In [16]:
X = breast_cancer.drop(["id_number", "class", "cancer_ind"], axis=1)
y = breast_cancer.cancer_ind

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
pkl_data = (X_train, y_train, X_test, y_test)
pd.to_pickle(pkl_data, 'data/cancer.pkl')

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [22]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve 

In [24]:
y_pred = knn.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.9761904761904762
[[141   2]
 [  3  64]]
0.9706189333055005
