# Breast Cancer Classifier Using k-Nearest Neighbors, a Very Common ML Algorithm


In [37]:
%matplotlib inline
import random
import matplotlib.pyplot
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Loading and Examining Data

In [24]:
# Read the breast-cancer-wisconsin data file (relative path) into a DataFrame
# and preview its first five rows.
data_path = 'breast-cancer-wisconsin.data'
df = pd.read_csv(data_path)
df.head(n=5)

Unnamed: 0,id_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [25]:
# Report how many rows were loaded (the first entry of `shape` is the row count).
print("Data Set size: ", df.shape[0])

Data Set size:  699


In [26]:
# Inspect the dtype of every column; an `object` column holds non-numeric
# values and must be converted before it can be fed to the classifier.
df.dtypes

id_number                        int64
clump_thickness                  int64
uniformity_of_cell_size          int64
uniformity_of_cell_shape         int64
marginal_adhesion                int64
 single_epithelial_cell_size     int64
bare_nuclei                     object
bland_chromatin                  int64
normal_nucleoli                  int64
mitoses                          int64
class                            int64
dtype: object

## Preprocessing

In [27]:
# Preprocessing: make every feature numeric and remove the ID column.
#
# * `columns=` replaces the positional axis argument (`df.drop('id_number', 1)`),
#   which pandas deprecated in 1.3 and rejects from 2.0 onwards.
# * No `inplace=True`, and `errors='ignore'` on the drop, so this cell can be
#   re-run on an already-cleaned frame without raising a KeyError.
df = (
    df.replace('?', -99999)                           # '?' marks missing data; -99999 is a sentinel value
      .drop(columns=['id_number'], errors='ignore')   # the ID is not a feature
      .assign(bare_nuclei=lambda frame: pd.to_numeric(frame['bare_nuclei']))  # object -> numeric
)

df

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


## Training Testing Split

In [28]:
# Split the frame into the feature matrix X (every column except the label)
# and the label vector y (the `class` column).
# `columns=` replaces the positional axis argument (`df.drop(['class'], 1)`),
# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

In [29]:
# Hold out 30% of the rows for evaluation.
# `random_state` pins the shuffle so the split, and therefore the accuracy
# computed from it, is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Classifier

In [30]:
# k-nearest-neighbours classifier; 5 neighbours is the library default,
# spelled out here so the choice is visible.
clf = KNeighborsClassifier(n_neighbors=5)

In [31]:
# Fit the classifier on the training portion only; `fit` returns the
# estimator itself, which is what the cell displays.
clf.fit(X=X_train, y=y_train)

KNeighborsClassifier()

In [33]:
# Score the fitted classifier on the held-out test rows and display the result.
accuracy = clf.score(X=X_test, y=y_test)
accuracy

0.9809523809523809

## Testing

### Generating some random data

In [49]:
# How many random synthetic samples to generate and classify below.
number_of_tests = 10

In [50]:
# Seed BOTH random number generators.
# The sampling cells draw from the standard-library `random` module
# (`random.sample`), which `np.random.seed` does not affect; without
# `random.seed` the generated samples change on every run.
random.seed(42)
np.random.seed(42)

In [65]:
# Draw nine distinct integers from 0..10 (one value per feature column)
# and wrap them in an array to serve as a synthetic sample.
drawn_values = random.sample(range(11), 9)
test_sample = np.array(drawn_values)
test_sample

array([10,  1,  8,  4,  6,  5,  2,  0,  9])

In [69]:
# Generate `number_of_tests` synthetic samples and print each one followed
# by the class the fitted classifier predicts for it.
for test_number in range(number_of_tests):
    # Nine distinct integers drawn from 0..10, one per feature column.
    test_sample = np.array(random.sample(range(11), 9))
    print(test_sample)
    # predict() wants 2-D input (n_samples, n_features): add a leading axis.
    test_sample = test_sample[np.newaxis, :]
    prediction = clf.predict(test_sample)
    print(prediction)

[ 5  1  3 10  2  9  8  6  4]
[4]
[10  8  0  4  5  9  1  2  6]
[4]
[ 1  8  2  4  9  7 10  6  0]
[4]
[10  4  8  7  2  0  5  3  1]
[4]
[ 0 10  5  2  8  7  1  3  6]
[4]
[ 6  8  0  1  9  5  7 10  4]
[4]
[ 9  8  2  6  1  0 10  4  5]
[4]
[5 3 9 1 2 4 8 7 0]
[4]
[ 2 10  6  0  1  5  9  3  4]
[4]
[ 3  4  2  1 10  0  6  7  5]
[2]


In [70]:
# Classify one hand-written sample of nine feature values.
# The nested list builds the (1, 9) array directly, since predict() expects
# 2-D input of shape (n_samples, n_features).
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
prediction = clf.predict(example_measures)
print(prediction)

[2]
