In [1]:
# step 1 ----->> import lib

import pandas as pd
import numpy as np
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# step 2 :  import data set
data = pd.read_csv('breast-cancer-wisconsin.data')

In [3]:
data.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
data.tail()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4
698,897471,4,8,8,5,4,5,10,4,1,4


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                        699 non-null int64
clump_thickness           699 non-null int64
unif_cell_size            699 non-null int64
unif_cell_shape           699 non-null int64
marg_adhesion             699 non-null int64
single_epith_cell_size    699 non-null int64
bare_nuclei               699 non-null object
bland_chrom               699 non-null int64
norm_nucleoli             699 non-null int64
mitoses                   699 non-null int64
class                     699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
# delete the unwanted id column
data.drop(['id'],axis =1,inplace=True)
data.head()

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [6]:
# make up for missing entries
data.replace('?', -9999,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
clump_thickness           699 non-null int64
unif_cell_size            699 non-null int64
unif_cell_shape           699 non-null int64
marg_adhesion             699 non-null int64
single_epith_cell_size    699 non-null int64
bare_nuclei               699 non-null object
bland_chrom               699 non-null int64
norm_nucleoli             699 non-null int64
mitoses                   699 non-null int64
class                     699 non-null int64
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [7]:
# get our attributes and classes in place
X = np.array(data.drop(['class'], axis =1))
y = np.array(data['class'])
print(X.shape,"  ",X.ndim)
print(y.shape,"  ",y.ndim)


(699, 9)    2
(699,)    1


In [8]:
# split data into training and testing sections
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [9]:
# initialize our classifier
knn = neighbors.KNeighborsClassifier()


In [10]:
y_train =y_train[:,np.newaxis]


In [11]:
y_train.ndim

2

In [12]:
X

array([[5, 1, 1, ..., 3, 1, 1],
       [5, 4, 4, ..., 3, 2, 1],
       [3, 1, 1, ..., 3, 1, 1],
       ...,
       [5, 10, 10, ..., 8, 10, 2],
       [4, 8, 6, ..., 10, 6, 1],
       [4, 8, 8, ..., 10, 4, 1]], dtype=object)

In [13]:
knn.fit(X_train,y_train)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [14]:
# calculating accuracy with test data
accuracy = knn.score(X_test, y_test)
accuracy

0.9571428571428572

In [15]:
# let's make a prediction
new_tests = np.array([[10, 10, 2, 3, 10, 2, 1, 8, 44], [10, 1, 12, 3, 1, 12, 1, 8, 12], [3, 1, 1, 3, 1, 12, 1, 2, 1]])

In [16]:
new_tests = new_tests.reshape(len(new_tests), -1)

In [17]:
new_tests.ndim

2

In [18]:
prediction = knn.predict(new_tests)

In [19]:
for clas in prediction:
    if(clas == 2):
        print("benign")
    else:
        print("Maligant")

Maligant
Maligant
benign


In [20]:
y_pre = knn.predict(X_test)
y_pre

array([4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 4,
       4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2,
       2, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2,
       4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2,
       4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 2, 2, 2, 4,
       4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 4,
       2, 2, 2, 2, 4, 2, 2, 2], dtype=int64)