In [105]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn import preprocessing,neighbors
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [106]:
df = pd.read_csv('breast-cancer-wisconsin.data')

In [107]:
df = df.replace('?',-99999)

In [108]:
df.shape

(699, 11)

In [109]:
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhension,single_epith_cell_size,bare_nuclei,blam_chrom,norm_nocleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [110]:
df.isnull().sum() #check for missing values

id                        0
clump_thickness           0
unif_cell_size            0
unif_cell_shape           0
marg_adhension            0
single_epith_cell_size    0
bare_nuclei               0
blam_chrom                0
norm_nocleoli             0
mitoses                   0
class                     0
dtype: int64

In [111]:
df['class'].value_counts() 
#binary classification problem

2    458
4    241
Name: class, dtype: int64

In [112]:
df.describe()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhension,single_epith_cell_size,blam_chrom,norm_nocleoli,mitoses,class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      699 non-null    int64 
 1   clump_thickness         699 non-null    int64 
 2   unif_cell_size          699 non-null    int64 
 3   unif_cell_shape         699 non-null    int64 
 4   marg_adhension          699 non-null    int64 
 5   single_epith_cell_size  699 non-null    int64 
 6   bare_nuclei             699 non-null    object
 7   blam_chrom              699 non-null    int64 
 8   norm_nocleoli           699 non-null    int64 
 9   mitoses                 699 non-null    int64 
 10  class                   699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [114]:
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhension,single_epith_cell_size,bare_nuclei,blam_chrom,norm_nocleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [115]:
#Remove patient label feature
df = df.drop('id',axis=1)


In [116]:
df.head()

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhension,single_epith_cell_size,bare_nuclei,blam_chrom,norm_nocleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [117]:
# set training and target data
X = df.drop('class',axis = 1)
y = df['class']

In [118]:
# split data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [119]:
# K-Neighbour classification
knc = neighbors.KNeighborsClassifier()
knc.fit(X_train,y_train)
y_pred = knc.predict(X_test)

accuracy = knc.score(X_test,y_test)
print(f'K_classification test_data accuracy = {accuracy :.5f}')
print()
print(f'K_classification Confusion matrix: \n {confusion_matrix(y_test,y_pred)}')


K_classification accuracy = 0.97143

K_classification Confusion matrix: 
 [[86  3]
 [ 1 50]]


In [123]:
# Logistic regression
lg = LogisticRegression()
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
accuracy = lg.score(X_test,y_test)


print(f'K_classification test_data accuracy = {accuracy :.5f}')
print()
print(f'K_classification Confusion matrix: \n {confusion_matrix(y_test,y_pred)}')

K_classification test_data accuracy = 0.92143

K_classification Confusion matrix: 
 [[83  6]
 [ 5 46]]
