##Classifying WBCD using KNN, NB and LVQ

##Importing data

In [53]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

df = pd.read_csv("wbcd.csv")
df['Class'] = df['Class'].replace({2:0, 4:1})
df

Unnamed: 0,ClumpThickness,UniformityOfCellSize,UniformityofCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,0
695,2,1,1,1,2,1,1,1,1,0
696,5,10,10,3,7,3,8,10,2,1
697,4,8,6,4,3,4,10,6,1,1


###Data cleaning

>Create two datasets
- With removed missing values and
- With replaced missing values with 1

In [54]:
#Removed missing values
df_non_empty = df.copy(deep=True)
df_non_empty = df_non_empty[df_non_empty['BareNuclei'] != '?'].astype({'BareNuclei': 'int64'})

#Replaced missing values
df_replaced = df.copy(deep=True)
df_replaced['BareNuclei']  = df_replaced['BareNuclei'].replace({'?': 1})
df_replaced = df_replaced.astype({'BareNuclei': 'int64'})

In [55]:
X_1 = df_non_empty[['ClumpThickness'] + ['UniformityOfCellSize'] + ['UniformityofCellShape'] + ['MarginalAdhesion'] + ['SingleEpithelialCellSize'] + ['BareNuclei'] + ['BlandChromatin'] + ['NormalNucleoli'] + ['Mitoses']]
X_2 = df_replaced[['ClumpThickness'] + ['UniformityOfCellSize'] + ['UniformityofCellShape'] + ['MarginalAdhesion'] + ['SingleEpithelialCellSize'] + ['BareNuclei'] + ['BlandChromatin'] + ['NormalNucleoli'] + ['Mitoses']]

###Splitting into training and test sets

In [56]:
from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1,df_non_empty.Class,test_size = 0.3)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2,df_replaced.Class,test_size = 0.3)

Scaling for used in algorithms like KNN

In [57]:
from sklearn.preprocessing import StandardScaler
scaler_1 = StandardScaler()
scaler_1.fit(X_train_1)
X_train_1 = scaler_1.transform(X_train_1)
X_test_1 = scaler_1.transform(X_test_1)

scaler_2 = StandardScaler()
scaler_2.fit(X_train_2)
X_train_2 = scaler_2.transform(X_train_2)
X_test_2 = scaler_2.transform(X_test_2)

###Classifier - **KNN**

In [58]:
from sklearn.neighbors import KNeighborsClassifier
knnc_1 = KNeighborsClassifier(n_neighbors=3)
knnc_1.fit(X_train_1, y_train_1)

knnc_2 = KNeighborsClassifier(n_neighbors=3)
knnc_2.fit(X_train_2, y_train_2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

>Accuracy for removed missing values data set

In [59]:
knnc_1.score(X_test_1, y_test_1)

0.9463414634146341

>Accuracy for replaced missing values data set

In [60]:
knnc_1.score(X_test_2, y_test_2)

0.9714285714285714

###Classifier - **NB**

In [67]:
from sklearn.naive_bayes import GaussianNB
gnbc_1 = GaussianNB()
gnbc_1.fit(X_train_1, y_train_1)

gnbc_2 = GaussianNB()
gnbc_2.fit(X_train_2, y_train_2)

GaussianNB(priors=None, var_smoothing=1e-09)

>Accuracy for removed missing values data set

In [62]:
gnbc_1.score(X_test_1, y_test_1)

0.9365853658536586

>Accuracy for replaced missing values data set

In [63]:
gnbc_2.score(X_test_2, y_test_2)

0.9619047619047619

###Classifier - **LVQ**

In [64]:
!pip install neupy
from neupy import algorithms
from sklearn import metrics

lvq_1 = algorithms.LVQ(n_inputs=9, n_classes=2)
lvq_1.train(X_train_1, y_train_1, epochs=100)

lvq_2 = algorithms.LVQ(n_inputs=9, n_classes=2)
lvq_2.train(X_train_2, y_train_2, epochs=100)





>Accuracy for removed missing values data set

In [65]:
metrics.accuracy_score(y_test_1, lvq_1.predict(X_test_1))

0.9414634146341463

>Accuracy for replaced missing values data set

In [66]:
metrics.accuracy_score(y_test_2, lvq_2.predict(X_test_2))

0.9571428571428572