#  Import libraries

In [16]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier

# Read and visualize wisconsin dataset

In [2]:
dataset = pd.read_csv('data.csv')
dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

# Convert labels to numeric variables (0,1)

In [4]:
dataset["diagnosis"] = dataset["diagnosis"].map({'B':0,'M':1})
dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
dataset.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

# Drop unnecessary features

In [6]:
dataset.drop(['id','Unnamed: 32'],axis = 1 , inplace = True)

In [7]:
dataset.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Assign important features to x and labels to y

In [8]:
x = dataset[['radius_mean','perimeter_mean','area_mean','compactness_mean','concave points_mean','radius_se','perimeter_se', 'area_se','compactness_se', 'concave points_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst','texture_worst','area_worst']]
y = dataset[['diagnosis']]

# Split data to train data(70%) and test data(30%)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# K Nearest Neighbors (KNN)
### Try to find optimal number of k 

In [10]:
neighbors = []
cv_scores = []
for k in range(1,40,2):
    neighbors.append(k)
    knn = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(knn,x_train,y_train,cv = 10, scoring = 'accuracy')
    cv_scores.append(scores.mean())
scores = cross_val_score(knn, x, y , cv = 10 , scoring = 'accuracy')

MSE = [1-x for x in cv_scores]
optimal_k = neighbors[MSE.index(min(MSE))]
print('optimal number of K neighbors {}'.format(optimal_k))

optimal number of K neighbors 13


### Train KNN model and evaluate model prediction

In [11]:
model = KNeighborsClassifier(n_neighbors = 13)
model.fit(x_train, y_train)
predict = model.predict(x_test)
print("KNN prediction accuracy : {}".format(accuracy_score(predict,y_test)*100))
print("KNN prediction recall : {}".format(recall_score(predict,y_test)*100))
print("KNN prediction precision : {}".format(precision_score(predict,y_test)*100))
print("KNN prediction f1 score : {}".format(f1_score(predict,y_test)*100))

KNN prediction accuracy : 92.98245614035088
KNN prediction recall : 94.73684210526315
KNN prediction precision : 85.71428571428571
KNN prediction f1 score : 90.0


# Random Forest Algorithm

### Train Random Forest model and evaluate model prediction

In [12]:
model = RandomForestClassifier(max_depth = 6,random_state = 1)
model.fit(x_train,y_train)
predict = model.predict(x_test)

print("Random Forest prediction accuracy : {}".format(accuracy_score(predict,y_test)*100))
print("Random Forest prediction recall : {}".format(recall_score(predict,y_test)*100))
print("Random Forest prediction precision : {}".format(precision_score(predict,y_test)*100))
print("Random Forest prediction f1 score : {}".format(f1_score(predict,y_test)*100))

Random Forest prediction accuracy : 94.73684210526315
Random Forest prediction recall : 95.0
Random Forest prediction precision : 90.47619047619048
Random Forest prediction f1 score : 92.68292682926828


# Naive Bayes

### Train Gaussian Naive Bayes model and evaluate model prediction

In [13]:
model = GaussianNB()
model.fit(x_train, y_train)
predict = model.predict(x_test)

print("GaussianNB prediction accuracy : {}".format(accuracy_score(predict,y_test)*100))
print("GaussianNB prediction recall : {}".format(recall_score(predict,y_test)*100))
print("GaussianNB prediction precision : {}".format(precision_score(predict,y_test)*100))
print("GaussianNB prediction f1 score : {}".format(f1_score(predict,y_test)*100))

GaussianNB prediction accuracy : 92.98245614035088
GaussianNB prediction recall : 94.73684210526315
GaussianNB prediction precision : 85.71428571428571
GaussianNB prediction f1 score : 90.0


### Train Multinomial Naive Bayes model and evaluate model prediction

In [14]:
model = MultinomialNB()
model.fit(x_train, y_train)
predict = model.predict(x_test)

print("MultinomialNB prediction accuracy : {}".format(accuracy_score(predict,y_test)*100))
print("MultinomialNB prediction recall : {}".format(recall_score(predict,y_test)*100))
print("MultinomialNB prediction precision : {}".format(precision_score(predict,y_test)*100))
print("MultinomialNB prediction f1 score : {}".format(f1_score(predict,y_test)*100))



MultinomialNB prediction accuracy : 90.05847953216374
MultinomialNB prediction recall : 96.0
MultinomialNB prediction precision : 76.19047619047619
MultinomialNB prediction f1 score : 84.95575221238937


# Support Vector Machine (SVM)

### Train SVM model and evaluate model prediction 

In [15]:
model = SVC()
model.fit(x_train, y_train)
predict = model.predict(x_test)

print("SVM prediction accuracy : {}".format(accuracy_score(predict,y_test)*100))
print("SVM prediction recall : {}".format(recall_score(predict,y_test)*100))
print("SVM prediction precision : {}".format(precision_score(predict,y_test)*100))
print("SVM prediction f1 score : {}".format(f1_score(predict,y_test)*100))

SVM prediction accuracy : 91.22807017543859
SVM prediction recall : 94.44444444444444
SVM prediction precision : 80.95238095238095
SVM prediction f1 score : 87.17948717948718
