# Classification Machine Learning Models for Breast Cancer Data



Used various classification machine learning models to predict the likelihood that someone will have breast cancer based on lab results. The data does not require feature scaling since the data is already scaled properly.



## Importing the libraries

In [32]:
import pandas as pd

## Importing the dataset

In [33]:
dataset = pd.read_csv('breast_cancer.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the models on the Training set


In [35]:

#Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 0)
lr.fit(X_train, y_train)

#K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)

#Support Vector Machine
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, y_train)

#Kernel Support Vector Machine
#SVC was imported from sklearn.svm above
ksvm = SVC(kernel = 'rbf', random_state = 0)
ksvm.fit(X_train, y_train)

#Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

#Decision Tree Classification
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtc.fit(X_train, y_train)

#Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc.fit(X_train, y_train)


## Making the Confusion Matrices

In [36]:
#Logistic Regression
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_lr = lr.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred_lr)
print(cm_lr)
accuracy_score(y_test, y_pred_lr)

[[84  3]
 [ 3 47]]


0.9562043795620438

In [37]:
#K-Nearest Neighbors
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_knn = knn.predict(X_test)
cm_knn = confusion_matrix(y_test, y_pred_knn)
print(cm_knn)
accuracy_score(y_test, y_pred_knn)

[[84  3]
 [ 1 49]]


0.9708029197080292

In [38]:
#Support Vector Machine
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_svm = svm.predict(X_test)
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)
accuracy_score(y_test, y_pred_svm)

[[83  4]
 [ 2 48]]


0.9562043795620438

In [39]:
#Kernel Support Vector Machine
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_ksvm = ksvm.predict(X_test)
cm_ksvm = confusion_matrix(y_test, y_pred_ksvm)
print(cm_ksvm)
accuracy_score(y_test, y_pred_ksvm)

[[83  4]
 [ 1 49]]


0.9635036496350365

In [40]:
#Naive Bayes
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_nb = nb.predict(X_test)
cm_nb = confusion_matrix(y_test, y_pred_nb)
print(cm_nb)
accuracy_score(y_test, y_pred_nb)

[[80  7]
 [ 0 50]]


0.948905109489051

In [41]:
#Decision Tree Classification
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_dtc = dtc.predict(X_test)
cm_dtc = confusion_matrix(y_test, y_pred_dtc)
print(cm_dtc)
accuracy_score(y_test, y_pred_dtc)

[[84  3]
 [ 3 47]]


0.9562043795620438

In [42]:
#Random Forest Classification
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_rfc = rfc.predict(X_test)
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
print(cm_rfc)
accuracy_score(y_test, y_pred_rfc)

[[83  4]
 [ 3 47]]


0.948905109489051

#####  Note: False negatives are important to consider as they would falsely say that someone does not have cancer when they do.Based on the confusion matrices, the KNN, KSVM, and Naive Bayes models had the least false-negatives. However, according to the confusion matrices, the KNN model has the best overall accuracy.

## Computing the accuracy with k-Fold Cross Validation for all the models


In [43]:
#Logistic Regression
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = lr, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 96.70 %
Standard Deviation: 1.97 %
()


In [44]:
#K-Nearest Neighbors
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = lr, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 96.70 %
Standard Deviation: 1.97 %
()


In [45]:
#Support Vector Machine
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = svm, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 97.07 %
Standard Deviation: 2.19 %
()


In [46]:
#Kernel Support Vector Machine
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = ksvm, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 96.89 %
Standard Deviation: 2.17 %
()


In [47]:
#Naive Bayes
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = nb, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 96.52 %
Standard Deviation: 2.24 %
()


In [48]:
#Decision Tree Classification
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = dtc, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 94.33 %
Standard Deviation: 2.65 %
()


In [50]:
#Random Forest Classification
#Already immported the cross_val_score
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
print('()')

Accuracy: 96.33 %
Standard Deviation: 2.01 %
()
