<a href="https://colab.research.google.com/github/brentdevetter/classifiers/blob/master/BreastCancer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Using the Wisconsin Breast Cancer Database found in sklearn, build several 
# classifiers using techniques such as SVM, Random Forest, Naive-Bayes, KNN, etc
#
#
# 2019 BMD

# Import the important libraries and dataset
from sklearn.datasets import load_breast_cancer
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import time

# Load the data set
cancer = load_breast_cancer()

# Select the input features and put into DataFrame
df_features = pd.DataFrame(cancer['data'], columns = cancer['feature_names'])

# Select the target data and put into DataFrame
df_target = pd.DataFrame(cancer['target'], columns = ['Cancer'])

# Show the first few entries
print('df_features')
print(df_features.head(3))
print(df_features.shape)

print('df_target')
print(df_target.head())
print(df_target.shape)


df_features
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38           122.8     1001.0          0.11840   
1        20.57         17.77           132.9     1326.0          0.08474   
2        19.69         21.25           130.0     1203.0          0.10960   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   

   mean fractal dimension           ...             worst radius  \
0                 0.07871           ...                    25.38   
1                 0.05667           ...                    24.99   
2                 0.05999           ...                    23.57   

   worst texture  worst perimeter  worst area  worst smoothness  \
0          17.33            184.6      201

In [0]:
# Split the data into a training set and a test set!
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_features, np.ravel(df_target), test_size = 0.30, random_state=49)

In [35]:
# Stochastic Gradient Descent
# SGD 
from sklearn.linear_model import SGDClassifier
accuracy_all = []
cvs_all = [] 

start = time.time()

model = SGDClassifier(loss='hinge', max_iter=1000, tol=0.1)
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

end = time.time()

accuracy_all.append(accuracy_score(prediction, y_test))
cvs_all.append(np.mean(scores))

print("SGD Classifier Accuracy: {0:.2%}".format(accuracy_score(prediction, y_test)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds".format(end-start))

SGD Classifier Accuracy: 73.68%
Cross validation score: 89.45% (+/- 9.47%)
Execution time: 0.02992 seconds


In [62]:
# Support Vector Machines (SVMs)
from sklearn.svm import SVC, LinearSVC

accuracy_all = []
cvs_all = [] 

start = time.time()

model = SVC(kernel='linear', gamma='auto', random_state=42)
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

end = time.time()

accuracy_all.append(accuracy_score(prediction, y_test))
cvs_all.append(np.mean(scores))

print("SVC Classifier Accuracy: {0:.2%}".format(accuracy_score(prediction, y_test)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds".format(end-start))

SVC Classifier Accuracy: 92.98%
Cross validation score: 94.56% (+/- 3.74%)
Execution time: 9.249 seconds


In [65]:
# Nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier

accuracy_all = []
cvs_all = [] 

start = time.time()

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

end = time.time()

accuracy_all.append(accuracy_score(prediction, y_test))
cvs_all.append(np.mean(scores))

print("K Nearest Neighbors (5) Classifier Accuracy: {0:.2%}".format(accuracy_score(prediction, y_test)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds".format(end-start))


K Nearest Neighbors (5) Classifier Accuracy: 91.81%
Cross validation score: 92.80% (+/- 4.26%)
Execution time: 0.031538 seconds


In [68]:
# Naive-Bayes

from sklearn.naive_bayes import GaussianNB

accuracy_all = []
cvs_all = [] 

start = time.time()

model = GaussianNB()
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

end = time.time()

accuracy_all.append(accuracy_score(prediction, y_test))
cvs_all.append(np.mean(scores))

print("Naive-Bayes Classifier Accuracy: {0:.2%}".format(accuracy_score(prediction, y_test)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds".format(end-start))


Naive-Bayes Classifier Accuracy: 91.23%
Cross validation score: 94.04% (+/- 3.11%)
Execution time: 0.019174 seconds


In [71]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

accuracy_all = []
cvs_all = [] 

start = time.time()

model = RandomForestClassifier(n_estimators = 100)
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

end = time.time()

accuracy_all.append(accuracy_score(prediction, y_test))
cvs_all.append(np.mean(scores))

print("Random Forest Classifier Accuracy: {0:.2%}".format(accuracy_score(prediction, y_test)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds".format(end-start))


Random Forest Classifier Accuracy: 94.74%
Cross validation score: 96.15% (+/- 3.71%)
Execution time: 0.8876 seconds


In [74]:
# Grid search example using SVM

from sklearn.metrics import classification_report
model = SVC(kernel='linear', gamma='auto', random_state=42)
model.fit(X = X_train, y = y_train)
prediction = model.predict(X_test)
scores = cross_val_score(model, df_features, np.ravel(df_target), cv=5)

print(classification_report(y_test, prediction))

param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001], 'kernel':['linear']}

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

grid.fit(X_train, y_train)

print('\n')
print('The best parameters are ', grid.best_params_)


              precision    recall  f1-score   support

           0       0.91      0.91      0.91        65
           1       0.94      0.94      0.94       106

   micro avg       0.93      0.93      0.93       171
   macro avg       0.93      0.93      0.93       171
weighted avg       0.93      0.93      0.93       171

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.9473684210526315, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.9774436090225563, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.9696969696969697, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.9473684210526315, total=   0.0s
[CV] C=0.1, gamma=

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.9696969696969697, total=   0.1s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.9473684210526315, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.9774436090225563, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.9696969696969697, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9473684210526315, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9774436090225563, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9696969696969697, total=   0.0s
[CV] C=0.1, 

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  5.0min finished




The best parameters are  {'C': 100, 'gamma': 1, 'kernel': 'linear'}
