In [68]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

#algorithms
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [69]:
data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=345)

In [70]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [83]:
models = [
    ('Logistic Regression', LogisticRegression(max_iter=10000)),
    ('SVM', SVC(probability=True)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('k-NN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(max_iter=1000))
]

In [84]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=345)

In [85]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

In [81]:
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    accuracy = cv_results.mean()
    recall = cross_val_score(model, X_train, y_train, cv=cv, scoring='recall').mean()
    precision = cross_val_score(model, X_train, y_train, cv=cv, scoring='precision').mean()
    f1 = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()
    roc_auc = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc').mean()
    results.append((accuracy, recall, precision, f1, roc_auc))
    names.append(name)

# DataFrame with results
results_df = pd.DataFrame(results, columns=['CV Accuracy', 'CV Recall', 'CV Precision', 'CV F1 Score', 'CV ROC AUC'], index=names)
results_df

Unnamed: 0,CV Accuracy,CV Recall,CV Precision,CV F1 Score,CV ROC AUC
Logistic Regression,0.98022,0.992982,0.976441,0.984404,0.996491
SVM,0.978022,0.992982,0.97322,0.982724,0.996594
Decision Tree,0.934066,0.929825,0.961542,0.955839,0.943705
Random Forest,0.958242,0.975439,0.965965,0.973906,0.990093
k-NN,0.967033,0.989474,0.960405,0.974289,0.981682
Gradient Boosting,0.953846,0.961404,0.968838,0.964747,0.990918
Naive Bayes,0.936264,0.954386,0.945058,0.949397,0.987307
Neural Network,0.967033,0.982456,0.970106,0.980711,0.995769


Hyper parameter tuning

In [75]:
clf =  GridSearchCV(SVC(), {
    'C': [1,2,3,4,5,10,20],
    'kernel': ['linear', 'rbf'],
    'gamma': [1, 0.1, 0.01]
}, cv=5, scoring = ['accuracy', 'recall'], refit=False, return_train_score=True)
clf.fit(cancer.data, cancer.target)

df1 = pd.DataFrame(clf.cv_results_,columns=['param_C','param_kernel','param_gamma','mean_test_recall'])
df1

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_recall
0,1,linear,1.0,0.96921
1,1,rbf,1.0,1.0
2,1,linear,0.1,0.96921
3,1,rbf,0.1,1.0
4,1,linear,0.01,0.96921
5,1,rbf,0.01,0.997183
6,2,linear,1.0,0.97766
7,2,rbf,1.0,1.0
8,2,linear,0.1,0.97766
9,2,rbf,0.1,1.0


In [76]:
df1.sort_values(by=['mean_test_recall'], ascending=False).head(15)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_recall
21,4,rbf,0.1,1.0
33,10,rbf,0.1,1.0
15,3,rbf,0.1,1.0
25,5,rbf,1.0,1.0
13,3,rbf,1.0,1.0
27,5,rbf,0.1,1.0
31,10,rbf,1.0,1.0
19,4,rbf,1.0,1.0
9,2,rbf,0.1,1.0
7,2,rbf,1.0,1.0


In [77]:
clf =  GridSearchCV(KNeighborsClassifier(), {
    'n_neighbors': [1,3,5,7,9],
    'weights': ['uniform', 'distance'],
    'p': [1,2]
}, cv=5, scoring = ['accuracy', 'recall'], refit=False, return_train_score=True)
clf.fit(cancer.data, cancer.target)

df2 = pd.DataFrame(clf.cv_results_,columns=['param_n_neighbors','param_weights','param_p','mean_test_recall'])
df2

Unnamed: 0,param_n_neighbors,param_weights,param_p,mean_test_recall
0,1,uniform,1,0.957903
1,1,distance,1,0.957903
2,1,uniform,2,0.941197
3,1,distance,2,0.941197
4,3,uniform,1,0.963537
5,3,distance,1,0.966315
6,3,uniform,2,0.952308
7,3,distance,2,0.957942
8,5,uniform,1,0.969092
9,5,distance,1,0.969092


In [78]:
df2.sort_values(by=['mean_test_recall'], ascending=False).head()

Unnamed: 0,param_n_neighbors,param_weights,param_p,mean_test_recall
16,9,uniform,1,0.977504
17,9,distance,1,0.974687
12,7,uniform,1,0.971909
13,7,distance,1,0.969092
8,5,uniform,1,0.969092
