In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
# Use fully qualified option keys: the short pattern 'precision' is ambiguous on
# newer pandas releases (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', 1000)
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn.
dataset = load_breast_cancer()

# Keep measurements and targets in separate frames; later cells reuse both.
features = pd.DataFrame(dataset.data)
labels = pd.DataFrame(dataset.target)

# Combined frame for inspection: the feature columns followed by a 'class' column.
column_names = np.concatenate([dataset.feature_names, ['class']])
data = pd.concat([features, labels], axis=1)
data.columns = column_names

# Summary statistics are the cell's displayed output.
data.describe()

#### The features are floating-point values with widely varying ranges. Let's look at the datatype of each column.

In [None]:
# Print a per-column summary (dtype and non-null count) of the combined frame.
data.info()

#### There are 30 features in total, each holding float values. Let's see if there is any missing data.

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

#### No missing data. Let's standardize the features so they are on a comparable scale.

In [None]:
# Standardize every feature column; the result is stored in `scaled`.
# NOTE(review): the scaler is fit on all rows, including those later held out for
# testing, so test-set statistics influence the preprocessing (data leakage).
# Fitting on the training rows only would avoid this.
scaled = StandardScaler().fit_transform(features)

#### There is no separate test dataset, so split the data into a training set and a test set.

In [None]:
# Split the raw features first, then fit the scaler on the training rows only,
# so that no statistics from the held-out rows leak into the preprocessing.
# The same random_state and test_size select the same rows as before.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=73,
)

# Standardize both splits with the mean/variance learned from the training split.
split_scaler = StandardScaler().fit(train_features)
train_features = split_scaler.transform(train_features)
test_features = split_scaler.transform(test_features)

#### Try multiple models to check their performance on training data

In [None]:
# Candidate classifiers keyed by a short name used when printing results.
# Estimators that accept random_state are seeded (same seed as the tuning cells)
# so the comparison is reproducible across runs; KNN and Gaussian NB take no seed.
models = {
    'lr': LogisticRegression(random_state=73),
    'svm': SVC(random_state=73),
    'dt': DecisionTreeClassifier(random_state=73),
    'rf': RandomForestClassifier(random_state=73),
    'knn': KNeighborsClassifier(),
    'nb': GaussianNB(),
}

#### Use K fold cross validation to compare among models

In [None]:
# 10-fold cross-validation on the training split for every candidate model.
# KFold is built once (it does not depend on the model). random_state is omitted
# because it has no effect without shuffle=True and current scikit-learn raises a
# ValueError for that combination; the folds stay consecutive, as before.
kfold = KFold(n_splits=10)
for name, model in models.items():
    cv_results = cross_val_score(model, train_features, train_labels, cv=kfold, scoring='accuracy')
    # Columns: model name, mean fold accuracy, standard deviation across folds.
    print("%s: %.4f %.4f" % (name, cv_results.mean(), cv_results.std()))
    

#### Perform hyperparameter tuning of best performing model

In [None]:
# Sweep the regularization parameter C geometrically (x3 per step) from 0.01 up to 10
# and report the 10-fold cross-validated accuracy for each value.
# The last column is the standard deviation of the fold scores, so it is labelled 'std'.
print("%-10s %-10s %-10s" % ('c', 'accuracy', 'std'))

# Built once: random_state is dropped because it is invalid without shuffle=True
# on current scikit-learn (ValueError) and never affected the unshuffled folds.
kfold = KFold(n_splits=10)

c = 0.01
while c <= 10:
    model = LogisticRegression(C=c, random_state=73)
    cv_results = cross_val_score(model, train_features, train_labels, cv=kfold, scoring='accuracy')
    print("%-10.2f %-10.4f %-10.4f" % (c, cv_results.mean(), cv_results.std()))
    c = c * 3
    

#### Not much improvement with Logistic Regression. Maybe the model is not complex enough to learn more. Let's try the next-best-performing, more complex model: SVM.

In [None]:
# Grid over SVM kernel and regularization parameter C (x3 per step, 0.01 up to 10),
# reporting the 10-fold cross-validated accuracy for each combination.
# The last column is the standard deviation of the fold scores, so it is labelled 'std'.
print("%-10s %-10s %-10s %-10s" % ('kernel', 'c', 'accuracy', 'std'))

# Built once: random_state is dropped because it is invalid without shuffle=True
# on current scikit-learn (ValueError) and never affected the unshuffled folds.
kfold = KFold(n_splits=10)

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel in kernels:
    c = 0.01
    while c <= 10:
        # Pass the loop variable `kernel`; the previous `k` was undefined (NameError).
        model = SVC(C=c, kernel=kernel, random_state=73)
        cv_results = cross_val_score(model, train_features, train_labels, cv=kfold, scoring='accuracy')
        print("%-10s %-10.2f %-10.4f %-10.4f" % (kernel, c, cv_results.mean(), cv_results.std()))
        c = c * 3
    

#### The choice of kernel did not matter much, so we will use the default RBF kernel. The best value of C is 0.09.

In [None]:
# Final model: an SVC with C = 0.09 (kernel left at its default), seeded for reproducibility.
model = SVC(C = 0.09, random_state = 73)
# Train on the training split; the fitted estimator is the cell's displayed output.
model.fit(train_features, train_labels)

#### Let's look at the performance on the test data.

In [None]:
# Predict on the held-out split and report the fraction of correct predictions.
predictions = model.predict(test_features)
test_accuracy = accuracy_score(test_labels, predictions)
print('accuracy:', test_accuracy)

#### Finally classification report and confusion matrix

In [None]:
# Print the text classification report comparing the test labels with the predictions.
print(classification_report(test_labels, predictions))

In [None]:
# Confusion matrix of test labels (first argument) against predictions (second argument).
confusion_matrix(test_labels, predictions)