# Classifier - Cancer diagnosis

---

_You are currently looking at **version 1.0** of this notebook._

---

In [None]:
% matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

### Data

Data set: Breast Cancer Wisconsin (Diagnostic) Database  
The object returned by `load_breast_cancer()` is a scikit-learn Bunch object, which is similar to a dictionary.

Use `print(cancer.DESCR)` to print the data set description.

### Load data

In [None]:
from sklearn.datasets import load_breast_cancer

# Dictionary-like Bunch object holding the data set
cancer = load_breast_cancer()

In [None]:
# A notebook cell only echoes its final expression, so the keys are printed
# explicitly; the feature names are shown as the cell output.
print(cancer.keys())
cancer['feature_names']

### Convert data to DataFrame

In [None]:
# Build the frame with named feature columns, then append the class label
# as the last column.
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df['target'] = cancer['target']
df.info()

In [None]:
# Number of samples per class (target 0 = malignant, target 1 = benign).
# Both counts live in one final expression so that both are displayed.
{
    'malignant': int((df['target'] == 0).sum()),
    'benign': int((df['target'] == 1).sum()),
}

### Train-test split

In [None]:
# Features are every column except the last one; labels are the 'target' column.
X = df.iloc[:, :-1]
y = df['target']
[np.shape(part) for part in (X, y)]

In [None]:
from sklearn.model_selection import train_test_split

# random_state is pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
[np.shape(part) for part in (X_train, X_test, y_train, y_test)]

### Train KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
# Binding the result of fit() to a name keeps the cell from echoing it.
fit = knn.fit(X_train, y_train)

### Predict

In [None]:
# Predict the class label using the mean value for each feature
# (the trailing 'target' entry of df.mean() is dropped before predicting).
X_mean = df.mean()[:-1].values.reshape(1, -1)

# A cell only echoes its last expression, so the prediction and the mean of y
# are returned together; otherwise the prediction would be silently discarded.
knn.predict(X_mean), np.mean(y)

In [None]:
# Predicted class label for every row of the held-out test split
knn.predict(X_test)

### Accuracy score

In [None]:
# Score of the fitted classifier on the held-out test split
knn.score(X_test, y_test)

### Plot accuracy

Visualise the prediction accuracy on the training and test sets, broken down by malignant and benign cells.

In [None]:
def accuracy_plot():
    """Draw a bar chart of the KNN accuracy per split and per class.

    Reads the notebook-level names ``knn``, ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``. The classifier is scored separately on the
    malignant (target == 0) and benign (target == 1) rows of the training
    and test splits, and the four scores are drawn as bars on a new figure,
    each bar labelled with its score rounded to two decimals.

    Returns nothing; the figure is left as the current matplotlib figure.
    """
    # (tick label, features, labels, class value, bar colour), in bar order
    groups = [
        ('Malignant\nTraining', X_train, y_train, 0, '#4c72b0'),
        ('Benign\nTraining', X_train, y_train, 1, '#4c72b0'),
        ('Malignant\nTest', X_test, y_test, 0, '#55a868'),
        ('Benign\nTest', X_test, y_test, 1, '#55a868'),
    ]

    # Score the classifier on the rows of each split that belong to one class
    scores = []
    for _label, features, labels, class_value, _colour in groups:
        mask = labels == class_value
        scores.append(knn.score(features[mask], labels[mask]))

    fig, ax = plt.subplots()

    # Plot the scores as a bar chart
    positions = np.arange(len(groups))
    bars = ax.bar(positions, scores, color=[group[4] for group in groups])

    # Directly label the score onto the bars, just below the top edge
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height * .90, '{:.2f}'.format(height),
                ha='center', color='w', fontsize=11)

    # Remove all the ticks (both axes), and the tick labels on the Y axis.
    # Booleans are required here: the strings 'off'/'on' are non-empty and
    # therefore truthy, which leaves the ticks visible.
    ax.tick_params(top=False, bottom=False, left=False, right=False,
                   labelleft=False, labelbottom=True)

    # Remove the frame of the chart
    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.set_xticks(positions)
    ax.set_xticklabels([group[0] for group in groups], alpha=0.8)
    ax.set_title('Training and Test Accuracies for Malignant and Benign Cells', alpha=0.8)

In [None]:
# Draw the per-class training/test accuracy chart defined above
accuracy_plot() 