# Classifier Visualisation

The purpose of this notebook is to let you visualize the decision boundaries of various classifiers.

The data used in this notebook is based on the [UCI Mushroom Data Set](http://archive.ics.uci.edu/ml/datasets/Mushroom?ref=datanews.io) stored in `mushrooms.csv`. 

In order to better visualize the decision boundaries, we'll perform Principal Component Analysis (PCA) on the data to reduce its dimensionality to 2. Dimensionality reduction will be covered in module 4 of this course.

---

_You are currently looking at **version 1.0** of this notebook._

---

In [None]:
%matplotlib inline

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [None]:
# Locate the dataset with pathlib instead of a `find | grep` shell pipeline, so
# the cell does not depend on Unix command-line tools being available.
# The file name is matched case-insensitively, and the matches are sorted so
# that `path[0]` is the same file on every run.
path = sorted(
    str(candidate)
    for candidate in Path('../..').rglob('*')
    if candidate.name.lower().endswith('mushrooms.csv') and candidate.is_file()
)
# Fail here with a clear message rather than with an IndexError on `path[0]` later.
if not path:
    raise FileNotFoundError("Could not find 'mushrooms.csv' anywhere under '../..'")
path

In [None]:
# Read the first located copy of the CSV, print its column summary,
# then display five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer=path[0])
df.info()
df.sample(n=5)

### Dummify features

In [None]:
# Replace categorical columns with indicator (dummy) columns.
df2 = pd.get_dummies(df)
# A bare `df2.shape` in the middle of a cell is evaluated and then discarded;
# print it so the size of the encoded table is actually shown.
print(df2.shape)
df2.sample(5)

### Sample set

In [None]:
# Work on a 20% random subset of the rows. The fixed seed makes the subset,
# and therefore everything derived from it below, identical on every run.
df3 = df2.sample(frac=0.2, random_state=0)

# Column 1 is used as the target and columns 2 onward as features; column 0 is dropped.
# NOTE(review): presumably columns 0 and 1 are the two dummies of the class
# label - confirm against the CSV's column order.
X = df3.iloc[:, 2:]
y = df3.iloc[:, 1]
list(map(np.shape, (X, y)))

### PCA

In [None]:
# Project the features onto their first two principal components.
# `random_state` pins the result when PCA picks a randomized SVD solver, so the
# 2-D coordinates (and every plot built on them) are reproducible.
# Note: despite its name, `pca` holds the transformed data, not the fitted PCA object.
pca = PCA(n_components=2, random_state=0).fit_transform(X)
pca

### Train-test split (on PCA features)

In [None]:
# Split the 2-D projection and its labels into train and test parts.
# The test size is left at the library default; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pca,
    y,
    random_state=0,
)

In [None]:
def plot_boundary(X, y, fitted_model, mesh_step_size=0.01):
    """Draw a three-panel figure describing a fitted two-feature classifier.

    Panels, left to right:

    1. scatter of the points, coloured by label (0 -> 'Edible', 1 -> 'Poisonous');
    2. the same scatter over the regions returned by ``fitted_model.predict``;
    3. the same scatter over the second column of ``fitted_model.predict_proba``,
       or a 'Probabilities Unavailable' notice when the model does not expose
       ``predict_proba``.

    The figure title shows the model's class name and ``fitted_model.score(X, y)``
    rounded to three decimals.

    Parameters
    ----------
    X : array-like with two columns
        Coordinates of the points to draw (first column on the x axis).
    y : array-like
        Labels for the rows of ``X``; compared against 0 and 1.
    fitted_model : object
        Already-fitted estimator providing ``predict`` and ``score`` and,
        optionally, ``predict_proba``.
    mesh_step_size : float, default 0.01
        Spacing of the grid on which the model is evaluated.

    Raises
    ------
    ValueError
        If ``mesh_step_size`` is not positive.
    """
    if mesh_step_size <= 0:
        raise ValueError('mesh_step_size must be positive')

    # Accept Series, lists and arrays alike for the indexing done below.
    points = np.asarray(X)
    labels = np.asarray(y)

    # Evaluation grid: the data's bounding box widened by one unit per side.
    # It is identical for every panel, so build it once instead of per panel.
    ax_xpad, ax_ypad = 1, 1
    x_min, x_max = points[:, 0].min() - ax_xpad, points[:, 0].max() + ax_xpad
    y_min, y_max = points[:, 1].min() - ax_ypad, points[:, 1].max() + ax_ypad
    xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                         np.arange(y_min, y_max, mesh_step_size))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Score once; the builtin round() also works if score() returns a plain float.
    accuracy = round(float(fitted_model.score(X, y)), 3)
    sup_title = 'Model: {}\nTest Accuracy:{}'.format(type(fitted_model).__name__, accuracy)

    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharex=True, sharey=True)
    fig.suptitle(sup_title, size=14)
    titles = ['Mushroom Dataset', 'Decision Boundary', 'Decision Probabilities']

    for i, (plot_type, ax) in enumerate(zip(titles, axes)):

        # Compute Decision Boundary & Decision Probabilities (if available)
        Z = None
        if i == 1:
            Z = fitted_model.predict(grid_points)
        elif i == 2:
            try:
                Z = fitted_model.predict_proba(grid_points)[:, 1]
            except AttributeError:   # model does not expose probabilities
                # Write on this panel explicitly rather than on pyplot's "current" axes.
                ax.text(0.4, 0.5, 'Probabilities Unavailable', horizontalalignment='center',
                        verticalalignment='center', transform=ax.transAxes, fontsize=12)
                ax.axis('off')
                continue

        # Plot PC1 vs PC2 - edible/poisonous
        ax.scatter(points[labels == 0, 0], points[labels == 0, 1], alpha=0.5, label='Edible', s=5)
        ax.scatter(points[labels == 1, 0], points[labels == 1, 1], alpha=0.5, label='Poisonous', s=5)

        # Plot fill (cast to float so boolean predictions are drawn as 0/1)
        if Z is not None:
            ax.imshow(np.asarray(Z, dtype=float).reshape(xx.shape), interpolation='nearest',
                      cmap='RdYlBu_r', alpha=0.25,
                      extent=(x_min, x_max, y_min, y_max), origin='lower')

        # Axis labels and legend only on the first panel (axes are shared)
        if i == 0:
            ax.set_xlabel('PC1', size=14)
            ax.set_ylabel('PC2', size=14)
            ax.legend(markerscale=3)

        ax.set_title(plot_type, size=14)
        ax.set_aspect('equal')

    fig.tight_layout()
    # Leave head-room for the two-line suptitle.
    fig.subplots_adjust(top=0.8, bottom=0.0, wspace=0.1)

In [None]:
# Models to compare. Every estimator that accepts a seed for its internal
# randomness gets one, so the figures and accuracies repeat across runs.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=20),
    DecisionTreeClassifier(max_depth=3, random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    SVC(kernel='linear'),
    SVC(kernel='rbf', C=1),
    SVC(kernel='rbf', C=10),
    GaussianNB(),
    MLPClassifier(random_state=0),
]

# Fit each model on the training part and draw its behaviour on the test part.
# Statements inside a loop produce no cell output, so no `_ =` is needed
# (and IPython's `_` is left untouched).
for model in classifiers:
    model.fit(X_train, y_train)
    plot_boundary(X_test, y_test, model)