
# Classifier comparison


A comparison of several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.



In [None]:
%matplotlib inline

In [None]:
import warnings

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_moons, make_circles, make_classification

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Classifiers

In [None]:
# Display names, in the same order as the estimators in `classifiers`.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

In [None]:
# Estimators to compare, in the same order as the labels in `names`.
# The estimators that draw random numbers while fitting get a fixed
# `random_state` so the plotted boundaries and accuracies are reproducible
# from one run of the notebook to the next.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(kernel=1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=42),
    MLPClassifier(alpha=1, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
# Show which label belongs to which estimator (displayed as the cell output).
dict(zip(names, classifiers))

### Create linear dataset with noise

1. create a linearly separable data set
2. add random noise to X

In [None]:
# Two informative features, no redundant ones, one cluster per class.
X, y = make_classification(
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=1,
)

# Add uniform noise drawn from [0, 2) to every coordinate, in place.
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)

linearly_separable = X, y

### Datasets: 

1. moons
2. circles
3. linear

In [None]:
# Each entry is an (X, y) pair; the plotting cell iterates over them in order.
datasets = [
    make_moons(noise=0.3, random_state=0),                # 1. moons
    make_circles(noise=0.2, factor=0.5, random_state=1),  # 2. circles
    linearly_separable,                                   # 3. linear
]

### Create mesh

In [None]:
def mesh(X, step=.02, margin=.5):
    """Build a padded evaluation grid over the first two columns of X.

    Parameters
    ----------
    X : 2-D array; only columns 0 and 1 are read.
    step : spacing between neighbouring grid points along each axis.
    margin : padding subtracted from each column minimum and added to each
        column maximum before the grid is generated.

    Returns
    -------
    dict with keys
        'xx', 'yy'         : coordinate matrices from ``np.meshgrid``
        'xy_series'        : the same points flattened to one (x, y) row each
        'xx_min', 'xx_max' : smallest / largest value in ``xx``
        'yy_min', 'yy_max' : smallest / largest value in ``yy``
    """
    first_feature, second_feature = X[:, 0], X[:, 1]

    # np.arange excludes the stop value, so the grid ends just below max + margin.
    x_ticks = np.arange(np.min(first_feature) - margin,
                        np.max(first_feature) + margin,
                        step)
    y_ticks = np.arange(np.min(second_feature) - margin,
                        np.max(second_feature) + margin,
                        step)
    xx, yy = np.meshgrid(x_ticks, y_ticks)

    return {
        'xx': xx,
        'yy': yy,
        'xy_series': np.column_stack((xx.ravel(), yy.ravel())),
        'xx_min': xx.min(),
        'xx_max': xx.max(),
        'yy_min': yy.min(),
        'yy_max': yy.max(),
    }

### Test mesh

In [None]:
# Build a grid for the current X with the default step and margin; the
# assertions in the next cell inspect this dictionary.
mesh_ = mesh(X)

In [None]:
# The stored extrema must agree with the coordinate matrices themselves.
np.testing.assert_almost_equal(mesh_['xx'].min(), mesh_['xx_min'], 5)
np.testing.assert_almost_equal(mesh_['yy'].min(), mesh_['yy_min'], 5)
np.testing.assert_almost_equal(mesh_['xx'].max(), mesh_['xx_max'], 5)
np.testing.assert_almost_equal(mesh_['yy'].max(), mesh_['yy_max'], 5)

# The checks above only compare the dictionary with itself, so also tie the
# grid to the data: it must start exactly one margin (mesh() default: 0.5)
# below the smallest coordinate ...
np.testing.assert_almost_equal(mesh_['xx_min'], X[:, 0].min() - .5, 5)
np.testing.assert_almost_equal(mesh_['yy_min'], X[:, 1].min() - .5, 5)

# ... and extend past the largest coordinate so no data point falls outside it.
assert mesh_['xx_max'] >= X[:, 0].max()
assert mesh_['yy_max'] >= X[:, 1].max()

# Every grid point appears exactly once in the flattened series.
assert mesh_['xy_series'].shape == (mesh_['xx'].size, 2)

### Plot dataset helper

In [None]:
def plot_dataset(X_train, X_test, y_train, y_test, ax, cmap=None, xlim=None, ylim=None):
    """Scatter the training and testing points of one dataset on `ax`.

    Training points are drawn fully opaque and testing points with
    ``alpha=0.6``; both are coloured by their label. Afterwards the axis
    limits are fixed, the ticks are removed and the layout is tightened.

    Parameters
    ----------
    X_train, X_test : 2-D arrays; columns 0 and 1 are used as x and y.
    y_train, y_test : label arrays used to colour the points.
    ax : matplotlib axes to draw on.
    cmap : colormap for the labels. Defaults to the notebook-level
        ``cm_bright``.
    xlim, ylim : ``(low, high)`` axis limits. Default to the notebook-level
        ``(xx_min, xx_max)`` and ``(yy_min, yy_max)``.

    Notes
    -----
    The fallbacks read notebook globals that are only defined once the later
    cells have run; pass the values explicitly to use the function on its own.
    """
    if cmap is None:
        cmap = cm_bright
    if xlim is None:
        xlim = (xx_min, xx_max)
    if ylim is None:
        ylim = (yy_min, yy_max)

    # Plot training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap, edgecolors='k')

    # Plot testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, alpha=0.6, edgecolors='k')

    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    ax.set_xticks(())
    ax.set_yticks(())
    plt.tight_layout()

### 

### Setup axes and color scheme

In [None]:
# The figure and the subplot-grid counters (ax_0, ax_1, ax_i) are created in
# the plotting cell itself. Creating them here as well only produced an empty,
# never-used figure, so this cell now defines the colour scheme only.

# Map color to prediction/probability score
cm = plt.cm.RdBu

# Map color to labels
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

### Run classifiers and plot results

In [None]:
def _styled_subplot(n_rows, n_cols, index, x_range, y_range):
    """Return subplot `index` of an n_rows x n_cols grid with fixed limits and no ticks."""
    panel = plt.subplot(n_rows, n_cols, index)
    panel.set_xlim(*x_range)
    panel.set_ylim(*y_range)
    panel.set_xticks(())
    panel.set_yticks(())
    return panel


def _scatter_split(panel, X_train, X_test, y_train, y_test, cmap):
    """Draw training points opaque and testing points semi-transparent on `panel`."""
    panel.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap, edgecolors='k')
    panel.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, edgecolors='k', alpha=0.6)


# One row per dataset; one column for the raw data plus one per classifier.
figure = plt.figure(figsize=(27, 9))
ax_0, ax_1 = len(datasets), len(classifiers) + 1

# Subplot positions are 1-based and advance row by row.
ax_i = 1

with warnings.catch_warnings():
    # Some classifiers need more iterations to converge and emit a
    # ConvergenceWarning. NOTE: this filter silences every warning category
    # raised inside the block, not only that one.
    warnings.simplefilter("ignore")

    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):

        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

        # Get mesh vars for each dataset
        mesh_ = mesh(X)
        xx, yy = mesh_['xx'], mesh_['yy']
        xx_min, xx_max, yy_min, yy_max = mesh_['xx_min'], mesh_['xx_max'], mesh_['yy_min'], mesh_['yy_max']
        xy_series = mesh_['xy_series']

        # First column of the row: the raw training and testing points
        ax = _styled_subplot(ax_0, ax_1, ax_i, (xx_min, xx_max), (yy_min, yy_max))
        if ds_cnt == 0:
            ax.set_title("Input data")
        _scatter_split(ax, X_train, X_test, y_train, y_test, cm_bright)
        ax_i += 1

        # iterate over classifiers within the datasets
        for name, clf in zip(names, classifiers):

            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Get prediction value or probability score for each point in the grid
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(xy_series)
            else:
                Z = clf.predict_proba(xy_series)[:, 1]

            ax = _styled_subplot(ax_0, ax_1, ax_i, (xx_min, xx_max), (yy_min, yy_max))

            # Plot the decision boundaries by assigning a color to each point in the grid (xx, yy);
            # drawn before the scatter so the points stay on top.
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot also the training points and testing points
            _scatter_split(ax, X_train, X_test, y_train, y_test, cm_bright)

            if ds_cnt == 0:
                ax.set_title(name)

            # Add score to the lower-right corner, without the leading zero (e.g. ".93")
            ax.text(xx_max - .3, yy_min + .3, ('{:.2f}'.format(score)).lstrip('0'), size=18, weight='bold', horizontalalignment='right')

            # Progress indicator: one dot per fitted classifier
            print('.', end='')
            ax_i += 1

plt.tight_layout()
plt.show()
