# Model Selection

In [None]:
#numeric: pandas and numpy
import numpy as np
import pandas as pd
# graphics
%matplotlib inline 
import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec 

## Toy Data

In [None]:
from sklearn.datasets import make_blobs, make_circles, make_moons

# Fixed seed so the toy data is identical on every Restart & Run All.
X_blobs, y_blobs = make_blobs(n_samples=1000, centers=2, n_features=2, random_state=0)
X_moon, y_moon = make_moons(n_samples=1000, shuffle=True, noise=None, random_state=0)
X_cir, y_cir = make_circles(n_samples=1000, shuffle=True, noise=None, random_state=0, factor=0.4)


def _scatter_classes(ax, X, y, title):
    """Draw a two-class 2-D dataset on `ax`: class 0 in red, class 1 in green."""
    for cls, color in ((0, 'r'), (1, 'g')):
        mask = y == cls
        # The label is the class id so the legend distinguishes the two groups.
        ax.scatter(X[mask, 0], X[mask, 1], marker='s', color=color, label=str(cls))
    ax.set_title(title)
    ax.legend()


fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

_scatter_classes(ax1, X_blobs, y_blobs, 'Blobs')
_scatter_classes(ax2, X_moon, y_moon, 'Moon')
_scatter_classes(ax3, X_cir, y_cir, 'Circle')

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of every dataset held out for testing
test_size = 0.3

names = ['Blobs', 'Moon', 'Circle']

# One train_test_split result (X_train, X_test, y_train, y_test) per dataset,
# in the same order as `names`; splits are stratified on the labels and seeded.
datasets = [
    train_test_split(features, labels, test_size=test_size, random_state=0, stratify=labels)
    for features, labels in ((X_blobs, y_blobs), (X_moon, y_moon), (X_cir, y_cir))
]

# Per-dataset aliases for the entries of `datasets`
d_blobs, d_moon, d_cir = datasets

In [None]:
import math
import scipy

# Builds the 2x2 contingency table used by the McNemar test
def create_McNemar_matrix(y_test, y_pred1, y_pred2):
    """Cross-tabulate the correctness of two classifiers on the same samples.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred1, y_pred2 : array-like
        Predictions of model 1 and model 2 for the same samples, in the same
        order as `y_test`.

    Returns
    -------
    numpy.ndarray
        ``[[a, b], [c, d]]`` where
        a = both models correct, b = only model 1 correct,
        c = only model 2 correct, d = both models wrong.

    Raises
    ------
    ValueError
        If the three inputs do not contain the same number of elements.
    """
    # Convert to flat arrays so samples are matched by position; indexing a
    # pandas Series with y_test[i] would look up by index label instead.
    truth = np.asarray(y_test).ravel()
    pred1 = np.asarray(y_pred1).ravel()
    pred2 = np.asarray(y_pred2).ravel()

    if not (truth.shape == pred1.shape == pred2.shape):
        raise ValueError(
            'y_test, y_pred1 and y_pred2 must have the same length '
            '(got {}, {} and {})'.format(truth.size, pred1.size, pred2.size)
        )

    ok1 = truth == pred1
    ok2 = truth == pred2

    a = int(np.sum(ok1 & ok2))
    b = int(np.sum(ok1 & ~ok2))
    c = int(np.sum(~ok1 & ok2))
    d = int(np.sum(~ok1 & ~ok2))

    return np.array([[a, b], [c, d]])


# Derive the accuracy of each model from the McNemar table
def model_acc_McNemar(matrix):
    """Return ``(accuracy_model_1, accuracy_model_2)`` from a McNemar table.

    `matrix` is indexed as a 2-D numpy array. A model's accuracy is the sum of
    the top-left cell and that model's own off-diagonal cell (``[0, 1]`` for
    model 1, ``[1, 0]`` for model 2), divided by the sum of all cells.
    """
    total = matrix.sum()
    both_right = matrix[0, 0]
    only_first = matrix[0, 1]
    only_second = matrix[1, 0]
    return ((both_right + only_first) / total, (both_right + only_second) / total)


# Computes the McNemar statistic and an exact two-sided p-value
def McNemar_test(matrix):
    """Run McNemar's test on a 2x2 table of paired classifier outcomes.

    Only the discordant cells are used: ``b = matrix[0, 1]`` and
    ``c = matrix[1, 0]``.

    Returns
    -------
    (chi2, p) : tuple of float
        chi2 is the continuity-corrected statistic ``(|b - c| - 1)^2 / (b + c)``.
        p is the exact two-sided binomial p-value
        ``min(2 * P[X <= min(b, c)], 1)`` with ``X ~ Binomial(b + c, 0.5)``.
        When there are no discordant pairs (``b + c == 0``) the result is
        ``(0.0, 1.0)``.
    """
    # Local import: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule has been loaded.
    from scipy.stats import binom

    b = matrix[0, 1]
    c = matrix[1, 0]
    n = b + c

    # No discordant pairs: the statistic would be 0/0, so report 0 with p = 1
    # rather than a spurious non-zero value.
    if n == 0:
        return (0.0, 1.0)

    chi2 = float((abs(b - c) - 1.0) ** 2 / n)
    p = min(2.0 * float(binom.cdf(min(b, c), n, 0.5)), 1.0)

    return (chi2, p)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

# Feature standardiser shared by all experiments
scaler = StandardScaler()

# Significance level for the McNemar test
a = 0.05

# Support vector machines: linear kernel vs. RBF kernel
svm_linear = SVC(kernel='linear')
svm_rbf = SVC(kernel='rbf')

# Perceptron with no penalty term and learning rate eta0=0.1
ppn = Perceptron(
    penalty=None,
    alpha=0.0001,
    fit_intercept=True,
    tol=None,
    eta0=0.1,
    n_jobs=1,
    random_state=0,
    class_weight=None,
    warm_start=False,
)

# Multilayer perceptron: regularisation strength alpha, up to 1000 iterations
mlp = MLPClassifier(alpha=0.0001, random_state=0, max_iter=1000)

# Model pairs to compare, with their display names in the same order
models = [
    (svm_linear, svm_rbf),
    (ppn, mlp),
]
names_models = [
    ('SVM(Linear)', 'SVM(RBF)'),
    ('Perceptron', 'Multilayer Neural Network'),
]

# For every dataset, fit each pair of models on the standardised training split
# and use the McNemar test on the held-out split to decide whether the two
# models differ at significance level `a`.
for dataset_name, (X_train, X_test, y_train, y_test) in zip(names, datasets):
    print('Dataset: {}'.format(dataset_name))
    # Fit the scaler on the training split only, then apply it to the test split
    Xs = scaler.fit_transform(X_train)
    Xtest = scaler.transform(X_test)

    for (model1, model2), (name1, name2) in zip(models, names_models):
        print('Comparing Models {} & {}'.format(name1, name2))

        # Model 1 Learning
        model1.fit(Xs, y_train)
        y_pred_1 = model1.predict(Xtest)

        # Model 2 Learning
        model2.fit(Xs, y_train)
        y_pred_2 = model2.predict(Xtest)

        m = create_McNemar_matrix(y_test, y_pred_1, y_pred_2)
        print(m)
        m1_acc, m2_acc = model_acc_McNemar(m)
        print('Model 1 = {}; Model 2 = {}'.format(m1_acc, m2_acc))
        # Only the p-value drives the decision below
        chi2, p = McNemar_test(m)

        if p > a:
            print('There is no statistical difference between the models')
        elif m1_acc > m2_acc:
            print('Model 1 is better than Model 2 ({})'.format(m1_acc))
        else:
            print('Model 2 is better than Model 1 ({})'.format(m2_acc))
        print('*********************************')
    print('---------------------------------')