## Simulate expected misclassification rate

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

%matplotlib inline
%config InlineBackend.figure_formats = {'retina',}
plt.style.use('seaborn-white')

#### Model to be used for generating data with two classes:
<BR>
$y_i = 0,\quad x_i \sim N_{10}(0,  I_{10})$

$y_i = 1,\quad x_i \sim N_{10}(\mu,  I_{10})\,$ with $\mu = (0,0,0,0,0,1,1,1,1,1)$

$x_i \in \mathbb{R}^{10}$, normally distributed

$y_i$ equally divided between the two classes (balanced dataset)
<BR>

### Sample from a multivariate normal distribution

In [None]:
# Class means: class 0 is centred at the origin, class 1 is shifted by one
# unit along the last five of the ten coordinates.
mu1 = np.repeat(0, 10)
mu2 = np.repeat([0, 1], 5)
# Identity covariance matrix (10 x 10).
sigma = np.identity(10)
# Five example observations drawn from the class-0 distribution.
X = multivariate_normal(mean=mu1, cov=sigma).rvs(5)

# Print each value explicitly: a notebook cell only echoes its last bare
# expression, and the name `mu` previously referenced here was never defined.
print(mu1)
print(mu2)
print(sigma)
print(X)

In [None]:
def _draw_balanced_sample(class_dists, sample_n, n_features, rng):
    """Draw ``sample_n // 2`` points from each class distribution.

    Returns the stacked feature matrix (rows of the first distribution come
    first) and the matching label vector of 0s followed by 1s.
    """
    per_class = sample_n // 2
    # ``rvs`` squeezes its output, so a single draw comes back 1-D; reshaping
    # keeps every per-class block 2-D with shape (per_class, n_features).
    blocks = [
        np.reshape(dist.rvs(per_class, random_state=rng), (per_class, n_features))
        for dist in class_dists
    ]
    return np.vstack(blocks), np.repeat([0, 1], per_class)


def simulate_clf_error(clf, train_sample_n=100, test_sample_n=2000, random_state=None):
    """Fit ``clf`` on simulated two-class data and return its test error.

    Both classes are 10-dimensional normal with identity covariance. Class 0
    has mean zero; class 1 has mean ``np.repeat([0, 1], 5)``. Each class
    contributes ``n // 2`` observations, so an odd sample size yields one
    observation fewer than requested.

    Parameters
    ----------
    clf : object
        Any estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.
    train_sample_n : int
        Total number of training observations (split evenly over the classes).
    test_sample_n : int
        Total number of test observations (split evenly over the classes).
    random_state : None, int, numpy RandomState or numpy Generator
        Source of randomness for the simulated data. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global random
        state.

    Returns
    -------
    float
        ``1 - clf.score(X_test, y_test)``, i.e. the misclassification rate on
        the test sample, assuming ``score`` returns accuracy.
    """
    n_features = 10
    identity_cov = np.identity(n_features)
    # Build each frozen distribution once and reuse it for train and test.
    class_dists = (
        multivariate_normal(mean=np.repeat(0, n_features), cov=identity_cov),
        multivariate_normal(mean=np.repeat([0, 1], n_features // 2), cov=identity_cov),
    )

    # A bare seed handed to every ``rvs`` call would restart the stream each
    # time and make the test noise a copy of the training noise, so turn a
    # seed into one generator object shared by all draws.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate training sample and train classifier
    X_train, y_train = _draw_balanced_sample(class_dists, train_sample_n, n_features, rng)
    clf.fit(X_train, y_train)

    # Generate large set of test data and return error rate of classifier
    X_test, y_test = _draw_balanced_sample(class_dists, test_sample_n, n_features, rng)
    return 1 - clf.score(X_test, y_test)

#### Run simulations

In [None]:
# Number of independent train/test simulations per classifier.
repeats = 1000

# Every run gets a freshly constructed, unfitted estimator. The classifiers
# are simulated one after another: radial SVM, linear SVM, logistic regression.
svm_radial = [
    simulate_clf_error(SVC(kernel='rbf'))
    for _ in range(repeats)
]
svm_linear = [
    simulate_clf_error(SVC(kernel='linear'))
    for _ in range(repeats)
]
log_regr = [
    simulate_clf_error(LogisticRegression(C=100))
    for _ in range(repeats)
]

#### Average error rate

In [None]:
# Summarise the simulated error rates of each classifier. The mean and the
# (population) standard deviation are both rounded to three decimals so the
# two columns are reported at the same precision.
for label, errors in [('SVM - radial kernel', svm_radial),
                      ('SVM - linear kernel', svm_linear),
                      ('Logistic regression', log_regr)]:
    print('{}: mean: {} sd: {}'.format(label, np.mean(errors).round(3), np.std(errors).round(3)))

### Visualise

In [None]:
# Overlay the per-simulation error rates of the three classifiers in one plot.
error_series = (
    (svm_radial, 'g', 'SVM Radial'),
    (svm_linear, 'r', 'SVM Linear'),
    (log_regr, 'b', 'Logistic Regression'),
)
for errors, colour, name in error_series:
    plt.plot(errors, colour, alpha=0.4, label=name)

# One horizontal line per classifier at its mean error rate: stack the three
# result lists as columns and average down each column.
mean_errors = np.mean(np.c_[svm_radial, svm_linear, log_regr], axis=0)
plt.hlines(mean_errors, 0, repeats, colors=['g', 'r', 'b'])

plt.title('Simulation: expected misclassification rates')
plt.xlabel('Simulation')
plt.ylabel('Error rate')
plt.legend();

In [None]:
# One panel per classifier, sharing the y axis so error rates are directly
# comparable. Results are assigned to `_` to keep the notebook from echoing
# the returned Matplotlib objects.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(18, 5))

# Each panel shows the per-simulation error rate plus a horizontal line at
# that classifier's mean error rate.
_ = ax1.plot(svm_radial, 'g', alpha=0.4, label='SVM Radial')
_ = ax1.hlines(np.mean(svm_radial), 0, repeats, colors='g')
_ = ax1.set_ylabel('Error rate')

_ = ax2.plot(svm_linear, 'r', alpha=0.4, label='SVM Linear')
_ = ax2.hlines(np.mean(svm_linear), 0, repeats, colors='r')

_ = ax3.plot(log_regr, 'b', alpha=0.4, label='Logistic Regression')
_ = ax3.hlines(np.mean(log_regr), 0, repeats, colors='b')

_ = fig.suptitle('Simulation: expected misclassification rates', fontsize=16)
_ = fig.subplots_adjust(wspace=0.02)

for ax in fig.axes:
    _ = ax.set_xlabel('{} simulations'.format(repeats))
    # Hide the x tick labels. tick_params takes a boolean here; the legacy
    # string 'off' is not a boolean and must not be relied on to mean False.
    _ = ax.tick_params(labelbottom=False)
    _ = ax.legend()