<a href="https://colab.research.google.com/github/bgalerne/M1MAS_Stat_Images/blob/master/TP_SVM_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab session on Support Vector Machines (SVM) for supervised classification

# Outline:
1. Linear and non-linear SVM classifiers for 2D data. Important parameters

2. SVM classifier for image classification:
  - Handwritten digits
  - CIFAR-10 database


# 1. Linear and non-linear SVM classifiers for 2D data
Most of the code is from scikit-learn's [Classifier comparison](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
# Build four 2D, two-class toy datasets, then split each one into a training
# part and a test part.

# First dataset: two informative features, one cluster per class.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
lsds = (X, y)

# Second dataset: the same points, every coordinate shifted by an independent
# uniform offset drawn from [0, 2). The labels are copied unchanged.
rng = np.random.RandomState(2)
X2 = X + 2 * rng.uniform(size=X.shape)
nslsds = (X2, y.copy())

# Third and fourth datasets: noisy "moons" and noisy concentric "circles".
moons = make_moons(noise=0.3, random_state=0)
circles = make_circles(noise=0.2, factor=0.5, random_state=1)

datasets = [lsds, nslsds, moons, circles]

# Standardize each dataset, then keep 60% for training and 40% for testing.
# Note: the scaler is fit on the full dataset, i.e. before the split.
datasets_train, datasets_test = [], []
for features, labels in datasets:
  features = StandardScaler().fit_transform(features)
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=.4, random_state=42)
  datasets_train.append((X_train, y_train))
  datasets_test.append((X_test, y_test))


In [0]:
# Plot each dataset: training points are opaque, test points are
# semi-transparent.
h = .02  # step size in the mesh (also used by the classifier-comparison cell)

# Two-color map for the two classes (red / blue).
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# Use len(datasets_train) for the grid size instead of len(datasets): the name
# `datasets` is rebound to the sklearn.datasets module further down in this
# notebook, so len(datasets) would fail if this cell is re-run afterwards.
n_rows = len(datasets_train)

figure = plt.figure(figsize=(3, 12))
for ds_cnt, (ds_train, ds_test) in enumerate(zip(datasets_train, datasets_test)):
  X_train, y_train = ds_train
  X_test, y_test = ds_test
  X = np.concatenate((X_train, X_test))  # all data

  # Axis limits: the data range padded by 0.5, sampled with step h so that
  # the limits match those of the mesh used for the decision-boundary plots.
  x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
  y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
  xs = np.arange(x_min, x_max, h)
  ys = np.arange(y_min, y_max, h)

  ax = plt.subplot(n_rows, 1, ds_cnt + 1)
  if ds_cnt == 0:
      ax.set_title("Input data")
  # Plot the training points
  ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
              edgecolors='k')
  # Plot the testing points
  ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
              edgecolors='k')
  ax.set_xlim(xs.min(), xs.max())
  ax.set_ylim(ys.min(), ys.max())
  ax.set_xticks(())
  ax.set_yticks(())

plt.tight_layout()
plt.show()

# Question:

Which dataset is linearly separable?

# Linear SVM vs. kernel SVM
For each dataset we fit a linear SVM and a kernel SVM using the Gaussian radial basis function (RBF) kernel

$$
k(x,x') = e^{-\gamma \|x-x'\|^2}.
$$

For each classifier, the accuracy on the test set is displayed in the plot. Recall that the accuracy is the ratio of correctly classified points among all points:
$$
\text{Accuracy} = \frac{\text{TP}+\text{TN}}{\text{TP}+\text{FP}+\text{TN}+ \text{FN}}
$$



In [0]:
# Define two classifiers
names = ["Linear SVM", "RBF SVM"]
classifiers = [SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1)]

# Colormaps: diverging red/blue for the decision function, pure red/blue for
# the two classes of points.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# Grid layout: one row per dataset, one column for the raw data plus one per
# classifier. Use len(datasets_train) instead of len(datasets): the name
# `datasets` is rebound to the sklearn.datasets module further down in this
# notebook, so len(datasets) would fail if this cell is re-run afterwards.
n_rows = len(datasets_train)
n_cols = len(classifiers) + 1

figure = plt.figure(figsize=(9, 9))
i = 1
# iterate over datasets: for each dataset: train each classifier + plot result
for ds_cnt, ds in enumerate(datasets_train):
  X_train, y_train = ds
  X_test, y_test = datasets_test[ds_cnt]
  X = np.concatenate((X_train, X_test))  # all data

  # Mesh covering the data range padded by 0.5, with step h (h is defined in
  # the dataset-plotting cell above).
  x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
  y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
  xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                        np.arange(y_min, y_max, h))
  # Mesh nodes as an (n_points, 2) array, built once per dataset.
  grid_points = np.c_[xx.ravel(), yy.ravel()]

  # just plot the dataset first
  ax = plt.subplot(n_rows, n_cols, i)
  if ds_cnt == 0:
      ax.set_title("Input data")
  # Plot the training points
  ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
              edgecolors='k')
  # Plot the testing points
  ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
              edgecolors='k')
  ax.set_xlim(xx.min(), xx.max())
  ax.set_ylim(yy.min(), yy.max())
  ax.set_xticks(())
  ax.set_yticks(())
  i += 1

  # iterate over classifiers
  for name, clf in zip(names, classifiers):
    ax = plt.subplot(n_rows, n_cols, i)

    # train classifier (the same estimator object is re-fit for each dataset):
    clf.fit(X_train, y_train)

    # evaluate classifier accuracy on the test set:
    score = clf.score(X_test, y_test)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max] according to the value
    # of the decision function at that point.
    Z = clf.decision_function(grid_points)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                edgecolors='k', alpha=0.6)

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    if ds_cnt == 0:
      ax.set_title(name)
    # Test accuracy in the lower-right corner, without the leading zero.
    ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')
    i += 1

plt.tight_layout()
plt.show()






## Exercise

Define two additional classifiers using the Gaussian RBF kernel with other $\gamma$ values. 
Choose extreme values of $\gamma$ for which:
 - The first new SVM classifier is close to a linear classifier
 - The second new SVM classifier suffers from overfitting

# 2. Recognizing hand-written digits using an SVM
Most of the code is from scikit-learn's [Recognizing hand-written digits](https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html).

In [0]:
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split


In [0]:

# Load the digits dataset.
digits = datasets.load_digits()

# The samples are 8x8 images of digits, stored in the `images` attribute of
# the dataset; the digit each image represents is given in `target`. (When
# working from image files instead, they could be loaded with
# matplotlib.pyplot.imread; every image must have the same size.)
#
# Show the first 30 images on a 3x10 grid, filled row by row.
_, axes = plt.subplots(3, 10, figsize=(20, 6))
images_and_labels = list(zip(digits.images, digits.target))
for ax, (image, label) in zip(axes.ravel(), images_and_labels[:30]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Training: %i' % label)

plt.show()

In [0]:
# Data processing:
# A classifier expects a (samples, features) matrix, so each 8x8 image is
# flattened into a single row of pixel values.
n_samples = digits.images.shape[0]
data = digits.images.reshape(n_samples, -1)

# Split into train and test subsets of equal proportion. No shuffling: the
# first part of the data is used for training, the rest for testing.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    digits.target,
    test_size=0.5,
    shuffle=False,
)


In [0]:
# Classifier training:
# Create a classifier: a support vector classifier (default RBF kernel)
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the training part of the data
classifier.fit(X_train, y_train)

# Now predict the value of the digit on the test part:
predicted = classifier.predict(X_test)

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, predicted)))

# Confusion matrix of the test predictions. metrics.plot_confusion_matrix was
# removed from scikit-learn (version 1.2), so the matrix is computed
# explicitly and drawn with ConfusionMatrixDisplay, which is available in
# both old and recent releases. This also reuses `predicted` instead of
# running the prediction a second time.
confusion = metrics.confusion_matrix(y_test, predicted,
                                     labels=classifier.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion,
                                      display_labels=classifier.classes_).plot()
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()

In [0]:
# Plot the predictions for the first 30 test images on a 3x10 grid
# (filled row by row). Each title shows the true label and the prediction.

# Recover the test images from the flattened test matrix, so that images,
# labels and predictions are guaranteed to come from the same split
# (rather than re-deriving the split position by hand).
test_images = X_test.reshape((-1,) + digits.images.shape[1:])

_, axes = plt.subplots(3, 10, figsize=(20, 6))
images_labels_and_predictions = list(zip(test_images, y_test, predicted))
# Consecutive samples 0..29: the third row shows samples 20..29 (the previous
# slice [30:40] skipped them).
for ax, (image, label, pred) in zip(axes.ravel(),
                                    images_labels_and_predictions[:30]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Test: %i / Pred: %i' % (label, pred))

plt.show()


# Exercise:
1. Display 10 images that are misclassified.
2. Is the performance sensitive to the parameter values?