GTI770 - Systèmes Intelligents et Apprentissage Machine

Alessandro L. Koerich

Notebook Jupyter - 10_SVM_UpperLowercaseHandwriting_52Classes

July 2018

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn import svm
import numpy as np

In [2]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [3]:
# Load data from file
# NIST Train 52 Classes Uppercase + Lowecase Handwritten Characters
# 74,880 samples for training
# 23,670 samples for validation
# 23,941 samples for testing
# 108-dimensional feature vectors
# 26 classes (A-Z uppercase characters) + 26 classes (a-z lowercase characters) 

TrainData = np.loadtxt('CSV_Files/Char_UpperLower52.train.csv', delimiter=' ', dtype=np.str)
ValidData = np.loadtxt('CSV_Files/Char_UpperLower52.val.csv', delimiter=' ', dtype=np.str)
TestData  = np.loadtxt('CSV_Files/Char_UpperLower52.test.csv' , delimiter=' ', dtype=np.str)


Xtrain =TrainData[0:74779,0:108].astype(np.float)
Ytrain =TrainData[0:74779,108:160].astype(np.int)

Xvalid = ValidData[0:23669,0:108].astype(np.float)
Yvalid = ValidData[0:23669,108:160].astype(np.int)

Xtest  = TestData[0:23940,0:108].astype(np.float)
Ytest  = TestData[0:23940,108:160].astype(np.int)

In [4]:
Xtrain

array([[0.324913, 0.193763, 0.115073, ..., 0.      , 0.168941, 0.270305],
       [0.226327, 0.083919, 0.055946, ..., 0.095636, 0.095636, 0.047818],
       [0.181888, 0.186941, 0.19452 , ..., 0.      , 0.108508, 0.018085],
       ...,
       [0.354641, 0.25424 , 0.143044, ..., 0.045833, 0.068749, 0.      ],
       [0.381843, 0.251339, 0.180449, ..., 0.      , 0.205226, 0.      ],
       [0.402374, 0.200176, 0.114579, ..., 0.      , 0.156343, 0.      ]])

In [5]:
Ytrain

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
from numpy import argmax
Ytrain2 = argmax(Ytrain, axis=1)
Yvalid2 = argmax(Yvalid, axis=1)
Ytest2  = argmax(Ytest, axis=1)

In [7]:
# normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
Xtrain = scaler.fit_transform(Xtrain)
Xvalid = scaler.fit_transform(Xvalid)
Xtest  = scaler.fit_transform(Xtest)

In [8]:
Ytrain2

array([26, 39, 22, ...,  6, 40,  6])

In [9]:
num_classes = Ytrain.shape[1]
input_dim   = Xtrain.shape[1]

In [10]:
input_dim

108

In [11]:
num_classes

52

In [12]:
def linearSVM_model():
    print("SVM with Linear Kernel\n")
    # create model
    model = svm.SVC(C=1.0, cache_size=2000, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

    return model

In [13]:
# Build the model
# Choose one at each time
model = linearSVM_model()

SVM with Linear Kernel



In [None]:
# Fit the model (TRAIN) 
model.fit(Xtrain, Ytrain2)

In [None]:
# Use the model to predict the class of samples
# Notice that we are testing the train dataset
Ytrain_pred = model.predict(Xtrain)
Ytrain_pred

Yvalid_pred = model.predict(Xvalid)
Yvalid_pred

Ytest_pred = model.predict(Xtest)
Ytest_pred

In [None]:
# You can also predict the probability of each class
# train dataset
Ytrain_pred_prob = model.predict_proba(Xtrain)
Ytrain_pred_prob

Yvalid_pred_prob = model.predict_proba(Xvalid)
Yvalid_pred_prob

Ytest_pred_prob = model.predict_proba(Xtest)
Ytest_pred_prob

In [None]:
# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Final evaluation of the model (On the Training, Validation or Test dataset)
scores = accuracy_score(Ytrain2, Ytrain_pred )
print("Correct classification rate for the training dataset = "+str(scores*100)+"%")

scores2 = accuracy_score(Yvalid2, Yvalid_pred )
print("Correct classification rate for the validation dataset = "+str(scores2*100)+"%")

scores3 = accuracy_score(Ytest2, Ytest_pred )
print("Correct classification rate for the test dataset = "+str(scores3*100)+"%")

In [None]:
from sklearn.metrics import classification_report
target_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z', 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
print( classification_report(Yvalid2, Yvalid_pred, target_names=target_names))
# This works, but we have labels with no predicted samples

In [None]:
# Predict and show the confusion matrix (For the Validation dataset)
cm = confusion_matrix(Yvalid2, Yvalid_pred )

In [None]:
import itertools
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap, aspect = 'auto')
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    #plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure(figsize = (20,20))
plot_confusion_matrix(cm, classes = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z', 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
, title='Confusion matrix, without normalization')

In [None]:
plt.show()

In [None]:
# Plot normalized confusion matrix
plt.figure(figsize = (28,28))
plot_confusion_matrix(cm, classes = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z', 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
, normalize=True, title='Confusion matrix, with normalization')

In [None]:
plt.show()

- HYPERPARAMETER OPTIMIZATION

OK, but we didn't optimize the parameters of the SVM, such as:

1) Kernel

2) Cost 

3) Kernel parameters (gamma)

But now, we already have a pre-defined VALIDATION dataset! So, we don't need to split the dataset and use cross-validation.

We will use the hypopt Python package (pip install hypopt). It's a professional package created specifically for parameter optimization with a validation set. It works with any scikit-learn model out-of-the-box and can be used with Tensorflow, PyTorch, etc. as well.

https://pypi.org/project/hypopt/1.0.0/

BE CAREFUL! With such amount of data, it will take tens of minutes to find the best parameters...

Actually, it took 1h30min in my iMac 3.2GHz Core i5.

In [24]:
# Set the parameters by cross-validation
# Assuming you already have train, test, val sets and a model.
from hypopt import GridSearch
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3], 'C': [100, 1000]},
                    {'kernel': ['linear'], 'C': [100, 1000]}]


In [27]:
# Grid-search all parameter combinations using a validation set.
tuned_model = GridSearch(model = svm.SVC(probability=True, verbose=True), param_grid=param_grid)
tuned_model.fit(Xtrain, Ytrain2, Xvalid, Yvalid2)

print("Best parameters set found on validation set:")
print()
print('Test Score for Optimized Parameters:', tuned_model.score(Xvalid, Yvalid2))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Best parameters set found on validation set:

Test Score for Optimized Parameters: 0.7293083780472348


In [None]:
print('We can view the best performing parameters and their scores.')
for z in tuned_model.get_param_scores()[:2]:
    p, s = z
    print(p)
    print('Score:', s)
print()
print('Verify that the lowest scoring parameters make sense.')
for z in tuned_model.get_param_scores()[-2:]:
    p, s = z
    print(p)
    print('Score:', s)

In [None]:
# Use the model to predict the class of samples
# Notice that we are testing the train dataset
Ytrain_pred = tuned_model.predict(Xtrain)
Ytrain_pred

Yvalid_pred = tuned_model.predict(Xvalid)
Yvalid_pred

Ytest_pred = tuned_model.predict(Xtest)
Ytest_pred

In [None]:
# You can also predict the probability of each class
# train dataset
Ytrain_pred_prob = tuned_model.predict_proba(Xtrain)
Ytrain_pred_prob

Yvalid_pred_prob = tuned_model.predict_proba(Xvalid)
Yvalid_pred_prob

Ytest_pred_prob = tuned_model.predict_proba(Xtest)
Ytest_pred_prob

In [None]:
# Final evaluation of the model (On the Training, Validation or Test dataset)
scores_tuned = accuracy_score(Ytrain2, Ytrain_pred )
print("Correct classification rate for the training dataset (first model) = "+str(scores*100)+"%")
print("Correct classification rate for the training dataset (best model) = "+str(scores_tuned*100)+"%")
print()
scores_tuned = accuracy_score(Yvalid2, Yvalid_pred )
print("Correct classification rate for the validation dataset (first model) = "+str(scores2*100)+"%")
print("Correct classification rate for the validation dataset (best model) = "+str(scores_tuned*100)+"%")
print()
scores_tuned = accuracy_score(Ytest2, Ytest_pred )
print("Correct classification rate for the test dataset (first model) = "+str(scores3*100)+"%")
print("Correct classification rate for the test dataset (best model) = "+str(scores_tuned*100)+"%")

In [None]:
# Predict and show the confusion matrix (For the Validation dataset)
cm = confusion_matrix(Yvalid2, Yvalid_pred )
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure(figsize = (28,28))
plot_confusion_matrix(cm, classes = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z', 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
, title='Confusion matrix, without normalization')

In [None]:
plt.show()

In [None]:
# Plot normalized confusion matrix
plt.figure(figsize = (28,28))
plot_confusion_matrix(cm, classes = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z', 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
, normalize=True, title='Confusion matrix, with normalization')

In [None]:
plt.show()

In [None]:
print("Notebook ended")