GTI770 - Systèmes Intelligents et Apprentissage Machine

Alessandro L. Koerich

Notebook Jupyter - 11_Voting_UppercaseHandwriting_26Classes

July 2018

In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [3]:
# Load data from file
# NIST Train 26 Classes Uppercase Handwritten Characters
# 37,440 samples for training
# 12,092 samples for validation
# 11,941 samples for testing
# 108-dimensional feature vectors
# 26 classes (A-Z uppercase characters)

TrainData = np.loadtxt('CSV_Files/NISTUpperHandwritten_train.csv', delimiter=' ', dtype=np.str)
ValidData = np.loadtxt('CSV_Files/NISTUpperHandwritten_valid.csv', delimiter=' ', dtype=np.str)
TestData  = np.loadtxt('CSV_Files/NISTUpperHandwritten_test.csv' , delimiter=' ', dtype=np.str)

Xtrain =TrainData[0:37439,0:108].astype(np.float)
Ytrain =TrainData[0:37439,108:134].astype(np.int)

Xvalid = ValidData[0:12091,0:108].astype(np.float)
Yvalid = ValidData[0:12091,108:134].astype(np.int)

Xtest  = TestData[0:11940,0:108].astype(np.float)
Ytest  = TestData[0:11940,108:134].astype(np.int)

In [4]:
from numpy import argmax
Ytrain2 = argmax(Ytrain, axis=1)
Yvalid2 = argmax(Yvalid, axis=1)
Ytest2  = argmax(Ytest , axis=1)

In [5]:
# normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))

Xtrain = scaler.fit_transform(Xtrain)
Xvalid = scaler.fit_transform(Xvalid)
Xtest  = scaler.fit_transform(Xtest)

In [6]:
num_classes = Ytrain.shape[1]
input_dim   = Xtrain.shape[1]

In [7]:
# Let's define different learning algorithm models
from sklearn.tree import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import svm

def DT_model():
    print("Decision Tree\n")
    # create model
    model = tree.DecisionTreeClassifier(criterion='entropy', 
                                        max_depth=10, min_samples_leaf=10, 
                                        min_samples_split=20 )
    return model


def NB_model():
    print("Naive Bayes Normal Distribution\n")
    model = GaussianNB()
    
    return model


def KNN_model():
    print("1-NN\n")
    n_neighbors = 3
    weights = 'uniform'
    metric = 'euclidean'
    algorithm = 'kd_tree'

    model = neighbors.KNeighborsClassifier(n_neighbors, 
                                               weights=weights, 
                                               algorithm=algorithm, 
                                               metric=metric )
    return model


def linearSVM_model():
    print("SVM with Linear Kernel\n")
    # create model
    model = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=2, gamma='auto', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

    return model

In [8]:
# Build the four models

model_DT  = DT_model()
model_NB  = NB_model()
model_KNN = KNN_model()
model_SVM = linearSVM_model()

Decision Tree

Naive Bayes Normal Distribution

1-NN

SVM with Linear Kernel



In [9]:
from sklearn.ensemble import VotingClassifier

# Building the Ensemble 
# Combination with MAJORITY VOTE (labels only)

def ENSEMBLE_model_3():
    print("Majority Vote of Decision Tree, Naive Bayes and SVM\n")
    model = VotingClassifier(estimators = [ ('DT', model_DT), ('NB', model_NB) , ('SVM', model_SVM) ], 
                             voting = 'hard',
                             n_jobs = 3) 
    return model


def ENSEMBLE_model_2():
    print("Majority Vote of Decision Tree and Naive Bayes\n")
    model = VotingClassifier(estimators = [ ('DT', model_DT), ('NB', model_NB)], 
                             voting = 'hard',
                             n_jobs = 2) 
    return model

In [10]:
# Build the ENSEMBLE model

model_ENSEMBLE  = ENSEMBLE_model_2()
# model_ENSEMBLE  = ENSEMBLE_model_3()

Majority Vote of Decision Tree and Naive Bayes



In [11]:
# Fit the models (TRAIN)
model_DT.fit(Xtrain, Ytrain2)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
# Fit the models (TRAIN)
model_NB.fit(Xtrain, Ytrain2)

GaussianNB(priors=None, var_smoothing=1e-09)

In [13]:
# Fit the models (TRAIN)
model_KNN.fit(Xtrain, Ytrain2)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [14]:
# Fit the models (TRAIN)
model_SVM.fit(Xtrain, Ytrain2)

[LibSVM]

KeyboardInterrupt: 

In [15]:
# Fit the Ensemble (TRAIN)
model_ENSEMBLE.fit(Xtrain, Ytrain2)

VotingClassifier(estimators=[('DT', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')), ('NB', GaussianNB(priors=None, var_smoothing=1e-09))],
         flatten_transform=None, n_jobs=2, voting='hard', weights=None)

In [16]:
# Use the model to predict the class of samples
# Notice that we are testing on the 3 data splits

Ytrain_pred_DT = model_DT.predict(Xtrain)
Yvalid_pred_DT = model_DT.predict(Xvalid)
Ytest_pred_DT  = model_DT.predict(Xtest)

Ytrain_pred_NB = model_NB.predict(Xtrain)
Yvalid_pred_NB = model_NB.predict(Xvalid)
Ytest_pred_NB  = model_NB.predict(Xtest)

# Ytrain_pred_KNN = model_KNN.predict(Xtrain)
# Yvalid_pred_KNN = model_KNN.predict(Xvalid)
# Ytest_pred_KNN  = model_KNN.predict(Xtest)

# Ytrain_pred_SVM = model_SVM.predict(Xtrain)
# Yvalid_pred_SVM = model_SVM.predict(Xvalid)
# Ytest_pred_SVM  = model_SVM.predict(Xtest)

In [17]:
# Use the ENSEMBBLE model to predict the class of samples
# Notice that we are testing on the 3 data splits

Ytrain_pred_ENSEMBLE = model_ENSEMBLE.predict(Xtrain)
Yvalid_pred_ENSEMBLE = model_ENSEMBLE.predict(Xvalid)
Ytest_pred_ENSEMBLE  = model_ENSEMBLE.predict(Xtest)

In [18]:
# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Final evaluation of the model (On the Training, Validation or Test dataset)

# Decision Tree
scores_tr_DT  = accuracy_score(Ytrain2, Ytrain_pred_DT )
scores_val_DT = accuracy_score(Yvalid2, Yvalid_pred_DT )
scores_tst_DT = accuracy_score(Ytest2,  Ytest_pred_DT )

print("Classification rate for the training dataset (Decision Tree)     = "+str(scores_tr_DT*100)+"%")
print("Classification rate for the validation dataset  (Decision Tree)  = "+str(scores_val_DT*100)+"%")
print("Classification rate for the test dataset (Decision Tree)         = "+str(scores_tst_DT*100)+"%")

# Naive Bayes Gaussian
scores_tr_NB  = accuracy_score(Ytrain2, Ytrain_pred_DT )
scores_val_NB = accuracy_score(Yvalid2, Yvalid_pred_NB )
scores_tst_NB = accuracy_score(Ytest2,  Ytest_pred_NB )

print("Classification rate for the training dataset (Naive Bayes)     = "+str(scores_tr_NB*100)+"%")
print("Classification rate for the validation dataset  (Naive Bayes)  = "+str(scores_val_NB*100)+"%")
print("Classification rate for the test dataset (Naive Bayes)         = "+str(scores_tst_NB*100)+"%")

# K-NN
# scores_tr_KNN  = accuracy_score(Ytrain2, Ytrain_pred_KNN )
# scores_val_KNN = accuracy_score(Yvalid2, Yvalid_pred_KNN )
# scores_tst_KNN = accuracy_score(Ytest2,  Ytest_pred_KNN )

# print("Correct classification rate for the training dataset (k-NN)     = "+str(scores_tr_KNN*100)+"%")
# print("Correct classification rate for the validation dataset  (k-NN)  = "+str(scores_val_KNN*100)+"%")
# print("Correct classification rate for the test dataset (k-NN)         = "+str(scores_tst_KNN*100)+"%")

# SVM
# scores_tr_SVM  = accuracy_score(Ytrain2, Ytrain_pred_SVM )
# scores_val_SVM = accuracy_score(Yvalid2, Yvalid_pred_SVM )
# scores_tst_SVM = accuracy_score(Ytest2,  Ytest_pred_SVM )

# print("Correct classification rate for the training dataset (SVM)     = "+str(scores_tr_SVM*100)+"%")
# print("Correct classification rate for the validation dataset  (SVM)  = "+str(scores_val_SVM*100)+"%")
# print("Correct classification rate for the test dataset (SVM)         = "+str(scores_tst_SVM*100)+"%")

# ENSEMBLE
scores_tr_ENSEMBLE  = accuracy_score(Ytrain2, Ytrain_pred_ENSEMBLE )
scores_val_ENSEMBLE = accuracy_score(Yvalid2, Yvalid_pred_ENSEMBLE )
scores_tst_ENSEMBLE = accuracy_score(Ytest2,  Ytest_pred_ENSEMBLE )

print("Classification rate for the training dataset (Ensemble)     = "+str(scores_tr_ENSEMBLE*100)+"%")
print("Classification rate for the validation dataset  (Ensemble)  = "+str(scores_val_ENSEMBLE*100)+"%")
print("Classification rate for the test dataset (Ensemble)         = "+str(scores_tst_ENSEMBLE*100)+"%")

Classification rate for the training dataset (Decision Tree)     = 83.8617484441358%
Classification rate for the validation dataset  (Decision Tree)  = 77.85956496567695%
Classification rate for the test dataset (Decision Tree)         = 74.69011725293132%
Classification rate for the training dataset (Naive Bayes)     = 83.8617484441358%
Classification rate for the validation dataset  (Naive Bayes)  = 83.16929947895129%
Classification rate for the test dataset (Naive Bayes)         = 81.35678391959799%
Classification rate for the training dataset (Ensemble)     = 84.48676513795775%
Classification rate for the validation dataset  (Ensemble)  = 80.20841948556777%
Classification rate for the test dataset (Ensemble)         = 78.69346733668343%


In [17]:
from sklearn.ensemble import VotingClassifier

# Building the Ensemble 
# Combination with Weighted Average Probabilities (Soft Voting)¶

def ENSEMBLE_model_3s():
    
    model = VotingClassifier(estimators = [ ('DT', model_DT), ('NB', model_NB) , ('SVM', model_SVM) ], 
                             voting = 'soft',
                             n_jobs = 3,
                             weights=[0.05,0.15,0.8]) 
    return model

def ENSEMBLE_model_2s():
    
    model = VotingClassifier(estimators = [ ('DT', model_DT), ('NB', model_NB)], 
                             voting = 'soft',
                             n_jobs = 2,
                             weights=[0.5,0.5]) 
    return model

In [18]:
# Build the ENSEMBLE model

model_ENSEMBLE  = ENSEMBLE_model_2s()

In [19]:
# Fit the Ensemble (TRAIN)
model_ENSEMBLE.fit(Xtrain, Ytrain2)

VotingClassifier(estimators=[('DT', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')), ('NB', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=2, voting='soft',
         weights=[0.5, 0.5])

In [20]:
Ytrain_pred_ENSEMBLE = model_ENSEMBLE.predict(Xtrain)
Yvalid_pred_ENSEMBLE = model_ENSEMBLE.predict(Xvalid)
Ytest_pred_ENSEMBLE  = model_ENSEMBLE.predict(Xtest)

  if diff:
  if diff:
  if diff:


In [21]:
# ENSEMBLE
scores_tr_ENSEMBLE  = accuracy_score(Ytrain2, Ytrain_pred_ENSEMBLE )
scores_val_ENSEMBLE = accuracy_score(Yvalid2, Yvalid_pred_ENSEMBLE )
scores_tst_ENSEMBLE = accuracy_score(Ytest2,  Ytest_pred_ENSEMBLE )

print("Classification rate for the training dataset (Ensemble)     = "+str(scores_tr_ENSEMBLE*100)+"%")
print("Classification rate for the validation dataset  (Ensemble)  = "+str(scores_val_ENSEMBLE*100)+"%")
print("Classification rate for the test dataset (Ensemble)         = "+str(scores_tst_ENSEMBLE*100)+"%")

Classification rate for the training dataset (Ensemble)     = 88.84051390261492%
Classification rate for the validation dataset  (Ensemble)  = 84.50913902902985%
Classification rate for the test dataset (Ensemble)         = 82.6214405360134%


In [None]:
print("Notebook ended")