In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from emnist import extract_training_samples
from sklearn.utils import shuffle
from functools import partial
import time

from vertices_generator import vertices
from kernel import Gaussian_kernel
from mdwsvm import mdwsvm
from mdwsvm_ad import mdwsvm_ad
from one_class_svm import one_class_svm
from hybrid import hybrid
from metric import within_class_error

In [13]:
# Load the EMNIST 'digits' and 'byclass' samples (images and labels)
digits_images, digits_labels = extract_training_samples('digits')
letters_images, letters_labels = extract_training_samples('byclass')

# Boolean selectors for the digit classes 1, 3, 5 and 7
mask_1, mask_3, mask_5, mask_7 = (digits_labels == digit for digit in (1, 3, 5, 7))

# Images and labels of each selected digit
digits_images_1, digits_labels_1 = digits_images[mask_1], digits_labels[mask_1]
digits_images_3, digits_labels_3 = digits_images[mask_3], digits_labels[mask_3]
digits_images_5, digits_labels_5 = digits_images[mask_5], digits_labels[mask_5]
digits_images_7, digits_labels_7 = digits_images[mask_7], digits_labels[mask_7]

# Keep only the samples labelled 56..61, i.e. the letters u, v, w, x, y, z
mask_uvwxyz = np.isin(letters_labels, [56, 57, 58, 59, 60, 61])
letters_images = letters_images[mask_uvwxyz]
letters_labels = letters_labels[mask_uvwxyz]
print(len(letters_labels))

16349


In [14]:
# Get training and testing data
def _fill_digits(X, y, src_start, counts):
    """Copy digits 1, 3, 5 and 7 into the leading rows of ``X`` and ``y``.

    For each digit class (in that order) ``counts[i]`` consecutive samples,
    starting at index ``src_start`` of the per-digit arrays, are written to the
    next free rows of ``X`` / ``y``. Pixel values are divided by 255 and the
    raw labels 1, 3, 5, 7 are shifted to the class ids 0, 1, 2, 3.

    Parameters
    ----------
    X : ndarray of shape (n, 28, 28), filled in place.
    y : int ndarray of shape (n,), filled in place.
    src_start : int, first index taken from every per-digit array.
    counts : sequence of four ints, samples taken for digits 1, 3, 5, 7.

    Returns
    -------
    int
        Number of rows written.
    """
    sources = (
        (digits_images_1, digits_labels_1, 1),  # label 1 -> class 0
        (digits_images_3, digits_labels_3, 2),  # label 3 -> class 1
        (digits_images_5, digits_labels_5, 3),  # label 5 -> class 2
        (digits_images_7, digits_labels_7, 4),  # label 7 -> class 3
    )
    row = 0
    for (images, labels, shift), count in zip(sources, counts):
        src = slice(src_start, src_start + count)
        dst = slice(row, row + count)
        X[dst] = images[src] / 255
        y[dst] = labels[src] - shift
        row += count
    return row


def _build_eval_split(digit_start, letter_start):
    """Build one 8000-sample split: 400 digits followed by 7600 letters.

    Parameters
    ----------
    digit_start : int
        First index taken from each per-digit array (100 samples per digit).
    letter_start : int
        First index taken from ``letters_images`` / ``letters_labels``
        (7600 consecutive samples).

    Returns
    -------
    X : ndarray of shape (784, 8000), pixel values divided by 255.
    y : int ndarray of shape (8000,), 0..3 for digits, original letter labels
        (56..61) for letters.
    y_true_hybrid : int ndarray, as ``y`` but every letter is -1.
    y_true_mdwsvm_ad : int ndarray, as ``y`` but every letter is 4.
    """
    X = np.zeros((8000, 28, 28))
    y = np.zeros(8000, dtype=int)

    # 400 digits: 100 of each class
    _fill_digits(X, y, digit_start, (100, 100, 100, 100))

    # 7600 lowercase letters
    letter_src = slice(letter_start, letter_start + 7600)
    X[400:8000] = letters_images[letter_src] / 255
    y[400:8000] = letters_labels[letter_src]

    # True labels used to calculate the hybrid error (letters -> -1)
    y_true_hybrid = -np.ones(8000, dtype=int)
    y_true_hybrid[0:400] = y[0:400]

    # True labels used to calculate the mdwsvm_ad error (letters -> 4)
    y_true_mdwsvm_ad = 4 * np.ones(8000, dtype=int)
    y_true_mdwsvm_ad[0:400] = y[0:400]

    # One flattened image per column
    return X.reshape(8000, 784).T, y, y_true_hybrid, y_true_mdwsvm_ad


# 800 normalized training digits: 150 ones, 150 threes, 250 fives, 250 sevens
X_train = np.zeros((800, 28, 28))
y_train = np.zeros(800, dtype=int)
_fill_digits(X_train, y_train, 0, (150, 150, 250, 250))
X_train = X_train.reshape(800, 784).T

# Used for hybrid
# Validation split: digits [1000:1100] of each class, letters [0:7600]
X_val, y_val, y_val_true_hybrid, y_val_true_mdwsvm_ad = _build_eval_split(1000, 0)

# Test split: digits [1100:1200] of each class, letters [8000:15600]
X_test, y_test, y_test_true_hybrid, y_test_true_mdwsvm_ad = _build_eval_split(1100, 8000)

# Cheap invariants so a broken upstream cell fails here rather than in a model
assert X_train.shape == (784, 800)
assert X_val.shape == X_test.shape == (784, 8000)
assert set(np.unique(y_train).tolist()) <= {0, 1, 2, 3}
assert set(np.unique(y_test).tolist()) <= {0, 1, 2, 3, 56, 57, 58, 59, 60, 61}

# y_test: 0,1,2,3,56-61
# y_test_true_hybrid: -1,0,1,2,3
# y_test_true_mdwsvm_ad: 0,1,2,3,4

In [15]:
# MDWSVM
best_c = 1
w1 = vertices(4)

# Fit on the training digits, then predict every test sample
model1 = mdwsvm(X_train, y_train, w1, best_c)
y_pred_1 = model1.predict(X_test)

# NOTE(review): y_test keeps the letters as six separate labels (56-61), whereas
# the Hybrid and MDWSVM_ad cells score against label vectors that collapse all
# letters into one class (-1 / 4). If within_class_error averages per class, the
# three printed errors are not taken over the same set of classes - confirm
# this is intended.
error_1 = within_class_error(y_test, y_pred_1)
print('The error is', error_1)

The error is 0.636


In [16]:
# Confusion table for MDWSVM: rows are true labels, columns are predictions,
# each row normalized to sum to 1 (normalize='index').
result_1 = pd.crosstab(
    y_test,
    y_pred_1,
    rownames=['True label'],
    colnames=['Predicted label'],
    normalize='index',
).rename(index={56: 'u', 57: 'v', 58: 'w', 59: 'x', 60: 'y', 61: 'z'})

# Display as percentages. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
result_1.apply(lambda col: col.map('{:.2%}'.format))

Predicted label,0,1,2,3
True label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,94.00%,0.00%,5.00%,1.00%
1,0.00%,90.00%,6.00%,4.00%
2,2.00%,16.00%,81.00%,1.00%
3,1.00%,0.00%,0.00%,99.00%
u,5.97%,1.57%,77.74%,14.71%
v,7.99%,0.98%,56.44%,34.59%
w,12.99%,0.00%,86.61%,0.40%
x,18.78%,6.69%,61.90%,12.62%
y,22.79%,8.86%,25.09%,43.27%
z,9.53%,14.92%,54.14%,21.41%


In [17]:
# Hybrid
# Selected hyper-parameters
best_v = 0.3
best_c = 1
best_sigma2 = 0.09

# Gaussian kernel with its sigma2 argument bound to the selected value
best_k = partial(Gaussian_kernel, sigma2=best_sigma2)

# hybrid() takes the training data and the test inputs and returns predictions
y_pred_2 = hybrid(X_train, y_train, X_test, best_v, w1, best_c, best_k)

# Scored against the label vector in which every letter is -1
error_2 = within_class_error(y_test_true_hybrid, y_pred_2)
print('The error is', error_2)

The error is 0.43607894736842107


In [18]:
# Not referenced by any cell visible here; kept in case a later cell uses it.
true_label_2 = [0, 1, 2, 3, 'u', 'v', 'w', 'x', 'y', 'z']

# Confusion table for Hybrid: rows are true labels, columns are predictions,
# each row normalized to sum to 1 (normalize='index').
result_2 = pd.crosstab(
    y_test,
    y_pred_2,
    rownames=['True label'],
    colnames=['Predicted label'],
    normalize='index',
).rename(index={56: 'u', 57: 'v', 58: 'w', 59: 'x', 60: 'y', 61: 'z'})

# Display as percentages. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
result_2.apply(lambda col: col.map('{:.2%}'.format))

Predicted label,-1,0,1,2,3
True label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,41.00%,55.00%,0.00%,3.00%,1.00%
1,38.00%,0.00%,56.00%,3.00%,3.00%
2,35.00%,2.00%,8.00%,54.00%,1.00%
3,24.00%,1.00%,0.00%,0.00%,75.00%
u,42.72%,3.81%,0.67%,44.73%,8.07%
v,40.32%,3.54%,0.68%,34.36%,21.10%
w,40.08%,8.37%,0.00%,51.24%,0.32%
x,41.06%,12.40%,3.57%,34.90%,8.06%
y,41.70%,12.82%,5.44%,14.21%,25.83%
z,45.86%,5.62%,7.03%,29.77%,11.72%


In [19]:
# MDWSVM_ad
# Recorded setting: v=0.100 sigma2=12.000 c=1.000 score:0.8743947368421052
best_c_2 = 1
best_v_2 = 0.1
best_sigma2_2 = 12

# Gaussian kernel with its sigma2 argument bound to the selected value
best_k_2 = partial(Gaussian_kernel, sigma2=best_sigma2_2)
w2 = vertices(5)

# Fit on the training digits, then predict every test sample
model3 = mdwsvm_ad(X_train, y_train, w2, best_c_2, best_v_2, best_k_2)
y_pred_3 = model3.predict(X_test)

# Scored against the label vector in which every letter is 4
error_3 = within_class_error(y_test_true_mdwsvm_ad, y_pred_3)
print('The error is', error_3)

The error is 0.13728947368421052


In [20]:
# Not referenced by any cell visible here; kept in case a later cell uses it.
true_label_3 = [0, 1, 2, 3, 'u', 'v', 'w', 'x', 'y', 'z']

# Confusion table for MDWSVM_ad: rows are true labels, columns are predictions,
# each row normalized to sum to 1 (normalize='index').
result_3 = pd.crosstab(
    y_test,
    y_pred_3,
    rownames=['True label'],
    colnames=['Predicted label'],
    normalize='index',
).rename(index={56: 'u', 57: 'v', 58: 'w', 59: 'x', 60: 'y', 61: 'z'})

# Display as percentages. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
result_3.apply(lambda col: col.map('{:.2%}'.format))

Predicted label,0,1,2,3,4
True label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,96.00%,0.00%,3.00%,1.00%,0.00%
1,0.00%,75.00%,6.00%,2.00%,17.00%
2,0.00%,3.00%,87.00%,0.00%,10.00%
3,0.00%,0.00%,0.00%,93.00%,7.00%
u,0.00%,0.07%,3.21%,0.30%,96.42%
v,0.45%,0.00%,11.53%,7.01%,81.01%
w,0.00%,0.00%,2.71%,0.00%,97.29%
x,1.06%,0.15%,20.84%,3.73%,74.22%
y,6.55%,1.75%,13.01%,36.53%,42.16%
z,0.08%,2.27%,9.06%,3.67%,84.92%
