# Thư viện:

In [1]:
from sklearn.svm import SVC
import numpy as np
import pickle
import gzip
import pandas as pd
import time
from sklearn.metrics import accuracy_score

# Đọc dữ liệu:

In [2]:
def read_mnist(mnist_file):
    """
    Reads MNIST data.
    
    Parameters
    ----------
    mnist_file : string
        The name of the MNIST file (e.g., 'mnist.pkl.gz').
    
    Returns
    -------
    (train_X, train_Y, val_X, val_Y, test_X, test_Y) : tuple
        train_X : numpy array, shape (N=50000, d=784)
            Input vectors of the training set.
        train_Y: numpy array, shape (N=50000)
            Outputs of the training set.
        val_X : numpy array, shape (N=10000, d=784)
            Input vectors of the validation set.
        val_Y: numpy array, shape (N=10000)
            Outputs of the validation set.
        test_X : numpy array, shape (N=10000, d=784)
            Input vectors of the test set.
        test_Y: numpy array, shape (N=10000)
            Outputs of the test set.
    """
    f = gzip.open(mnist_file, 'rb')
    train_data, val_data, test_data = pickle.load(f, encoding='latin1')
    f.close()
    
    train_X, train_Y = train_data
    val_X, val_Y = val_data
    test_X, test_Y = test_data    
    
    return train_X, train_Y, val_X, val_Y, test_X, test_Y

In [3]:
# Test
train_X, train_Y, val_X, val_Y, test_X, test_Y = read_mnist('mnist.pkl.gz')

print('train_X.shape =', train_X.shape)
print('train_Y.shape =', train_Y.shape)
print('val_X.shape   =', val_X.shape)
print('val_Y.shape   =', val_Y.shape)
print('test_X.shape  =', test_X.shape)
print('test_Y.shape  =', test_Y.shape)

print('\ntrain_X: min = %.3f, max = %.3f' %(train_X.min(), train_X.max()))
print('train_Y: min = %d, max = %d' %(train_Y.min(), train_Y.max()))

train_X.shape = (50000, 784)
train_Y.shape = (50000,)
val_X.shape   = (10000, 784)
val_Y.shape   = (10000,)
test_X.shape  = (10000, 784)
test_Y.shape  = (10000,)

train_X: min = 0.000, max = 0.996
train_Y: min = 0, max = 9


## Hiển thị cấu trúc của tập huấn luyện:

In [4]:
train_data_X = pd.DataFrame(train_X)
train_data_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_data_X.describe().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000739,0.000354,0.000204,9e-05,7.1e-05,9e-06,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022778,0.015422,0.012079,0.007217,0.007181,0.001483,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Chạy thuật toán:

## Linear Kernel

Mục tiêu: Set các giá trị siêu tham số khác nhau cho mỗi lần huấn luyện, đưa ra kết luận về sự ảnh hường của việc chọn tham số.

Dựa trên : https://queirozf.com/entries/choosing-c-hyperparameter-for-svm-classifiers-examples-with-scikit-learn

Độ lỗi: https://stackoverflow.com/questions/10318884/find-out-error-rate-using-sklearn

In [None]:
list_error = []
for k in range(-3,3):
    t = 10**k
    start_time = time.perf_counter()
    svclassifier = SVC(C = t,kernel='linear')
    svclassifier.fit(train_X, train_Y)
    train_time = time.perf_counter() - start_time
    y_train_pred = svclassifier.predict(train_X)  
    y_val_pred = svclassifier.predict(val_X)
    train_error = 1 -  (accuracy_score(train_Y, y_pred, normalize=False) / float(train_Y.size))
    val_error = 1 -  (accuracy_score(val_Y, y_pred, normalize=False) / float(val_Y.size))
    print("C: {},train error: {}, validation error: {}, time: {}".format(t, train_error, val_error, train_time))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(test_Y, y_pred))
print(classification_report(test_Y, y_pred))

### Cải thiện tốc độ chạy:

Trước tiên, ta đưa ra trường hợp ví dụ để đo thời gian tiêu tốn theo từng bước của chương trình. 

Cụ thể, xét tại giá trị $C = 1$, ta có các mốc thời gian tiêu tốn lần lượt như sau:

In [7]:
k = 0
t = 10**k

#Time 1
start_time1 = time.perf_counter()
svclassifier = SVC(C = 10**k,kernel='linear')
svclassifier.fit(train_X, train_Y)
train_time1 = time.perf_counter() - start_time1

#Time 2
start_time2 = time.perf_counter()
y_train_pred = svclassifier.predict(train_X)  
y_val_pred = svclassifier.predict(val_X)
train_time2 = time.perf_counter() - start_time2

#Time 3
start_time3 = time.perf_counter()
train_error = 1 -  (accuracy_score(train_Y, y_train_pred, normalize=False) / float(train_Y.size))
val_error = 1 -  (accuracy_score(val_Y, y_val_pred, normalize=False) / float(val_Y.size))
train_time3 = time.perf_counter() - start_time3\

#Output
print("C: {},train error: {}, validation error: {}, time: {} {} {}".format(t, train_error, val_error, train_time1, train_time2, train_time3))

C: 1,train error: 0.02754000000000001, validation error: 0.057699999999999974, time: 346.60612960000003 585.2876257000003 0.003516499999932421


Cách thức:
- Đẩy nhanh thời gian thực thi thuật toán bằng cách chỉnh sửa các tham số cài đặt trong document của modul.
- Cài đặt chạy CPU đa nhân một cách tối đa. 

link tham khảo: https://stackoverflow.com/questions/31681373/making-svm-run-faster-in-python

In [7]:
import multiprocessing 

In [18]:
k = 0
t = 10**k
def run_SVM():
    #Time 1
    start_time1 = time.perf_counter()
    svclassifier = SVC(C = 10**k,kernel='linear')
    multiprocessing.Process(svclassifier.fit(train_X, train_Y))
    train_time1 = time.perf_counter() - start_time1
    print(train_time1)



In [19]:
from threading import Thread as worker

t = worker(target=run_SVM, args=(4,))
t.start()
t.join()

Exception in thread Thread-7:
Traceback (most recent call last):
  File "C:\Users\dell\anaconda3\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "C:\Users\dell\anaconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
TypeError: run_SVM() takes 0 positional arguments but 1 was given

