In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [104]:
# Read the bz2-compressed svmlight/libsvm file: features first, labels second.
mnist_X, mnist_label = load_svmlight_file('Data/mnist.scale.bz2')
# load_svmlight_file hands back a SciPy sparse matrix; convert it to a dense ndarray.
mnist_X = mnist_X.toarray()
# Cast the labels to int so they can be looked up as integer class ids later on.
mnist_label = mnist_label.astype(int)

# Working names used by the rest of the notebook ([:] gives a view of the same data, not a copy).
X = mnist_X[:]
label = mnist_label[:]

In [139]:
def binary_converter(y):
    """
    Split decimal class labels (0-9) into the four bits of their binary code.

    Every label is written as a 4-bit binary number (for example 5 -> 0101)
    and one array is returned per bit position, so that each array can be
    used as the target of an independent binary classifier.

    Parameters
    ----------
    y : array-like of int
        Class labels. Every value must be a whole number in the range 0-9.
        An empty input is allowed and yields four empty arrays.

    Returns
    -------
    Y_1, Y_2, Y_3, Y_4 : np.ndarray of int
        Bit arrays ordered from the most significant bit (Y_1) to the least
        significant bit (Y_4), each with the same shape as ``y``.

    Raises
    ------
    ValueError
        If a label is not a whole number or lies outside 0-9.
    """
    labels = np.asarray(y)
    codes = labels.astype(int)

    # Reject anything that has no 4-bit digit code instead of silently
    # producing wrong bits (fractional values, negatives, values above 9).
    if np.any(codes != labels) or np.any((codes < 0) | (codes > 9)):
        raise ValueError("binary_converter expects integer class labels in the range 0-9")

    # Shift the wanted bit into the lowest position and mask it out;
    # shifts 3..0 walk from the most to the least significant bit.
    Y_1, Y_2, Y_3, Y_4 = ((codes >> shift) & 1 for shift in (3, 2, 1, 0))

    return Y_1, Y_2, Y_3, Y_4

def decimal_converter(y):
    """
    Decode 4-character binary code strings into decimal class labels.

    The codes '0000'-'1001' map to the digits 0-9. The six remaining 4-bit
    codes ('1010'-'1111') do not belong to any digit and are all decoded
    as 9.

    Parameters
    ----------
    y : array-like of str
        Code strings made of four '0'/'1' characters. An empty input is
        allowed and yields an empty array.

    Returns
    -------
    np.ndarray of int
        Decoded labels with the same shape as ``y``.

    Raises
    ------
    ValueError
        If an element is not one of the sixteen 4-bit code strings.
    """
    binary_to_decimal = {
        '0000': 0, '0001': 1, '0010': 2, '0011': 3,
        '0100': 4, '0101': 5, '0110': 6, '0111': 7,
        '1000': 8, '1001': 9,
        # Codes that no digit uses fall back to class 9.
        '1010': 9, '1011': 9, '1100': 9, '1101': 9, '1110': 9, '1111': 9,
    }

    codes = np.asarray(y)
    decoded = []
    for code in codes.ravel():
        if code not in binary_to_decimal:
            raise ValueError("unknown binary code: {!r}".format(str(code)))
        decoded.append(binary_to_decimal[code])

    # Building the array explicitly (instead of np.vectorize) keeps empty
    # inputs working and pins the integer dtype.
    converted_labels = np.array(decoded, dtype=int).reshape(codes.shape)

    return converted_labels

In [140]:
# Hold out part of the data for testing; the fixed seed keeps the split repeatable.
TEST_FRACTION = 0.3
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    X, label, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

In [146]:
# Number of training samples produced by the split above.
len(x_train)

42000

In [130]:
# One binary target array per bit of the 4-bit class code (most significant bit first).
y_train1, y_train2, y_train3, y_train4 = binary_converter(y_train) # per-bit training targets
y_test1, y_test2, y_test3, y_test4 = binary_converter(y_test) # per-bit test targets

In [131]:
# x = np.random.rand(100, 100) 
# y = np.array([-1]*50+[1]*50)
# np.random.shuffle(y)

In [235]:
class LinearSVM:
    """
    Binary linear SVM trained with the Pegasos stochastic sub-gradient method.

    Parameters
    ----------
    L : float
        Regularisation constant; it also determines the step size
        ``1 / (L * (t + 1))`` used at step ``t``.
    T : int
        Number of single-sample update steps performed by ``fit``.
    n_features : int
        Length of the weight vector; must match the number of columns of
        the data passed to ``fit`` and ``predict``.
    random_state : int or None, optional
        Seed for a private random generator that picks the training
        samples. With the default ``None`` the samples are drawn from
        NumPy's global random state.

    Attributes
    ----------
    w : np.ndarray of shape (n_features,)
        Current weight vector (no bias term is learned).
    """
    def __init__(self, L=1, T=10, n_features=4, random_state=None):
        self.L = L
        self.T = T
        self.random_state = random_state
        self.w = np.zeros((n_features, ))

    def fit(self, x, y):
        """
        Train Linear SVM(Pegasos) using Subgradient.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Training samples; ``x[i]`` must be one feature vector.
        y : array-like of shape (n_samples,)
            Binary labels. A label of 0 is treated as -1; every other value
            is used as given. The array passed in is not modified.
        """
        # Work on a signed copy so the caller's labels stay 0/1.
        y_signed = np.where(np.asarray(y) == 0, -1, y)
        S = len(y_signed)

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        for t in range(self.T):
            # randint's upper bound is exclusive, so S (not S-1) is needed
            # for the last sample to be reachable.
            i_t = rng.randint(0, S)
            n_t = 1/(self.L*(t+1)) #add 1 to avoid division by zero
            margin = y_signed[i_t]*np.dot(self.w, x[i_t])
            if margin < 1:
                # Margin violated: shrink the weights and step towards the sample.
                self.w = (1 - n_t*self.L)*self.w + n_t*y_signed[i_t]*x[i_t]
            else:
                # Margin satisfied: only the regularisation shrink applies.
                self.w = (1 - n_t*self.L)*self.w

    def predict(self, x):
        """
        Using the trained weights predict on new data.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        list of int
            1 where the score ``w . x[i]`` is non-negative, otherwise 0.
        """
        # One matrix-vector product scores every sample at once.
        scores = np.dot(x, self.w)
        return (scores >= 0).astype(int).tolist()

In [216]:
# def linearSVM(x, y, S=4, L=1, T=10):
#     w = np.zeros((x.shape[1], ))
#     for t in range(T):
#         i_t = np.random.randint(0, x.shape[0])
#         n_t = 1/(L*(t+1)) #add 1 to avoid division by zero
#         if (y[i_t]*np.dot(w, x[i_t]) < 1):
#             w = (1 - n_t*L)*w + n_t*y[i_t]*x[i_t]
#         elif ((y[i_t]*np.dot(w, x[i_t])) >= 1):
#             w = (1 - n_t*L)*w
#     return w

# linearSVM(x, y)

In [252]:
# L          - regularisation constant handed to each LinearSVM
# T          - number of single-sample update steps each model performs in fit
# n_features - length of the weight vector (number of input columns)
# One independent binary classifier per bit of the 4-bit class code.
model_1, model_2, model_3, model_4 = (
    LinearSVM(L=0.01, T=5000000, n_features=780) for _ in range(4)
)

In [253]:
# Train each bit classifier on its own binary target vector, in order.
for bit_model, bit_targets in (
    (model_1, y_train1),
    (model_2, y_train2),
    (model_3, y_train3),
    (model_4, y_train4),
):
    bit_model.fit(x_train, bit_targets)

In [254]:
# Predict every bit of the class code for the test samples.
pred_1, pred_2, pred_3, pred_4 = [
    bit_model.predict(x_test)
    for bit_model in (model_1, model_2, model_3, model_4)
]

In [255]:
# Glue the four true bits of every test sample into a code string and decode it to a digit.
meta_test = [y_test1, y_test2, y_test3, y_test4]
test_codes = [''.join(map(str, bits)) for bits in zip(*meta_test)]
test_array = decimal_converter(test_codes)

# Same for the four predicted bits.
meta_pred = [pred_1, pred_2, pred_3, pred_4]
pred_codes = [''.join(map(str, bits)) for bits in zip(*meta_pred)]
pred_array = decimal_converter(pred_codes)

In [256]:
# 1 where the decoded prediction equals the true digit, 0 otherwise.
accuracy = [
    1 if pred_array[idx] == test_array[idx] else 0
    for idx in range(len(pred_array))
]

In [257]:
# Share of test samples whose decoded prediction equals the true digit, in percent.
sum(accuracy)/len(accuracy)*100

69.45

In [259]:
# NOTE(review): sklearn's signature is confusion_matrix(y_true, y_pred). Because the
# predictions are passed first here, rows are predicted digits and columns are true
# digits, i.e. the transpose of the conventional layout. Confirm this is intended.
confusion_matrix(pred_array, test_array)

array([[1506,   43,  115,   28,   59,  137,   67,    9,  292,   71],
       [  55, 1937,   23,  249,   29,  301,    3,   52,  200,  140],
       [ 101,   11, 1327,   86,   10,   19,  127,   18,   44,    9],
       [  10,   34,  101, 1275,    1,   40,    1,  262,   25,   27],
       [  77,    3,   17,    5, 1144,  169,  110,   17,   57,   22],
       [   7,    7,    5,   19,  188,  657,    7,  215,   46,  168],
       [  32,    3,   85,   17,   74,   51, 1425,   30,   11,   11],
       [   0,    3,   17,   81,    5,   67,    3, 1130,    3,   35],
       [  18,   17,   38,   11,   30,   54,    9,    4,  919,  120],
       [   6,   12,   21,   76,  190,   96,    9,  132,  190, 1181]],
      dtype=int64)

In [260]:
# Cross-tabulation of the same two arrays: rows (index) are values of pred_array,
# columns are values of test_array.
pd.crosstab(pred_array, test_array)

col_0,0,1,2,3,4,5,6,7,8,9
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1506,43,115,28,59,137,67,9,292,71
1,55,1937,23,249,29,301,3,52,200,140
2,101,11,1327,86,10,19,127,18,44,9
3,10,34,101,1275,1,40,1,262,25,27
4,77,3,17,5,1144,169,110,17,57,22
5,7,7,5,19,188,657,7,215,46,168
6,32,3,85,17,74,51,1425,30,11,11
7,0,3,17,81,5,67,3,1130,3,35
8,18,17,38,11,30,54,9,4,919,120
9,6,12,21,76,190,96,9,132,190,1181
