Implement **`one-vs-all logistic regression and neural networks to recognize handwritten digits`**

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
import scipy
import scipy.io as sio
import scipy.optimize as opt
from scipy.io import loadmat
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

%matplotlib inline

# Multi-Class Classification

Extend your previous implementation of logistic regression and apply it to **`one-vs-all classification`**.

###### Load dataset

In [None]:
# Read the MATLAB-format dataset and pull out the feature matrix and labels.
path = "dataset/ex3data1.mat"
ori_data = sio.loadmat(path)
X_ori = ori_data.get('X')
# Flatten the labels to 1-D so they can be compared element-wise later on.
y_ori = ori_data.get('y').reshape((-1,))
m = y_ori.shape[0]  # number of labels, i.e. one per sample
print(">> X_ori.shape :", X_ori.shape)
print("   y_ori.shape :", y_ori.shape)
print("   Samples :", m)
print("   y classes :", np.unique(y_ori))

## Visualizing the data

There are 5000 training examples in `ex3data1.mat`, where each training example is `a 20 pixel by 20 pixel grayscale image of the digit`. Each pixel is represented by a floating point number indicating the grayscale intensity at that location. The 20 by 20 grid of pixels is “unrolled” into a 400-dimensional vector. Each of these training examples becomes a single row in our data matrix $X$. This gives us a 5000 by 400 matrix $X$ where every row is a training example for a handwritten digit image.

$$ X = \begin{bmatrix} - \left(x^{(1)} \right)^T - \\
- \left(x^{(2)} \right)^T - \\
\vdots \\
- \left(x^{(m)} \right)^T - \\
\end{bmatrix}
$$

###### form data for visualize

###### visualize one image

In [None]:
def visualize_one_image(image, method=0):
    """Draw one 20x20 image on a fresh 1x1-inch figure without tick marks.

    Parameters
    ----------
    image : array
        Pixel values; reshaped to (20, 20) before drawing.
    method : int, optional
        0 draws with ``Axes.matshow``, 1 draws with ``pyplot.imshow``.
        Any other value leaves the new axes empty.
    """
    fig, ax = plt.subplots(figsize=(1, 1))

    # Pick the drawing routine first, then reshape only if something is drawn.
    if method == 0:
        draw = ax.matshow
    elif method == 1:
        draw = plt.imshow
    else:
        draw = None

    if draw is not None:
        draw(image.reshape((20, 20)), cmap=mpl.cm.binary)

    # Passing an empty tick list removes the ticks on the current axes.
    for clear_ticks in (plt.xticks, plt.yticks):
        clear_ticks(np.array([]))

In [None]:
# Pick one random row index in [0, 5000) and show that image with its label.
# NOTE(review): X_img / y_img are assigned in a later cell of this notebook.
#pick_one = np.random.choice(5000, 1)
pick_one = np.random.randint(0, 5000)
visualize_one_image(X_img[pick_one, :])
print('this should be {}'.format(y_img[pick_one]))

###### visualize all classes data

In [None]:
# Work on a copy so y_ori keeps its original values; relabel 10 as 0.
y_one_class = y_ori.copy()
y_one_class[y_one_class == 10] = 0

In [None]:
# Show `sample_each_class` randomly chosen example(s) of every class,
# one column per class.
classes = np.arange(0, 10)
num_classes = len(classes)
sample_each_class = 1

for y, cla in enumerate(classes):
    # Indices of every sample whose label equals the current class position.
    candidates = np.flatnonzero(y_one_class == y)
    idxs = np.random.choice(candidates, sample_each_class, replace=False)

    for i, idx in enumerate(idxs):
        # Subplot numbers are 1-based and run row by row: row i, column y.
        plt_idx = i * num_classes + y + 1
        plt.subplot(sample_each_class, num_classes, plt_idx)
        plt.imshow(X_img[idx].reshape((20, 20)), cmap=mpl.cm.binary)
        plt.axis('off')
        # Only the top row carries the class title.
        if i == 0:
            plt.title(cla)


##### form training data

In [None]:
def form_train_data(X_ori, y_ori):
    """Build the design matrix and one row of 0/1 targets per label.

    Parameters
    ----------
    X_ori : 2-D array
        Feature matrix, one sample per row.
    y_ori : 1-D array
        Labels taking values in 1..10.

    Returns
    -------
    X_train : 2-D array
        ``X_ori`` with a column of ones inserted at position 0.
    y_train : 2-D int array, shape (10, number of labels)
        Row 0 marks label 10, rows 1..9 mark labels 1..9.
    """
    # Prepend the constant column.
    X_train = np.insert(X_ori, 0, 1, axis=1)

    # Label 10 goes first, followed by labels 1..9 in order.
    label_order = (10, *range(1, 10))
    y_train = np.array([(y_ori == k).astype(int) for k in label_order])

    print("X_train.shape :", X_train.shape)
    print("y_train.shape :", y_train.shape)

    return X_train, y_train

# Build the ones-augmented design matrix and the 10 rows of 0/1 targets.
X_train, y_train = form_train_data(X_ori, y_ori)
# Zero starting point with one entry per column of X_train.
theta_init_zero = np.zeros((X_train.shape[1],))

## Vectorized regularized logistic regression

**`Using multiple one-vs-all logistic regression models to build a multi-class classifier`**.

`Since there are 10 classes, you will need to train 10 separate logistic regression classifiers`. 

In [None]:
# Hold out 30% of the samples; random_state is fixed so the split is repeatable.
y_choose_lamd = y_ori.copy()
sk_X_train, sk_X_test, sk_y_train, sk_y_test = train_test_split(
    X_train, y_choose_lamd, test_size=0.3, random_state=0
)
# Display-oriented copy of the training pixels; column 0 (the inserted ones) is dropped.
sk_X_img, sk_y_img = form_data_for_visualize(sk_X_train[:,1:], sk_y_train)

In [None]:
# Sweep the regularisation setting and record accuracy on both splits.
# NOTE: `lamdas` is passed to scikit-learn as C, so the lambda printed
# below is 1 / C.
lamdas = np.linspace(0.01, 1000, 5)

acu_train = np.empty(len(lamdas))
acu_cv = np.empty(len(lamdas))

for i, lamda in enumerate(lamdas):
    model = LogisticRegression(penalty='l2', C=lamda, fit_intercept=False, multi_class='ovr')
    # Fit on the training split only. Fitting on the full data set would put
    # the held-out samples into training and make the hold-out accuracy
    # meaningless for choosing the regularisation strength.
    model.fit(sk_X_train, sk_y_train)

    # calc accuracy - method-01
    # Assumes the labels are 1..10 and coef_ rows follow that order, hence
    # the +1 on the argmax position — TODO confirm against model.classes_.
    hyp_prd_train = hypothesis(model.coef_.T, sk_X_train)
    y_prd_train   = np.argmax(hyp_prd_train, axis=1) + 1
    y_answer_train = sk_y_train.copy()
    corret_train = (y_answer_train == y_prd_train)
    acu_train[i]   = np.mean(corret_train)

    hyp_prd_cv = hypothesis(model.coef_.T, sk_X_test)
    y_prd_cv   = np.argmax(hyp_prd_cv, axis=1) + 1
    y_answer_cv = sk_y_test.copy()
    corret_cv = (y_answer_cv == y_prd_cv)
    acu_cv[i]   = np.mean(corret_cv)

    # calc accuracy - method-02
    # acu_train[i] = model.score(sk_X_train, sk_y_train)
    # acu_cv[i] = model.score(sk_X_test, sk_y_test)

    print('>>>  Lamd = %f' % (1 / lamda))
    print('     train acu = %.3f%%' % (acu_train[i] * 100))
    print('     cv acu    = %.3f%%' % (acu_cv[i] * 100))
    print()

    # plot the first training image whose label is `digit_show`
    digit_show = 9
    ind = np.where(sk_y_train == digit_show)
    visualize_one_image(sk_X_img[ind][0])
    axes = plt.gca()
    axes.set_title('digit - %d' % (digit_show))

    # plot the first misclassified training image, if there is one
    inct_ind = np.where(y_answer_train != y_prd_train)
    if inct_ind[0].size > 0:
        visualize_one_image(sk_X_img[inct_ind][0])
        axes = plt.gca()
        axes.set_title('predict as %d, ori - %d' % (y_prd_train[inct_ind][0] , y_answer_train[inct_ind][0]))

In [None]:
# Accuracy curves for both splits, with the best hold-out point highlighted.
plt.plot(lamdas, acu_train, lamdas, acu_cv)

# Position of the first maximum of the hold-out accuracy.
ind = np.where(acu_cv == np.amax(acu_cv))
best_pos = ind[0][0]
print(best_pos)

optLambda = lamdas[best_pos]
best_label = "best lamd - {%f - %0.4f}" % (optLambda, acu_cv[best_pos])
plt.scatter(optLambda, acu_cv[best_pos], label=best_label)
plt.legend()

# Neural Network

    However, logistic regression can't form more complex hypothesis as it is only a linear classifier

<img style="float: left;" src="dataset/nn_model.png">

In [None]:
def load_weight(path):
    """Load the ``Theta1`` and ``Theta2`` arrays from a .mat file.

    Prints the keys found in the file and the shape of both arrays.

    Parameters
    ----------
    path : str
        Location of the .mat file.

    Returns
    -------
    tuple
        ``(Theta1, Theta2)`` as stored in the file.
    """
    weights = sio.loadmat(path)
    # Report the keys before indexing, so they are visible even if one is missing.
    print('weight.keys()', weights.keys())

    theta_1 = weights['Theta1']
    print("weights['Theta1'].shape", theta_1.shape)
    theta_2 = weights['Theta2']
    print("weights['Theta2'].shape", theta_2.shape)

    return theta_1, theta_2

In [None]:
# Load the two weight arrays (Theta1, Theta2) from the .mat file.
weights_path = "dataset/ex3weights.mat"
theta1, theta2 = load_weight(weights_path)

theta - (out, in)
z - (m, n)
Z - (m, out)

## layer()

In [None]:
# Trace of cost values; cost_function appends to this list on every call.
J_history = []
# Zero starting point with one entry per column of X_train.
theta_init_zero = np.zeros((X_train.shape[1],))

In [None]:
def sigmoid(Z):
    """Element-wise logistic function ``1 / (1 + exp(-Z))``.

    Uses the explicitly imported ``scipy.special.expit`` rather than reaching
    the submodule through the bare ``scipy`` name, which only works when
    ``scipy.special`` happens to have been loaded already.

    Parameters
    ----------
    Z : scalar or array
        Input value(s).

    Returns
    -------
    Same shape as ``Z``, with every value in [0, 1].
    """
    return expit(Z)

In [None]:
def hypothesis(theta, X):
    """Logistic-regression prediction: the sigmoid of ``X @ theta``.

    Parameters
    ----------
    theta : array
        Parameter vector (or matrix with one column per classifier).
    X : array
        Design matrix, one sample per row.
    """
    linear_score = X @ theta
    return sigmoid(linear_score)

+ Normal : $ J\left( \theta  \right)=-\frac{1}{m}\sum\limits_{i=1}^{m}{[{{y}^{(i)}}\log \left( {{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)+\left( 1-{{y}^{(i)}} \right)\log \left( 1-{{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)]} + \frac{\lambda }{2m}\sum\limits_{j=1}^{n}{{\theta }_{j}^{2}} $
+ Vector : $ J(\theta) = -\frac{1}{m}\big(\log(g(X\theta))^T y + \log(1-g(X\theta))^T (1-y)\big) + \frac{\lambda }{2m}\sum\limits_{j=1}^{n}{{\theta }_{j}^{2}} $
> + theta - (n+1, )
> + X - (m, n+1)
> + y - (m, )
> + scalars

In [None]:
def replace_zeros(data):
    """Overwrite every zero in ``data`` in place and return ``data``.

    Zeros are replaced by the smallest non-zero value found in ``data``; if
    ``data`` contains no non-zero value at all, they become 1e-12 instead.

    Parameters
    ----------
    data : numpy array
        Modified in place.

    Returns
    -------
    The same array object that was passed in.
    """
    zero_mask = data == 0

    if np.count_nonzero(data) == 0:
        # Nothing to borrow a value from: fall back to a tiny constant.
        fill_value = 1e-12
    else:
        fill_value = data[~zero_mask].min()

    data[zero_mask] = fill_value
    return data


def cost_function(theta, X, y, lamd=0):
    """Regularised logistic-regression cost for one binary target vector.

    Computes ``-(log(h)^T y + log(1 - h)^T (1 - y)) / m`` plus the penalty
    ``lamd / (2 m) * sum(theta[1:] ** 2)``, where ``h = hypothesis(theta, X)``
    and ``m = y.size``. The first parameter is excluded from the penalty.

    Side effect: the returned value is appended to the module-level
    ``J_history`` list.

    Parameters
    ----------
    theta : 1-D array
        Parameter vector; ``theta[0]`` is not regularised.
    X : 2-D array
        Design matrix, one sample per row.
    y : 1-D array
        0/1 targets, one per row of ``X``.
    lamd : float, optional
        Regularisation strength (default 0, i.e. no penalty).

    Returns
    -------
    The scalar regularised cost.
    """
    global J_history

    m = y.size
    hyp = hypothesis(theta, X)
    # replace_zeros edits its argument in place, so `1 - hyp` below is computed
    # from the already-adjusted values; both calls keep log() away from zero.
    y_1 = np.log(replace_zeros(hyp)).T @ y
    y_0 = np.log(replace_zeros(1 - hyp)).T @ (1 - y)
    cost = -(y_1 + y_0) / m

    # Sum (not mean) of the squared weights: this is the penalty whose
    # derivative, lamd * theta[1:] / m, is what `gradient` returns. Using the
    # mean would make the cost and its gradient disagree.
    reg_item = lamd * np.sum(np.power(theta[1:], 2)) / (2 * m)
    cost_reg = cost + reg_item

    J_history.append(cost_reg)

    return cost_reg

In [None]:
# Cost at the all-zero starting point for the first label row, with no penalty (lamd defaults to 0).
cost_function(theta_init_zero, X_train, y_train[0])

+ Vector : $$ \frac{\partial J(\theta)}{\partial \theta} = \frac{1}{m} X^T(g(X\theta)-y) + \frac{\lambda}{m}\tilde{\theta}, \qquad \tilde{\theta} = (0, \theta_{1}, \dots, \theta_{n})^T $$ 
##### $$\text{Note: intercept parameter } \theta_{0} \text{ is not to be regularized}$$
> + theta - (n+1, )
> + X - (m, n+1)
> + y - (m, )
> + (n+1, )

In [None]:
def gradient(theta, X, y, lamd=0):
    """Gradient of the regularised logistic-regression cost.

    Parameters
    ----------
    theta : 1-D array
        Parameter vector; ``theta[0]`` is not regularised.
    X : 2-D array
        Design matrix, one sample per row.
    y : 1-D array
        0/1 targets, one per row of ``X``.
    lamd : float, optional
        Regularisation strength (default 0).

    Returns
    -------
    1-D array with one entry per parameter.
    """
    m = y.size
    residual = hypothesis(theta, X) - y
    data_term = X.T @ residual / m

    # A leading zero keeps the first parameter out of the penalty.
    penalty_term = np.r_[[0], lamd * theta[1:] / m]

    return data_term + penalty_term

In [None]:
def scipy_opt_minimize(theta, X, y, Method, lamd=0):
    """Minimise ``cost_function`` with ``scipy.optimize.minimize``.

    The module-level ``J_history`` list is reset before the run, and a copy
    of it is returned alongside the optimiser output.

    Parameters
    ----------
    theta : 1-D array
        Starting point.
    X, y : arrays
        Passed through to ``cost_function`` and ``gradient``.
    Method : str
        Solver name handed to ``scipy.optimize.minimize``.
    lamd : float, optional
        Regularisation strength passed through to both callbacks (default 0).

    Returns
    -------
    dict with keys ``result``, ``theta``, ``success``, ``cost_history``,
    ``name`` and ``lamd``.
    """
    global J_history
    # Start a fresh trace; cost_function appends to it on every evaluation.
    J_history = []

    res = opt.minimize(
        cost_function,
        theta,
        args=(X, y, lamd),
        method=Method,
        jac=gradient,
    )

    summary = dict(
        result=res,
        theta=res.x,
        success=res.success,
        cost_history=list(J_history),
        name=Method,
        lamd=lamd,
    )
    return summary

## Training One Class Data

In [None]:
# Score every sample under each classifier stored in theta_scipy_opt, then
# take the position of the highest score as the predicted class.
# NOTE(review): theta_scipy_opt is assigned in a later cell of this notebook;
# that cell has to run before this one.
hyp_prd = hypothesis(theta_scipy_opt.T, X_train)
y_prd   = np.argmax(hyp_prd, axis=1)

In [None]:
# True labels on the same 0..9 scale as y_prd: copy y_ori and relabel 10 as 0.
y_answer = y_ori.copy()
y_answer[y_answer == 10] = 0

In [None]:
# Summary of predictions against the true labels.
# NOTE(review): classification_report is presumably sklearn.metrics' — confirm.
print(classification_report(y_answer, y_prd))

#### np.mean()

In [None]:
# Fraction of samples whose predicted label equals the true label.
corret = [int(a == b) for a, b in zip(y_answer, y_prd)]
accu = sum(corret) / float(len(corret))
print('Accuracy = {0}%'.format(accu * 100))

#### model.score()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# One-vs-rest, L2-penalised model. X_train already carries a column of ones,
# so no separate intercept is fitted.
model = LogisticRegression(penalty='l2', C=1.0, fit_intercept=False, multi_class='ovr')
model.fit(X_train, y_ori)

In [None]:
# Shape of the fitted coefficient matrix.
model.coef_.shape

In [None]:
# Score of the fitted model on the same data it was trained on.
model.score(X_train, y_ori)

### choose best lamd

In [None]:
# 0/1 target for the single "digit 0" classifier: 1 wherever the label is 10.
y_matrix = [[int(label == 10) for label in y_ori]]
y_vector_for_accuary_digit_0 = np.array(y_matrix).reshape((-1,))

In [None]:
# Confirm the target vector is 1-D.
y_vector_for_accuary_digit_0.shape

In [None]:
# Sigmoid outputs of the stored digit-0 parameters for every row of X_train.
# NOTE(review): res_digit_0 is not assigned in any visible cell — confirm where it comes from.
hyp_digit_0 = hypothesis(res_digit_0["theta"], X_train)

In [None]:
# Round the sigmoid outputs to hard 0/1 predictions before reporting on them.
print(classification_report(y_vector_for_accuary_digit_0, hyp_digit_0.round().astype(int)))

## Training all Class Data

    Since there are 10 classes, you will need to train 10 separate logistic regression classifiers, one for each of the K classes in our dataset

In [None]:
# Shape of the 0/1 target matrix (one row per classifier).
y_train.shape

In [None]:
# Display the 0/1 target matrix itself.
y_train

In [None]:
# Train one binary classifier per row of y_train; every run starts from the
# zero vector and uses BFGS with lamd=1.
theta_scipy_opt = []
for i in range(10):
    res = scipy_opt_minimize(theta_init_zero, X_train, y_train[i], "BFGS", lamd=1)
    theta_scipy_opt.append(res["theta"])

# Stack the per-classifier parameter vectors into one array, one row each.
theta_scipy_opt = np.array(theta_scipy_opt)

### Accuracy

#### classification_report

In [None]:
# Shape of the stacked parameter array.
theta_scipy_opt.shape

In [None]:
def form_data_for_visualize(X, y):
    """Transpose every 20x20 image in ``X`` and return it flattened again.

    Parameters
    ----------
    X : iterable of arrays
        Each item holds 400 values and is treated as a 20x20 image.
    y : any
        Labels; passed through unchanged.

    Returns
    -------
    X_img : array
        One 400-value row per image, holding the transposed image.
    y_img : the same object as ``y``.
    """
    # For this dataset the images need a transpose to get the orientation
    # right; each one is then flattened again to keep the vector form.
    X_img = np.array([im.reshape((20, 20)).T.ravel() for im in X])

    return X_img, y

###### visualize all data

In [None]:
def visualize_data(X, y):
    """Show a 10x10 grid of 100 randomly chosen rows of ``X`` as images.

    Parameters
    ----------
    X : 2-D array
        One flattened square image per row; the side length is taken as
        ``int(sqrt(X.shape[1]))``.
    y : any
        Not used; kept so existing calls keep working.
    """
    img_size = int(np.sqrt(X.shape[1]))

    # Draw the row indices from the actual number of rows instead of a fixed
    # 5000, so the function also works for inputs of a different size.
    # Sampling is with replacement (np.random.choice default).
    sample_idx = np.sort(np.random.choice(X.shape[0], 100))
    sample_img = X[sample_idx, :]

    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
    # ravel() walks the grid row by row, matching sample number 10 * r + c.
    for ax, img in zip(ax_array.ravel(), sample_img):
        ax.matshow(img.reshape((img_size, img_size)), cmap=mpl.cm.binary)  # Greys_r
        ax.set_xticks([])
        ax.set_yticks([])

In [None]:
# Re-orient the raw pixels for display, then show a random 10x10 sample grid.
X_img, y_img = form_data_for_visualize(X_ori, y_ori)
visualize_data(X_img, y_img)