In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from scipy.io import loadmat
import pymc3 as pm



In [13]:
def standardize(x):
    std = np.maximum(np.std(x, axis=0), 1/np.sqrt(len(x)))
    mean = np.mean(x, axis=0)
    return (x - mean) / std

def relu(x):
    return np.maximum(0, x)

def one_hot_encoding(label, n_class):
    y = np.zeros([len(label), n_class])
    for i in range(len(label)):
        y[i, label[i]] = 1
    return y

def softmax(x):
    return np.exp(x) / np.repeat((np.sum(np.exp(x), axis=1))[:, np.newaxis], len(x[0]), axis=1)

In [10]:
n_node = 10
n_iter = 1000
w_range = [-1, 1] # range of random weights
b_range = [0, 1] # range of random biases
alpha_1 = 10**(-5) # Gamma distribution parameter
alpha_2 = 10**(-5)
alpha_3 = 10**(-5)
alpha_4 = 10**(-5)
tol = 1.0e-3

dataset = loadmat('coil20.mat')
label = np.array([dataset['Y'][i][0] - 1 for i in range(len(dataset['Y']))])
data = dataset['X']
n_class = 20

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=42)
# kf = KFold(10, True, 1)
val_acc = []
max_index = -1

X_train = standardize(X_train)
n_sample, n_feature = np.shape(X_train)
y = one_hot_encoding(y_train, n_class)

weights = (w_range[1] - w_range[0]) * np.random.random([n_feature, n_node]) + w_range[0]
bias = (b_range[1] - b_range[0]) * np.random.random([1, n_node]) + b_range[0]

### 1) Initialization
a) Compute $\mathbf{D}$ where $\mathbf{D}=\mathbf{[H,X]}$ <br>

In [4]:
h = relu(np.dot(X_train, weights) + np.dot(np.ones([n_sample, 1]), bias))
d = np.concatenate([h, X_train], axis=1)
# d = np.concatenate([d, np.ones_like(d[:, 0:1])], axis=1) # concat column of 1s

b) Compute $\mathbf{D}^T\mathbf{y}, \mathbf{D}^T\mathbf{D}$, and its eigenvalues $\lambda^0_1,\dots,\lambda^0_B$ <br>

In [5]:
dT_y = np.dot(d.T, y)
dT_d = np.dot(d.T, d)
eigen_val = np.linalg.eigvalsh(dT_d)
eigen_val

array([-1.22371836e-11, -2.28070375e-12, -2.09044845e-12, ...,
        2.23715050e+05,  3.83469564e+05,  7.23848649e+05])

c) Initialize $\sigma^2$ and $\gamma$ to default values <br>
Evidence approximation (MAP estimation on the posterior of the hyper-parameters):
$$p(\gamma)=\text{Gamma}(\gamma \mid \alpha_1, \alpha_2)$$
$$p(\sigma^2)=\text{Gamma}(\sigma^{-2} \mid \alpha_3, \alpha_4)$$
$$ \sigma_*^2, \gamma_*^2 = \arg\max \left\{ \int_{\mathbf{R}^B} p(\mathbf{y} \mid \mathbf{X}, \mathbf{\beta}, \sigma^2)p(\mathbf{\beta} \mid \gamma) p(\gamma)p(\sigma^2)\,d\beta \right\}$$

In [7]:
# Evidence approximation
gamma = pm.Model()
with gamma:
    prec = pm.Gamma('prec', alpha=alpha_1, beta=alpha_2)
    var = pm.Gamma('var', alpha=alpha_3, beta=alpha_4)
    beta = pm.Normal('beta', mu=0, tau=prec, shape=(n_feature + n_node, n_class))
    y_obs = pm.Normal('y_obs', mu=pm.math.dot(d, beta), tau=var, observed=y)
    
map_estimate =  pm.find_MAP(model=gamma)







In [9]:
prec, var, beta = map_estimate['prec'].item(0), map_estimate['var'].item(0), map_estimate['beta']
print('prec: ', prec)
print('var: ', var)
print('beta: ', beta)

prec:  1209.5690847757458
var:  18375.41202149414
beta:  [[ 1.42841716e-03  3.76067839e-03  3.77665433e-03 ...  1.49610113e-03
  -4.94688282e-03 -8.51206940e-05]
 [-2.24157842e-03  3.22316929e-03  1.31172151e-03 ... -3.11610311e-03
  -1.86679358e-03  1.53117666e-03]
 [-1.96457684e-03 -2.95730023e-03 -3.44664079e-03 ...  1.43150120e-03
   1.79082857e-04  1.34722087e-03]
 ...
 [ 2.12737483e-02  1.70308621e-02  1.31665476e-02 ...  1.97599765e-02
  -4.18342879e-02 -1.17895118e-02]
 [ 1.28352168e-02  4.08538074e-02  6.00032460e-03 ...  1.21249575e-03
  -1.96963041e-02 -7.09148986e-03]
 [ 1.31309459e-02  3.98866744e-02  9.54498733e-03 ... -3.60943991e-03
  -2.55079821e-02 -6.65720825e-03]]


### 2) Posterior update
a) Computer posterior mean $\mathbf{m}$ as in $$\mathbf{m}=\frac{1}{\sigma^2}\Sigma\mathbf{D}^T\mathbf{y}$$ <br>
b) Computer posterior covariance $\Sigma$ as in $$\Sigma^{-1}=\gamma\mathbf{I}+\frac{1}{\sigma^2}\mathbf{D}^T\mathbf{D}$$

### 3) Hyper-parameters update
a) Compute updated eigenvalues $$\lambda_i=\frac{1}{\sigma^2}\lambda^0_i,\qquad i=1,\dots,B$$ <br>

b) Update $\gamma$ as in $$\gamma=\frac{\delta + 2\alpha_1}{\|\mathbf{m}\|^2_2 + 2\alpha_2}$$
c) Update $\sigma^2$ as in $$\sigma^2=\frac{\|\mathbf{y}-\mathbf{D}\beta\|^2_2+\alpha_4}{N-\delta+2\alpha_3}$$
where $\delta$ is defined as $$\delta=\sum^B_{i=1}\frac{\lambda_i}{\gamma+\lambda_i}$$

In [11]:
mean_prev = None

for iter_ in range(n_iter):
    # Posterior update
    # update posterior covariance
    covar = np.linalg.inv(prec * np.identity(dT_d.shape[1]) + dT_d / var)
    # update posterior mean
    mean = np.dot(covar, dT_y) / var

    # Hyperparameters update
    # update eigenvalues
    lam = eigen_val / var
    # update precision and variance 
    delta = np.sum(np.divide(lam, lam + prec))
    prec = (delta + 2 * alpha_1) / (np.sum(np.square(mean)) + 2 * alpha_2)
    var = (np.sum(np.square(y - np.dot(d, beta))) + alpha_4) / (n_sample + delta + 2 * alpha_3)

    # Check for convergence
    if iter_ != 0 and np.sum(np.abs(mean_prev - mean)) < tol:
        print("Convergence after ", str(iter_), " iterations")
        break
    mean_prev = np.copy(mean)

# Final Posterior update
# update posterior covariance
covar = np.linalg.inv(prec * np.identity(dT_d.shape[1]) + dT_d / var)
# update posterior mean
mean = np.dot(covar, dT_y) / var

print('Posterior mean: ', mean)
print('Posterior covariance: ', covar)
print('Precision: ', prec)
print('Variance: ', var)

Convergence after  17  iterations
Posterior mean:  [[ 1.69350329e-03  4.46255580e-03  3.49272768e-03 ...  2.11635349e-03
  -6.59539646e-03  3.61353008e-04]
 [-1.70940223e-03  2.46115771e-03  8.43553325e-04 ... -3.99594234e-03
  -1.06207711e-03  2.00993749e-03]
 [ 1.18596342e-03 -6.05819649e-03 -3.10780689e-03 ...  3.12524812e-03
   1.55826320e-03  1.42490002e-03]
 ...
 [ 5.20126389e-02 -4.23346659e-02  5.55808571e-02 ...  6.79668677e-02
  -1.40190191e-01 -3.09014476e-02]
 [ 9.80171046e-03  4.23311448e-02  1.93670024e-02 ... -3.86976670e-05
  -2.73713768e-02 -1.23023494e-02]
 [ 1.07989846e-02  4.04681096e-02  2.85074590e-02 ... -5.71810845e-03
  -4.42219636e-02 -1.18106632e-02]]
Posterior covariance:  [[ 6.89936171e-06  1.57516783e-06 -1.48509975e-06 ... -3.46087387e-06
  -4.96073629e-06 -3.57272619e-06]
 [ 1.57516783e-06  1.17384007e-05 -1.29846869e-06 ...  2.46596597e-05
   8.62558334e-07 -3.48471184e-06]
 [-1.48509975e-06 -1.29846869e-06  1.31969763e-05 ...  7.75706910e-06
   6.93156

In [14]:
# Prediction
X_test = standardize(X_test)
n_test_samp = len(X_test)
h_test = relu(np.dot(X_test, weights) + np.dot(np.ones([n_test_samp, 1]), bias))
d_test = np.concatenate([h_test, X_test], axis=1)
result = softmax(np.dot(d_test, beta))
result = np.argmax(result, axis=1)

In [15]:
acc = np.sum(np.equal(result, y_test))/len(y_test)
acc

0.8888888888888888