In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import cvxopt.solvers
from cvxopt import matrix, solvers

In [2]:
data = pd.read_csv("data.txt", sep = ',')
data['Y'] = (data['Y']  == 'P').astype(int)

In [3]:
X = np.array(data.drop('Y', axis=1))
y = np.array(data['Y'])

In [4]:
train_ratio = 0.8
train_size = int(len(y) * train_ratio)
X_train = X[:train_size]
X_test = X[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

In [5]:
rho = 1.e-4
eps = 1.e-5
tol = 1.e-5

In [6]:
def g(t, sigma):
  return np.tanh(t*sigma)

def dg(t, sigma):
    return sigma*(1-(g(t, sigma))**2)

In [7]:
def softmax(v):
  return np.exp(v) / np.sum(np.exp(v))

In [8]:
def predict(omega, x, N, H, sigma):
  v, w, b = get_param(omega, x, N, H)
  z = x.T
  for i in range(H):
    z = g(np.dot(w[i], z) + b[i], sigma)
  y = softmax(np.dot(v.T, z))
  return (y > 0).astype(int)

In [9]:
def loss(omega, x, y, N, H, sigma):
  y_pred = predict(omega, x, N, H, sigma)
  p = softmax(y_pred)
  loss_val = rho * np.linalg.norm(omega)**2
  if (p < 1).all() and (p > 0).all():
    loss_val += - np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / len(y)
  return loss_val

In [10]:
def get_param(omega, x, N, H):
  w, b, n = [], [], x.shape[1]
  v = omega[:N].reshape(N, 1)
  w.append(omega[N:N*(n+1)].reshape(N, n))
  b.append(omega[N*(n+1):N*(n+2)].reshape(N, 1))
  omega = omega[N*(n+2):]
  for i in range(H-1):
      w.append(omega[:N**2].reshape(N, N))
      b.append(omega[N**2:N**2 + N].reshape(N, 1))
      omega = omega[N:]
  return v, w, b

In [24]:
def omega_init(x, N, H, sigma):
  omega = np.random.rand(N*(n+2) + (H-1)*N*(N+1)) - 0.5
  return omega

In [12]:
def dE_dz(p, y, j):
  return (p[j] * np.sum(y) - y[j] + p[j] * np.sum((1-y) * p / (p - 1)) - p[j] * (1 - y[j]) / (p[j] - 1)) / len(y)

def dz_dz(w, b, z, k, j, t, i, sigma): # dz_kji / dz_(k-1)jt
  return dg(z[k-1][j][t] * w[k-1][t][i] + b[k-1][i], sigma) * w[k-1][t][i]

def dz_dw(w, b, z, k, j, t, i, sigma): # dz_kji / dz_(k-1)ti
  return dg(z[k-1][j][t] * w[k-1][t][i] + b[k-1][i], sigma) * z[k-1][j][t]

def dz_db(w, b, z, k, j, i, sigma):
  d = 0
  for t in range(len(z[k-1][j])):
    d += dg(z[k-1][j][t] * w[k-1][t][i] + b[k-1][i], sigma)
  return d

def dz_k(v, w, b, z, k, j, t, sigma):
  if k == len(z) - 1:
    return v[t]
  return np.sum([dz_k(v, w, b, z, k+1, j, i, sigma) * dz_dz(w, b, z, k, j, t, i, sigma) for i in range(len(z[k][j]))]) / n

def gradient(omega, x, y, N, H, sigma):
  v, w, b = get_param(omega, x, N, H)
  y_hat = predict(omega, x, N, H, sigma)
  p = softmax(y_hat)
  n = len(y)
  z = [x]
  for i in range(H):
    w[i] = w[i].T
    b[i] = b[i].reshape(-1)
    z.append(g(z[-1] @ w[i] + b[i], sigma))
  dE = (p * np.sum(y) - y + p * np.sum((1 - y) * p / (p - 1)) - (1 - y) * p / (p - 1)) / n
  dv = (dE @ z[-1]) / n + (2 * rho * v).T
  print(dv)
  dw, db = [], []
  for h in range(H):
    dw.append([np.sum([dE_dz(p, y, j) * dz_k(v, w, b, z, h+1, j, i, sigma) * dz_dw(w, b, z, h+1, j, t, i, sigma) for j in range(n)])/n + 2 * rho * w[h][t][i] for t in range(len(w[h])) for i in range(len(w[h][t]))])
    db.append([np.sum([dE_dz(p, y, j) * dz_k(v, w, b, z, h+1, j, i, sigma) * dz_db(w, b, z, h+1, j, i, sigma) for j in range(n)])/n + 2 * rho * b[h][i] for i in range(len(b[h]))])
  return dv, dw, db

In [13]:
def pack_param(dL_dv, dL_dw, dL_db):
    grad = np.concatenate((dL_dv.flatten(),))
    for dw in dL_dw:
        grad = np.concatenate((grad, dw.flatten()))
    for db in dL_db:
        grad = np.concatenate((grad, db.flatten()))
    return grad

def check_grad_loss(omega, x, y, N, H, sigma, epsilon=1e-6):
    grad = grad_loss(omega, x, y, N, H, sigma)
    num_grad = np.zeros_like(grad)
    for i in range(len(omega)):
        omega[i] += epsilon
        num_grad[i] = (loss(omega, x, y, N, H, sigma) - loss(omega - 2 * epsilon, x, y, N, H, sigma)) / (2 * epsilon)
        omega[i] -= epsilon
    return np.allclose(grad, num_grad)

def approx_grad_loss(omega, x, y, N, H, sigma, epsilon=1e-6):
    num_grad = np.zeros_like(omega)
    for i in range(len(omega)):
        omega[i] += epsilon
        num_grad[i] = (loss(omega, x, y, N, H, sigma) - loss(omega - 2 * epsilon, x, y, N, H, sigma)) / (2 * epsilon)
        omega[i] -= epsilon
    return num_grad

def grad_loss(omega, x, y, N, H, sigma):
    v, w, b = get_param(omega, x, N, H)
    z = [x.T]
    for i in range(H):
        z.append(g(np.dot(w[i], z[-1]) + b[i], sigma))
    y_pred = np.dot(v.T, z[-1])
    p = softmax(y_pred)
    
    dL_dv = np.dot(z[-1], (p - y).T) / len(y) + 2 * rho * v
    dL_dw = []
    dL_db = []
    dL_dz = np.dot(v, p - y)
    for i in range(H-1, -1, -1):
        dL_db.insert(0, np.sum(dL_dz * dg(z[i+1], sigma), axis=1) / len(y))
        dL_dw.insert(0, np.dot(dL_dz * dg(z[i+1], sigma), z[i].T) / len(y) + 2 * rho * w[i])
        dL_dz = np.dot(w[i].T, dL_dz * dg(z[i+1], sigma))
    
    return pack_param(dL_dv, dL_dw, dL_db)


In [14]:
def dE(omega, x, y, N, H, sigma):
    approx_gradient = np.zeros(omega.shape)
    delta_omega = np.zeros(omega.shape)
    for i in range(len(omega)):
        delta_omega[i] = eps
        approx_gradient[i] = (loss(omega + delta_omega, x, y, N, H, sigma)-loss(omega - delta_omega, x, y, N, H, sigma))/(2*eps)
        delta_omega[i] = 0
    return approx_gradient

In [57]:
n = len(X_train[0])
N = 3
H = 2
sigma = 0.5
omega = omega_init(X_train, N, H, sigma)
print(loss(omega, X_train, y_train, N, H, sigma))
y_hat = predict(omega, X_train, N, H, sigma)

0.8125542005392987


In [62]:
res = minimize(loss, omega, args=(X_train, y_train, N, H, sigma), method='L-BFGS-B', jac=approx_grad_loss)
print(res)

      fun: 0.0006561105307944632
 hess_inv: <66x66 LbfgsInvHessProduct with dtype=float64>
      jac: array([-4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
       -4167705.09816098, -4167705.09816098, -4167705.09816098,
  

In [63]:
omega = res.x
predictions = predict(omega, X_test, N, H, sigma).reshape(-1)

In [64]:
from sklearn.metrics import accuracy_score
print(predictions, y_test)
print(accuracy_score(predictions, y_test))
print((predictions == 1).sum(), (predictions == 0).sum())

[0 0 0 ... 0 0 0] [0 0 0 ... 0 0 0]
0.958
0 4000


In [65]:
accuracy_score(4000*[0], y_test)

0.958