###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

This notebook runs a single forward and backward pass of a softmax regressor built as a computational graph, and compares the resulting cost and gradients against analytical computations.

In [1]:
import numpy as np
from graphs.core import Param
from graphs.core import DataHolder
from graphs.core import Graph
from graphs.core import Operation
from graphs.nodes import linear_node
from graphs.nodes import bias_node
from graphs.nodes import softmax_node
from graphs.nodes import mce_node
from graphs.nodes import softmax_mce_node
from sklearn import datasets

In [2]:
# Iris data: feature matrix plus the integer class labels as a column vector.
dataset = datasets.load_iris()
predictors = dataset.data
responses = dataset.target.reshape(-1, 1)
# m samples, d features per sample, n distinct classes.
m, d = predictors.shape
n = np.unique(responses).size

In [3]:
# Leaf nodes of the graph: two data placeholders and two trainable parameters.
# NOTE(review): the seed call below is commented out and `rnd` is not among the
# imports of this notebook. If Param values are initialized randomly, the numbers
# printed further down will differ between runs -- TODO confirm and seed explicitly.
# rnd.seed(1)
# Placeholders; they receive the predictors and the responses when the graph is fed.
X_node = DataHolder()
y_node = DataHolder()
# Weights, created with (d,n): d features by n classes (presumably the parameter shape).
w_node = Param((d,n))
# Bias, created with (1,n): one entry per class (presumably the parameter shape).
b_node = Param((1,n))

In [4]:
# Chain the nodes from the inputs to the cost J_node.
# Judging by the node names (not verifiable from this notebook alone):
# linear map of X by w -> add bias b -> softmax -> multiclass cross-entropy against y.
r_node = linear_node(X_node,w_node)
z_node = bias_node(r_node,b_node)
# Alternative kept for reference: presumably a single node fusing softmax and the
# cross-entropy cost, replacing the two nodes below -- TODO confirm.
# J_node = softmax_mce_node(z_node,y_node)
h_node = softmax_node(z_node)
J_node = mce_node(h_node,y_node)

In [5]:
# Build the graph from the cost node, initialize it, and bind the data:
# predictors go to X_node and responses to y_node.
# The chained call is the last expression of the cell, so its result
# (a Graph, per the recorded output) is displayed.
g = Graph()
g.build(J_node).initialize().feed({X_node:predictors, y_node:responses})

<graphs.core.Graph at 0x1f78d8bbc08>

In [6]:
# Leftover timing magic; it is disabled, so the pass below runs exactly once.
# %%timeit
# One forward pass followed by one backward pass. The cells below read
# J_node.value and the parameters' .gradient, which this call presumably fills in.
g.forward().backward()

<graphs.core.Graph at 0x1f78d8bbc08>

In [7]:
def softmax(elements):
    """Row-wise softmax of a 2-D array.

    Parameters
    ----------
    elements : ndarray of shape (rows, classes)
        Scores; each row is normalized independently.

    Returns
    -------
    ndarray of shape (rows, classes)
        Non-negative values whose rows each sum to 1.
    """
    # Subtracting each row's maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    row_max = elements.max(axis=1).reshape(-1, 1)
    unnormalized = np.exp(elements - row_max)
    row_total = unnormalized.sum(axis=1).reshape(-1, 1)
    return unnormalized / row_total

def compute_cost(w,X,y):
    """Summed negative log-likelihood of a softmax regressor.

    Parameters
    ----------
    w : ndarray
        Weights; reshaped internally to (d, -1), where d is the number of
        columns of X.
    X : ndarray of shape (m, d)
        Inputs, one sample per row.
    y : ndarray
        Integer class label of every sample (flattened before use).

    Returns
    -------
    float
        Minus the sum, over all samples, of the log-probability assigned to
        the sample's own class (a sum, not a mean).
    """
    n_samples, n_features = X.shape
    weights = w.reshape((n_features, -1))
    probabilities = softmax(X.dot(weights))
    # Pick, for every sample, the probability of its labelled class.
    true_class_prob = probabilities[range(n_samples), y.flatten()]
    return -np.log(true_class_prob).sum()

def compute_grad(w,X,y):
    """Analytical gradient of the summed softmax cross-entropy cost.

    Parameters
    ----------
    w : ndarray
        Weights; reshaped internally to (d, n), where d is the number of
        columns of X.
    X : ndarray of shape (m, d)
        Inputs, one sample per row.
    y : ndarray
        Integer class label of every sample (flattened before use).

    Returns
    -------
    ndarray of shape (d * n,)
        Gradient with respect to the (d, n) weight matrix, flattened row by row.
    """
    # Reshape the weights into d,n
    m, d = X.shape
    W = w.reshape((d, -1))
    n = W.shape[1]
    # Indicator function for y (one-hot encoding). Rows are indexed with
    # range(m), as in compute_cost, so the row count always follows X.
    indicator = np.zeros((m, n))
    indicator[range(m), y.flatten()] = 1
    prob = softmax(X.dot(W))
    # d(cost)/dW = X^T (prob - indicator): a single matrix product replaces
    # the former column-by-column Python loop over the classes.
    grad = X.T.dot(prob - indicator)
    return grad.flatten()

In [8]:
# Fold the bias into the weights for the analytical version: a leading column
# of ones in X pairs with the bias values stacked on top of the weight values.
X = np.concatenate((np.ones((m, 1)), predictors), axis=1)
w = np.vstack([b_node.value, w_node.value])
y = y_node.value

In [9]:
# The same cost computed two ways: analytically and by the graph's forward pass.
conventional_cost = compute_cost(w,X,y)
print('Conventional:', conventional_cost)
print('Graph:', J_node.value)
# Check the agreement instead of relying on a visual comparison of the printout.
np.testing.assert_allclose(J_node.value, conventional_cost)

Convetional: 953.8380680633793
Graph: 953.8380680633793


In [10]:
# Analytical gradient as a flat vector. X starts with the column of ones,
# so the leading entries are the bias gradients, followed by the weight gradients.
print('Conventional:', compute_grad(w, X, y).ravel())

Conventional: [ -41.29610729  -49.99950772   91.29561501 -196.22810625 -296.79761747
  493.02572372 -146.42605174 -138.49856304  284.92461479  -30.79117725
 -212.99911488  243.79029214    2.22312269  -66.29981066   64.07668796]


In [11]:
# Gradients from the backward pass, laid out like the analytical vector:
# bias entries first, then the weights.
print('Graph:', np.concatenate([node.gradient.flatten() for node in (b_node, w_node)]))

Graph: [ -41.29610729  -49.99950772   91.29561501 -196.22810625 -296.79761747
  493.02572372 -146.42605174 -138.49856304  284.92461479  -30.79117725
 -212.99911488  243.79029214    2.22312269  -66.29981066   64.07668796]
