###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

This notebook runs a single forward and backward pass of a softmax regressor using a custom computational graph, and compares the results with analytical computations and with TensorFlow.

In [1]:
import numpy as np
from graphs.core import Param
from graphs.core import DataHolder
from graphs.core import Graph
from graphs.core import Operation
from graphs.nodes import linear_node
from graphs.nodes import bias_node
from graphs.nodes import softmax_node
from graphs.nodes import mce_node
from graphs.nodes import softmax_mce_node
from sklearn import datasets
import tensorflow as tf

### Loading dataset

In [2]:
# Load the iris dataset through scikit-learn's dataset helpers.
dataset = datasets.load_iris()
predictors, targets_flat = dataset['data'], dataset['target']
# Keep the labels as a column vector: one row per sample.
responses = targets_flat.reshape(-1, 1)
del targets_flat  # temporary only; keep the notebook namespace unchanged
# m: number of samples, d: number of features, n: number of distinct labels.
m, d = predictors.shape
n = np.unique(responses).size

### Creating custom computational graph
Here we use our custom toy implementations to perform one forward and one backward pass, computing the gradient.

In [3]:
# rnd.seed(1)
# NOTE(review): the seed call above is disabled, and `rnd` is not imported in
# the visible cells. If Param draws random initial values, the recorded
# numbers below will not reproduce exactly — confirm how Param initializes.

# Data holders: empty here; the arrays are bound later via g.feed({...}).
X_node = DataHolder()
y_node = DataHolder()
# Parameters built from (d, n) and (1, n) — presumably their shapes (TODO
# confirm); they are used below as the weight matrix and the bias row.
w_node = Param((d,n))
b_node = Param((1,n))

In [4]:
# Wire the graph node by node:
#   (X, w) -> linear_node -> (+ b) bias_node -> softmax_node -> (with y) mce_node
# NOTE(review): judging by the names and by the analytical/TensorFlow
# counterparts further down, these presumably compute X·w, X·w + b, the
# softmax probabilities, and the summed cross-entropy cost — confirm against
# graphs.nodes.
r_node = linear_node(X_node,w_node)
z_node = bias_node(r_node,b_node)
# Alternative kept for reference: one fused node taking the scores and the
# labels directly, instead of the two separate nodes used below.
# J_node = softmax_mce_node(z_node,y_node)
h_node = softmax_node(z_node)
J_node = mce_node(h_node,y_node)

In [5]:
# Build the graph that ends at the cost node, initialize it, and bind the two
# data holders to the iris arrays. The calls are chained, and the cell output
# shows the final call evaluates to the Graph object itself.
g = Graph()
g.build(J_node).initialize().feed({X_node:predictors, y_node:responses})

<graphs.core.Graph at 0x23384babf88>

In [6]:
# %%timeit
# (The magic above must stay on the first line of the cell to work when it is
# uncommented.)
# One forward pass followed by one backward pass. The comparison cells below
# then read J_node.value and the .gradient of b_node and w_node.
g.forward().backward()

<graphs.core.Graph at 0x23384babf88>

### Analytical implementations
Here we use the known formulas for softmax regression to compute the cost and the gradient. Notice that this is possible only because the graph for softmax regression is a really simple one. If we had a more complex graph, deriving these formulas analytically would be impossible (or at least not viable).

In [7]:
def softmax(elements):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating. This does
    not change the result mathematically but keeps ``np.exp`` from
    overflowing on large scores.

    Parameters
    ----------
    elements : np.ndarray
        Scores with one sample per row and one class per column.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``elements``; each row is the softmax of
        the corresponding input row.
    """
    row_max = np.max(elements, axis=1).reshape(-1, 1)
    unnormalized = np.exp(elements - row_max)
    row_total = np.sum(unnormalized, axis=1).reshape(-1, 1)
    return unnormalized / row_total

def compute_cost(w,X,y):
    """Negative log-likelihood of softmax regression, summed over samples.

    Parameters
    ----------
    w : np.ndarray
        Weights; reshaped internally to ``(d, -1)`` with ``d = X.shape[1]``.
    X : np.ndarray
        Design matrix with one sample per row.
    y : np.ndarray
        Integer class labels, one per sample; flattened before use.

    Returns
    -------
    float
        Minus the sum of the log-probabilities assigned to the true classes.
    """
    n_samples, n_features = X.shape
    weights = w.reshape((n_features, -1))
    probabilities = softmax(X.dot(weights))
    # Pick, for every sample, the probability given to its true class.
    true_class_prob = probabilities[range(n_samples), y.flatten()]
    return -(np.log(true_class_prob).sum())

def compute_grad(w,X,y):
    """Analytical gradient of the summed softmax cross-entropy cost.

    Parameters
    ----------
    w : np.ndarray
        Weights; reshaped internally to ``(d, n)`` with ``d = X.shape[1]``.
    X : np.ndarray
        Design matrix of shape ``(m, d)``, one sample per row.
    y : np.ndarray
        Integer class labels in ``[0, n)``, one per sample; flattened before
        use.

    Returns
    -------
    np.ndarray
        Gradient with respect to the weights, flattened row-major to a
        vector of length ``d * n``.
    """
    # Reshape the weights into (d, n)
    m, d = X.shape
    W = w.reshape((d, -1))
    n = W.shape[1]
    # Indicator function of the true class (one-hot encoding of y)
    indicator = np.zeros((m, n))
    indicator[np.arange(m), y.flatten()] = 1
    prob = softmax(X.dot(W))
    # Column c of the gradient is -sum_i X[i, :] * (indicator - prob)[i, c],
    # i.e. the whole gradient is X^T (prob - indicator). A single matrix
    # product replaces the explicit loop over classes.
    grad = X.T.dot(prob - indicator)
    return grad.flatten()

In [8]:
# Fold the bias into the weights so the analytical functions can treat it as
# one more weight row: prepend a column of ones to the predictors and stack
# the bias row on top of the weight matrix.
X = np.concatenate((np.ones((m, 1)), predictors), axis=1)
w = np.vstack((b_node.value, w_node.value))
# Labels exactly as they were fed to the custom graph.
y = y_node.value

### Building tensorflow graph
Here we use TensorFlow to create the same computational graph and perform a forward and backward pass, computing the gradient.

In [9]:
# Mirror the custom graph's inputs as constant tensors. The labels are
# flattened to 1-D integer class ids, which is the form passed to the sparse
# cross-entropy op in the next cell.
X_tensor = tf.convert_to_tensor(predictors, dtype=tf.float64)
y_tensor = tf.convert_to_tensor(responses.ravel(), dtype=tf.int64)
# Variables start from the custom graph's current parameter values, so both
# implementations evaluate the cost and gradient at the same point.
w_tensor = tf.Variable(w_node.value, dtype=tf.float64)
b_tensor = tf.Variable(b_node.value, dtype=tf.float64)

In [10]:
# Forward pass, run inside the tape context so that it can be differentiated
# afterwards.
with tf.GradientTape() as tape:
    r_tensor = tf.matmul(X_tensor, w_tensor)
    z_tensor = tf.add(r_tensor, b_tensor)
    # Per-sample losses are summed (not averaged), matching the summed
    # log-likelihood returned by compute_cost.
    J_tensor = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=z_tensor, labels=y_tensor))
# Backward pass: gradients come back in the order requested, bias first and
# then weights, which is the order used when they are printed below.
gradients = tape.gradient(J_tensor, [b_tensor, w_tensor])

### Comparisons!

In [11]:
# Cost computed by the three implementations. ravel() puts every value in the
# same 1-element array form so the printed lines are directly comparable.
# The first label is spelled 'Conventional:' to match the gradient cell.
print('Conventional:', compute_cost(w,X,y).ravel())
print('Graph (Custom):', J_node.value.ravel())
print('Tensorflow:', J_tensor.numpy().ravel())

Convetional: [584.72821298]
Graph (Custom): [584.72821298]
Tensorflow: [584.72821298]


In [12]:
# Analytical gradient as a flat vector. w stacks the bias row on top of the
# weights, so the leading entries belong to the bias.
print('Conventional:', np.ravel(compute_grad(w, X, y)))

Conventional: [  90.74210564  -49.93176266  -40.81034298  575.77571779 -296.43786207
 -279.33785572  259.1622831  -138.30173991 -120.86054318  463.91529332
 -212.79977319 -251.11552013  160.15901713  -66.24346762  -93.91554951]


In [13]:
# Bias gradient first, then the weight gradient, each flattened: the same
# layout as the analytical gradient.
print('Graph (Custom):', np.concatenate([node.gradient.flatten() for node in (b_node, w_node)]))

Graph (Custom): [  90.74210564  -49.93176266  -40.81034298  575.77571779 -296.43786207
 -279.33785572  259.1622831  -138.30173991 -120.86054318  463.91529332
 -212.79977319 -251.11552013  160.15901713  -66.24346762  -93.91554951]


In [14]:
# `gradients` holds the bias gradient followed by the weight gradient; flatten
# each and join them in that order.
print('Tensorflow:', np.concatenate([grad.numpy().flatten() for grad in gradients]))

Tensorflow: [  90.74210564  -49.93176266  -40.81034298  575.77571779 -296.43786207
 -279.33785572  259.1622831  -138.30173991 -120.86054318  463.91529332
 -212.79977319 -251.11552013  160.15901713  -66.24346762  -93.91554951]
