In [1]:
import compgraph as cg
from autodiff.reverse import gradient, check_gradient, visualize_AD

import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Silence all Python warnings for the rest of the session to keep cell output uncluttered.
warnings.filterwarnings('ignore')

# Visualizing Reverse AD
## Single Path to a Variable

$$
f(x) = \sin(2\ln x)
$$

In [None]:
# Build f(x) = sin(2 * ln(x)) at x = 2, giving every node an explicit name.
x = cg.variable(2, name='x')
ln = cg.log(x, name='ln')
# The constant 2 is created as its own named node and multiplied with `ln`.
mul = cg.constant(2, name='2') * ln
f = cg.sin(mul, name='sin')
print(f)

In [None]:
# x reaches z along several routes: y = 3x, f = x*y, g = log(y), z = y + f + g.
x = cg.variable(2, name='x')
y = 3 * x
f = x * y
g = cg.log(y)
z = y + f + g
print(z)
# Print whatever gradient() returns for the root node z.
print(gradient(z))

# Draw the graph rooted at z with the visualizer imported from autodiff.reverse.
visualize_AD(z, figsize=(11, 8))

In [None]:
# Same structure as a multi-path graph, but with y = log(x):
# f = x*y, g = log(y), z = y + f + g.
x = cg.variable(2, name='x')
y = cg.log(x)
f = x * y
g = cg.log(y)
z = y + f + g
print(z)
# Print whatever gradient() returns for the root node z.
print(gradient(z))

# Draw the graph rooted at z with the visualizer imported from autodiff.reverse.
visualize_AD(z, figsize=(11, 8))

In [None]:
# f(x) = x^2 + 2^x at x = 4: x is used once as a base and once as an exponent.
x = cg.variable(4, name='x')
# A single constant node is shared by both power terms.
two = cg.constant(2, name='2')
f = x ** two + two ** x
# Name the root (sum) node explicitly; presumably this is the label shown by
# visualize_AD — confirm against the visualizer.
f.name = 'add'

visualize_AD(f, figsize=(11, 8))

## Multi-variable Functions

$$
f(x,y,z) = \sin(x+y) + (xy)^z
$$

In [None]:
# For a function f: R^n -> R, a single application of reverse-mode AD computes the entire gradient.
# Here f(x, y, z) = sin(x + y) + (x*y)^z is built at (x, y, z) = (1, 2, 3).
x = cg.variable(1, name='x')
y = cg.variable(2, name='y')
z = cg.variable(3, name='z')

# Each intermediate node is given an explicit name after it is created.
add_0 = x + y
add_0.name = 'add_0'
mul = x*y
mul.name = 'mul'
powr = mul ** z
powr.name = 'pow'
sin = cg.sin(add_0)
sin.name='sin'
add_1 = sin + powr
add_1.name='add_1'

# f2 is the root node of the whole expression.
f2 = add_1

visualize_AD(f2, figsize=(11, 8))

# Gradient Checking

$$
f(x, y, z) = \sin\left(x^{y + z}\right) - 3z\ln\left(x^2y^3\right)
$$

In [8]:
def func(x, y, z):
    """Build the graph of f(x, y, z) = sin(x^(y + z)) - 3 z ln(x^2 y^3).

    Each argument is wrapped in a `cg.variable` named 'x', 'y' and 'z'
    respectively; the root node of the resulting expression is returned.
    """
    _x = cg.variable(x, name='x')
    _y = cg.variable(y, name='y')
    _z = cg.variable(z, name='z')

    # Sub-expressions are created in the same left-to-right order in which
    # the equivalent one-line formula would evaluate them.
    sine_term = cg.sin(_x ** (_y + _z))
    scaled_z = 3 * _z
    log_term = cg.log((_x ** 2) * (_y ** 3))
    return sine_term - scaled_z * log_term

# Evaluate f at one point and differentiate it with reverse-mode AD.
point = [0.5, 4, -2.3]
f = func(*point)
print(f)
g = gradient(f)

# Hand the AD partials, ordered like func's arguments, to check_gradient
# together with the same evaluation point.
ad_partials = [g[v] for v in ['x', 'y', 'z']]
check_passed = check_gradient(func, point, ad_partials)

print("Gradient Checking Result: {}".format(check_passed))
print("")
print("∂f/∂x = {}".format(g['x']))
print("∂f/∂y = {}".format(g['y']))
print("∂f/∂z = {}".format(g['z']))

19.433811705909566
Gradient Checking Result: True

∂f/∂x = 28.59729544270365
∂f/∂y = 4.971684551677847
∂f/∂z = -8.521081615041496


# Simple Neural Network on MNIST Dataset

## Loading and Preprocessing the Data

In [9]:
from sklearn.utils import shuffle
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import numpy as np

# Fetch the OpenML 'mnist_784' dataset as an (X, y) pair; as_frame=False asks
# for arrays rather than a DataFrame.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
label_binarizer = LabelBinarizer()

# Transform all greyscale values to the range [0, 1],
# 0 being black and 1 being white.
X_scaled = X / 255

# Show the first three raw labels ...
for i in range(3):
    print(y[i])
# Transform the categorical target labels into one-vs-all fashion.
y_binarized = label_binarizer.fit_transform(y)
# ... and the same three labels after binarization.
for i in range(3):
    print(y_binarized[i])

# Split the data into 80% training and 20% testing (fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binarized, test_size=0.2, random_state=42)

5
0
4
[0 0 0 0 0 1 0 0 0 0]
[1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0]


## Defining and Running the NN

In [10]:
from tqdm import tqdm, trange

import compgraph as cg
from autodiff.reverse import gradient

# Hyperparameters of the training run.
LEARNING_RATE = 0.01  # step size of each gradient-descent update
# Lowercase alias kept for code that refers to `learning_rate`; it is derived
# from LEARNING_RATE so the two names can never drift apart.
learning_rate = LEARNING_RATE
BATCH_SIZE = 32       # number of samples per mini-batch
ITERATIONS = 50000    # number of mini-batch updates to perform

def relu(x):
    """Rectified linear unit: keep `x` where it is greater than zero, use 0 elsewhere."""
    is_positive = x > 0
    return cg.where(is_positive, x, 0)

# He initialization: zero-mean normal weights with std sqrt(2 / fan_in) and
# zero biases, for a 784 -> 64 -> 10 network.
# The weights are drawn from a dedicated, seeded generator so that the initial
# parameters are the same on every Restart & Run All (the global NumPy RNG is
# left untouched).
WEIGHT_INIT_SEED = 42
weight_rng = np.random.RandomState(WEIGHT_INIT_SEED)

l1_weights = cg.variable(weight_rng.normal(scale=np.sqrt(2./784), size=(784, 64)), name='l1_w')
l1_bias = cg.variable(np.zeros(64), name='l1_b')
l2_weights = cg.variable(weight_rng.normal(scale=np.sqrt(2./64), size=(64, 10)), name='l2_w')
l2_bias = cg.variable(np.zeros(10), name='l2_b')


def nn(x):
    """Forward pass of the two-layer network.

    Applies the first affine layer followed by `relu`, then the second affine
    layer, and returns the second layer's raw outputs (no softmax is applied
    here).
    """
    hidden = relu(cg.dot(x, l1_weights) + l1_bias)
    return cg.dot(hidden, l2_weights) + l2_bias

# Sliding window of the most recent losses; used only for the progress-bar readout.
last1000_losses = []
progress_bar = trange(ITERATIONS)
training_set_pointer = 0

for i in progress_bar:
    # Take the next contiguous mini-batch (the last batch of a pass may be shorter).
    batch_end = training_set_pointer + BATCH_SIZE
    batch_x = X_train[training_set_pointer:batch_end]
    batch_y = y_train[training_set_pointer:batch_end]

    # If the training set is consumed, start from the beginning.
    training_set_pointer = 0 if batch_end >= len(y_train) else batch_end

    logits = nn(batch_x)
    loss = cg.softmax_cross_entropy(logits, batch_y)
    last1000_losses.append(loss)

    progress_bar.set_description(
        "Avg. Loss (Last 1k Iterations): {:.5f}".format(np.mean(last1000_losses))
    )

    # Drop the oldest entry so the window never holds more than 1000 losses.
    if len(last1000_losses) == 1000:
        last1000_losses.pop(0)

    grads = gradient(loss)

    # Gradient-descent step. The step size is read from the LEARNING_RATE
    # constant, so tuning that constant actually changes the update.
    # The four updates are kept as explicit augmented assignments because the
    # rebinding behaviour of `-=` on cg variables is not visible from here.
    l1_weights -= LEARNING_RATE * grads['l1_w']
    l2_weights -= LEARNING_RATE * grads['l2_w']
    l1_bias -= LEARNING_RATE * grads['l1_b']
    l2_bias -= LEARNING_RATE * grads['l2_b']

Avg. Loss (Last 1k Iterations): 0.00007: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [03:32<00:00, 235.63it/s]


## Testing the NN's Accuracy on the Held-Out Test Data

In [12]:
def softmax(x, axis):
    """Softmax of `x` along `axis`.

    The maximum along `axis` is subtracted before exponentiating, and the
    exponentials are then divided by their sum along the same axis.
    """
    shifted = x - cg.max(x, axis=axis, keepdims=True)
    exponentials = cg.exp(shifted)
    return exponentials / cg.sum(exponentials, axis=axis, keepdims=True)

# Run the network on the held-out test split and apply softmax along the last axis.
logits = nn(X_test)
probabilities = softmax(logits, axis=-1)
# Predicted / true class = index of the largest entry along the last axis.
predicted_labels = np.argmax(probabilities, axis=-1)
true_labels = np.argmax(y_test, axis=-1)
# Fraction of test samples whose predicted class equals the true class.
accuracy = np.mean(predicted_labels == true_labels)

print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 97.53%
