In [1]:
import numpy as np
try:
    from numbapro import autojit
except ImportError:
    # fallback to numba
    from numba import autojit
from cs231n.bench import init_benchmark_data, init_benchmark_model

In [2]:
# Sizes shared by the benchmark model and the benchmark data; both
# initializers below receive exactly these keyword arguments.
bench_params = {
    'input_size': 1000,
    'hidden_size': 30,
    'num_classes': 10,
    'num_inputs': 500,
}

# The same seed (9527) is passed to both initializers.
model = init_benchmark_model(rand_seed=9527, **bench_params)
X, y = init_benchmark_data(rand_seed=9527, **bench_params)

In [21]:
# Pull the two layers' weights and biases out of the model dict.
W1 = model['W1']
b1 = model['b1']
W2 = model['W2']
b2 = model['b2']

In [4]:
# First-layer pre-activation: matrix product of the inputs with W1,
# then the bias b1 is added.
dotprod1 = X.dot(W1)
dotprod1 = dotprod1 + b1
dotprod1.shape

(500, 30)

### ReLU

In [5]:
%%timeit
# ReLU variant 1: keep entries where dotprod1 > 0, use 0 everywhere else.
np.where(dotprod1 > 0, dotprod1, 0)

The slowest run took 7.70 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 26.1 µs per loop


In [13]:
%%timeit 
# ReLU variant 2: multiply elementwise by the boolean mask (dotprod1 > 0).
dotprod1 * (dotprod1 > 0)

10000 loops, best of 3: 34.1 µs per loop


### 1st Layer Forward

In [7]:
def bench_layer1_ff(X, W1, b1):
    """First-layer forward pass: affine transform followed by ReLU.

    Computes ``X.dot(W1) + b1`` and returns an array in which every entry
    that is not strictly positive has been replaced by 0 (via ``np.where``).
    """
    pre_activation = X.dot(W1) + b1
    is_positive = pre_activation > 0
    return np.where(is_positive, pre_activation, 0)

In [14]:
def bench_layer1_ff_alt(X, W1, b1):
    """First-layer forward pass using a mask multiplication for ReLU.

    Computes ``X.dot(W1) + b1`` and multiplies it elementwise by the
    boolean mask of its strictly positive entries.
    """
    pre_activation = X.dot(W1) + b1
    keep = pre_activation > 0
    return pre_activation * keep

In [12]:
%%timeit
# Time the np.where-based first-layer forward pass.
resp1 = bench_layer1_ff(X, W1, b1)

1000 loops, best of 3: 413 µs per loop


In [15]:
%%timeit
# Time the mask-multiplication first-layer forward pass.
resp1 = bench_layer1_ff_alt(X, W1, b1)

1000 loops, best of 3: 437 µs per loop


In [9]:
# NOTE(review): `autojit` is a legacy numbapro/numba decorator; newer numba
# releases expose `jit` instead — confirm against the installed version.
@autojit
def bench_layer1_ff_jit(X, W1, b1):
    """`autojit`-decorated first-layer forward pass (affine map, then ReLU).

    Computes the same expression as ``bench_layer1_ff``; the only
    difference is the ``@autojit`` decorator.
    """
    dotprod1 = X.dot(W1) + b1
    # Keep strictly positive entries; everything else becomes 0.
    return np.where(dotprod1 > 0, dotprod1, 0)

In [11]:
%%timeit
# Time the autojit-decorated first-layer forward pass.
resp1 = bench_layer1_ff_jit(X, W1, b1)

1000 loops, best of 3: 416 µs per loop


In [19]:
# First-layer response, computed in a regular cell so that `resp1` is
# available to the cells that follow.
resp1 = bench_layer1_ff(X=X, W1=W1, b1=b1)

In [23]:
# Second-layer pre-activation: matrix product of the first-layer response
# with W2, then the bias b2 is added.
dotprod2 = resp1.dot(W2)
dotprod2 = dotprod2 + b2

In [24]:
# Display the shape of the second-layer pre-activation.
np.shape(dotprod2)

(500, 10)