In [1]:
import numpy as np
import theano.tensor as T
from theano import function

### Adding two scalars

In [2]:
# Declare two symbolic double-precision scalars in one call; they carry
# no values yet, only a name and a type.
x, y = T.dscalars('x', 'y')
# Build the symbolic sum and compile it into a callable with inputs
# (x, y) and output z.
z = x + y
f = function([x, y], z)

In [3]:
# Call the compiled function with concrete values for x and y.
f(2,3)

array(5.0)

In [4]:
# The result is floating point, so compare with a tolerance instead of ==.
np.allclose(f(16.3,12.1), 28.4)

True

In [5]:
from theano import pp
# pp pretty-prints the symbolic expression that z stands for.
print (pp(z))

(x + y)


In [6]:
# Alternatively, evaluate the expression directly with `eval`, passing a
# dict that maps each symbolic input to a value, without defining a
# function ourselves.

np.allclose(z.eval({x:16.3, y:12.1}), 28.4)

True

### Adding two matrices

In [7]:
# Same addition as for scalars, this time on double-precision matrices.
x, y = T.dmatrices('x', 'y')
z = x + y
f = function([x, y], z)

In [8]:
# Nested lists are accepted as matrix inputs; the sum is element-wise.
f([[1, 2], [3, 4]], [[10, 20], [30, 40]])

array([[ 11.,  22.],
       [ 33.,  44.]])

### Logistic function

In [10]:
import theano
x = T.dmatrix('x')
# Logistic (sigmoid) function s(x) = 1 / (1 + exp(-x)), written symbolically.
s = 1 / (1 + T.exp(-x))
logistic = function([x], s)
# The function is applied to every matrix entry independently (element-wise).
logistic([[0, 1], [-1, -2]])

array([[ 0.5       ,  0.73105858],
       [ 0.26894142,  0.11920292]])

In [12]:
# Cross-check: the logistic function can also be written as
# (1 + tanh(x / 2)) / 2, so the same inputs should give the same values
# as the `logistic` function compiled earlier.
s2 = (1 + T.tanh(x / 2)) / 2
logistic2 = function([x], s2)
logistic2([[0, 1], [-1, -2]])

array([[ 0.5       ,  0.73105858],
       [ 0.26894142,  0.11920292]])

In [13]:
# Computing more than one thing at the same time: a single compiled
# function can return several outputs.

a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(diff)
diff_squared = diff**2
# Passing a list of outputs makes f return a list of arrays in this order.
f = function([a, b], [diff, abs_diff, diff_squared])

In [14]:
# Returns [a - b, |a - b|, (a - b)**2] for the two input matrices.
f([[1, 1], [1, 1]], [[0, 1], [2, 3]])

[array([[ 1.,  0.],
        [-1., -2.]]), array([[ 1.,  0.],
        [ 1.,  2.]]), array([[ 1.,  0.],
        [ 1.,  4.]])]

In [15]:
from theano import In
# `In` attaches extra properties, such as a default value, to a function input.

x, y = T.dscalars('x', 'y')
z = x + y
f = function([x, In(y, value=1)], z)
# y is omitted in this call, so its default value of 1 is used.
f(33)

array(34.0)

In [16]:
f(33, 2) # y given explicitly, overriding its default of 1

array(35.0)

In [17]:
x, y, w = T.dscalars('x', 'y', 'w')
z = (x + y) * w
# y defaults to 1 and w defaults to 2; `name` gives w a keyword
# ('w_by_name') under which callers can override it.
f = function([x, In(y, value=1), In(w, value=2, name='w_by_name')], z)

In [18]:
# Both defaults apply: (33 + 1) * 2.
f(33)

array(68.0)

In [19]:
# Override w through its keyword name; y keeps its default: (33 + 1) * 1.
f(33, w_by_name=1)

array(34.0)

### using shared variables

In [20]:
from theano import shared
state = shared(0) # shared variable holding the running total, initialized to zero
inc = T.iscalar('inc')
# Each call returns the value `state` had before the call and then applies
# the update state <- state + inc.
accumulator = function([inc], state, updates=[(state, state+inc)])

In [21]:
# A shared variable's value can be read with .get_value() and
# overwritten with .set_value().

# Function-call form of print: identical output on Python 2, and it still
# works on Python 3 or once `print_function` has been imported.
print(state.get_value())

0


In [22]:
# Returns the state from before the call; the increment is a side effect.
accumulator(1)

array(0)

In [23]:
# Function-call form of print so the cell also runs on Python 3.
print(state.get_value())

1


In [24]:
# Again returns the previous state, then adds 300 to it.
accumulator(300)

array(1)

In [25]:
# Function-call form of print so the cell also runs on Python 3.
print(state.get_value())

301


### Using random numbers

In [26]:
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=234)
rv_u = srng.uniform((2,2)) # random stream of 2x2 matrices drawn from a uniform distribution
rv_n = srng.normal((2,2)) # random stream of 2x2 matrices drawn from a normal distribution
f = function([], rv_u)
# no_default_updates=True: calling g does not update rv_n's random number
# generator, so the generator state is not affected by the call.
g = function([], rv_n, no_default_updates=True)    
# rv_u appears three times in one expression.
# NOTE(review): per the Theano docs a random variable is drawn at most once
# per function execution, which is why the result should be ~0 — confirm.
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

In [27]:
# f was compiled with default updates, so each call advances the generator.
f_val0 = f()
f_val1 = f()  # different numbers from f_val0

In [28]:
# g was compiled with no_default_updates=True, so its generator state
# is never advanced between calls.
g_val0 = g()  # different numbers from f_val0 and f_val1
g_val1 = g()  # same numbers as g_val0!

In [29]:
from __future__ import print_function
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.tensor.shared_randomstreams import RandomStreams

### A Real Example: Logistic Regression

In [32]:

rng = np.random

# Problem size and length of training.
N = 400                 # training sample size
feats = 784             # number of input variables
training_steps = 10000

# Synthetic dataset D = (input_values, target_class): Gaussian inputs
# paired with random 0/1 class labels.
D = (rng.randn(N, feats), rng.randint(0, 2, size=N))

# Symbolic placeholders for a batch of inputs and the matching targets.
x, y = T.matrix("x"), T.vector("y")

# The model parameters are shared variables so that they keep their
# values between training iterations (updates): a randomly initialized
# weight vector and a bias term that starts at zero.
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0.0, name="b")

# Show the initial bias only; the weight vector has `feats` entries.
print(b.get_value())

0.0


In [34]:
# Symbolic model: logistic regression with an L2 penalty on the weights.
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))          # probability that target = 1
prediction = p_1 > 0.5                           # thresholded prediction
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1)    # cross-entropy loss per sample
cost = xent.mean() + 0.01 * (w ** 2).sum()       # the cost to minimize
gw, gb = T.grad(cost, [w, b])                    # gradient of the cost w.r.t. w and b

# Each call to `train` returns the predictions and per-sample losses and
# moves w and b one gradient-descent step (step size 0.1) via `updates`.
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=[(w, w - 0.1 * gw), (b, b - 0.1 * gb)],
)
predict = theano.function(inputs=[x], outputs=prediction)

# Full-batch training: every step sees the whole dataset.
features, targets = D
for i in range(training_steps):
    pred, err = train(features, targets)

# Only the bias is printed; the label lines are kept so the cell's output
# is unchanged.
print("Final model:")
print(b.get_value())
print("target values for D:")
print("prediction on D:")

Final model:
0.184729948222
target values for D:
prediction on D:


## Derivatives in Theano

### Computing gradients

In [35]:
x = T.dscalar('x')
y = x ** 2
gy = T.grad(y, x) # symbolic derivative of x^2 with respect to x
pp(gy)  # show the gradient expression as built, before graph optimization

'((fill((x ** TensorConstant{2}), TensorConstant{1.0}) * TensorConstant{2}) * (x ** (TensorConstant{2} - TensorConstant{1})))'

In [36]:
# Compile the gradient expression; d(x^2)/dx evaluated at x = 4 is 8.
f = theano.function([x], gy)
f(4)

array(8.0)

In [37]:
# Check the gradient 2 * x at x = 94.2, using a floating-point tolerance.
np.allclose(f(94.2), 188.4)

True

In [38]:
x = T.dmatrix('x')
# Summing the logistic outputs gives a scalar to differentiate; its
# gradient with respect to x is the element-wise logistic derivative.
s = T.sum(1 / (1 + T.exp(-x))) 
gs = T.grad(s, x)
dlogistic = function([x], gs)
dlogistic([[0, 1], [-1, -2]])

array([[ 0.25      ,  0.19661193],
       [ 0.19661193,  0.10499359]])

### Computing the Jacobian

In [39]:
x = T.dvector('x')
y = x ** 2
# To compute the Jacobian of y with respect to x manually we use scan:
# loop over the entries of y and compute the gradient of y[i] with
# respect to x. Each iteration yields one row of the Jacobian.
J, updates = theano.scan(lambda i,y,x : T.grad(y[i], x), 
                         sequences=T.arange(y.shape[0]), 
                         non_sequences=[y,x])

f = theano.function([x], J, updates=updates)
f([4, 4])

array([[ 8.,  0.],
       [ 0.,  8.]])

### computing the Hessian

In [40]:
x = T.dvector('x')
y = x ** 2
cost = y.sum()
# The Hessian is built like the Jacobian, but from the gradient vector gy:
# scan over the entries of gy and differentiate each gy[i] with respect to x.
gy = T.grad(cost, x)
H, updates = theano.scan(lambda i, gy,x : T.grad(gy[i], x), 
                         sequences=T.arange(gy.shape[0]), 
                         non_sequences=[gy, x])
f = theano.function([x], H, updates=updates)
f([4, 4])

array([[ 2.,  0.],
       [ 0.,  2.]])

## Jacobian times a vector

### R-operator $\frac{\partial f(x)}{\partial x}v$

In [41]:
W = T.dmatrix('W')
V = T.dmatrix('V')
x = T.dvector('x')
y = T.dot(x, W)
# R-operator: the Jacobian of y with respect to W, right-multiplied by V.
# V is declared with the same type as W, the variable differentiated against.
JV = T.Rop(y, W, V)
f = theano.function([W, V, x], JV)
f([[1, 1], [1, 1]], [[2, 2], [2, 2]], [0,1])

array([ 2.,  2.])

### L-operator $v\frac{\partial f(x)}{\partial x}$

In [44]:
W = T.dmatrix('W')
v = T.dvector('v')
x = T.dvector('x')
y = T.dot(x, W)
# L-operator: v left-multiplied with the Jacobian of y with respect to W.
# v is declared with the same type as y, the expression being differentiated.
VJ = T.Lop(y, W, v)
# W is not listed as an input: the function compiles and runs with only
# v and x, so the result does not depend on W's value.
f = theano.function([v,x], VJ)
f([2, 2], [0, 1])

array([[ 0.,  0.],
       [ 2.,  2.]])

In [46]:
# Hessian times a vector works similarly, without building the full
# Hessian: differentiate the scalar sum(gy * v) with respect to x.

x = T.dvector('x')
v = T.dvector('v')
y = T.sum(x ** 2)
gy = T.grad(y, x)
vH = T.grad(T.sum(gy * v), x)
f = theano.function([x, v], vH)
f([4, 4], [2, 2])

array([ 4.,  4.])

In [47]:
# Hessian times a vector again, this time with the R-operator applied to
# the gradient instead of a second call to grad.
x = T.dvector('x')
v = T.dvector('v')
y = T.sum(x ** 2)
gy = T.grad(y, x)
Hv = T.Rop(gy, x, v) # Jacobian of the gradient (i.e. the Hessian) times v
f = theano.function([x, v], Hv)
f([4, 4], [2, 2])

array([ 4.,  4.])

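* `grad` works symbolically: it takes and returns symbolic variables.
* `grad` directly handles scalar costs only; arrays are handled through repeated application (as in the Jacobian and Hessian examples above).
* Built-in operators (`Rop`, `Lop`) efficiently compute *vector times Jacobian* and *vector times Hessian* products.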

## Conditions

### IfElse vs Switch

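* Both operate on symbolic variables.
* `IfElse`: takes a boolean condition and two variables as inputs; it is lazy, so only the selected branch is evaluated.
* `Switch`: takes a tensor condition and two variables as inputs; it is an element-wise operation, so both branches are evaluated.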

In [48]:
# Example: timing eager `switch` against lazy `ifelse`.

from theano import tensor as T
from theano.ifelse import ifelse
import theano, time, numpy

# time.clock() was removed in Python 3.8. Use perf_counter() where the
# interpreter provides it and fall back to clock() on older versions, so
# the cell runs on both.
timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock

a,b = T.scalars('a', 'b')
x,y = T.matrices('x', 'y')

# switch(cond, iftrue, iffalse): element-wise, both branches are computed
z_switch = T.switch(T.lt(a, b), T.mean(x), T.mean(y)) # lt = less than
# ifelse(cond, iftrue, iffalse): only the selected branch is needed
z_lazy = ifelse(T.lt(a, b), T.mean(x), T.mean(y))

f_switch = theano.function([a, b, x, y], z_switch,
                           mode=theano.Mode(linker='vm'))
# ifelse is only lazy under the 'vm' (or 'cvm') linker; with other linkers
# it would compute both branches, just like switch.
f_lazyifelse = theano.function([a, b, x, y], z_lazy,
                               mode=theano.Mode(linker='vm'))

val1 = 0.
val2 = 1.
big_mat1 = numpy.ones((10000, 1000))
big_mat2 = numpy.ones((10000, 1000))

n_times = 10

tic = timer()
for i in range(n_times):
    f_switch(val1, val2, big_mat1, big_mat2)
print('time spent evaluating both values %f sec' % (timer() - tic))

tic = timer()
for i in range(n_times):
    f_lazyifelse(val1, val2, big_mat1, big_mat2)
print('time spent evaluating one value %f sec' % (timer() - tic))

time spent evaluating both values 0.220927 sec
time spent evaluating one value 0.108151 sec


## Loop

### scan
* `scan` applies a function along an input sequence, producing an output at each step.
* It is slightly faster than a Python `for` loop around a compiled function.
* It can lower overall memory usage.

In [49]:
# Computing tanh(x(t).dot(W) + b) elementwise

import theano
import theano.tensor as T
import numpy as np

# defining the tensor variables
X = T.matrix("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")

# scan over X: each row v of the input matrix is passed to the lambda,
# and the per-row results are collected in `results`
results, updates = theano.scan(lambda v: T.tanh(T.dot(v, W) + b_sym), 
                               sequences=X)
compute_elementwise = theano.function(inputs=[X, W, b_sym], 
                                      outputs=results)

# test values: identity input, all-ones weights, bias [1, 2]
x = np.eye(2, dtype=theano.config.floatX)
w = np.ones((2, 2), dtype=theano.config.floatX)
b = np.ones((2), dtype=theano.config.floatX)
b[1] = 2

print(compute_elementwise(x, w, b))

# comparison with numpy: the same formula applied to the whole matrix at once
print(np.tanh(x.dot(w) + b))

[[ 0.96402758  0.99505475]
 [ 0.96402758  0.99505475]]
[[ 0.96402758  0.99505475]
 [ 0.96402758  0.99505475]]


In [50]:
# Computing the sequence x(t) = tanh(x(t - 1).dot(W) + y(t).dot(U) + p(T - t).dot(V))

import theano
import theano.tensor as T
import numpy as np

# Symbolic inputs. X is the initial state fed to the recurrence; Y and P
# are the sequences it consumes.
X = T.vector("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")
U = T.matrix("U")
Y = T.matrix("Y")
V = T.matrix("V")
P = T.matrix("P")


def recurrence_step(y, p, x_tm1):
    """One step of the recurrence: combine the previous state with y(t) and p."""
    return T.tanh(T.dot(x_tm1, W) + T.dot(y, U) + T.dot(p, V))


# Scan walks Y forwards and P backwards (P[::-1]) in lockstep; X seeds
# the recurrent state through outputs_info.
results, updates = theano.scan(recurrence_step,
                               sequences=[Y, P[::-1]],
                               outputs_info=[X])
compute_seq = theano.function(inputs=[X, W, Y, U, P, V], outputs=results)

# Test values: all-ones weights, with the first row of y and p altered.
x = np.zeros((2), dtype=theano.config.floatX)
x[1] = 1
w = np.ones((2, 2), dtype=theano.config.floatX)
u = np.ones((2, 2), dtype=theano.config.floatX)
v = np.ones((2, 2), dtype=theano.config.floatX)
y = np.ones((5, 2), dtype=theano.config.floatX)
y[0, :] = -3
p = np.ones((5, 2), dtype=theano.config.floatX)
p[0, :] = 3

print(compute_seq(x, w, y, u, p, v))

# Reference computation in plain numpy: the same recurrence, stepping
# through y forwards and p backwards.
x_res = np.zeros((5, 2), dtype=theano.config.floatX)
prev = x
for t in range(5):
    x_res[t] = np.tanh(prev.dot(w) + y[t].dot(u) + p[4 - t].dot(v))
    prev = x_res[t]
print(x_res)

[[-0.99505475 -0.99505475]
 [ 0.96471973  0.96471973]
 [ 0.99998585  0.99998585]
 [ 0.99998771  0.99998771]
 [ 1.          1.        ]]
[[-0.99505475 -0.99505475]
 [ 0.96471973  0.96471973]
 [ 0.99998585  0.99998585]
 [ 0.99998771  0.99998771]
 [ 1.          1.        ]]


### specifying exact shape

In [51]:
import theano
x = theano.tensor.matrix()
# Attach an explicit (2, 2) shape to x in the graph.
x_specify_shape = theano.tensor.specify_shape(x, (2, 2))
f = theano.function([x], (x_specify_shape ** 2).shape)
# Print the compiled graph of f to see what is left after optimization.
theano.printing.debugprint(f)

DeepCopyOp [id A] ''   0
 |TensorConstant{(2,) of 2} [id B]
