In [1]:
# Material taken directly from theano tutorial

In [1]:
# All imports - run this first
import numpy as np
import theano.tensor as T
import theano
from theano import function, Param, shared
from theano.tensor.shared_randomstreams import RandomStreams
from theano.sandbox.rng_mrg import MRG_RandomStreams

# Useful functions
def floatX(z):
    """Return `z` as a numpy array whose dtype is theano.config.floatX."""
    target_dtype = theano.config.floatX
    return np.asarray(z, dtype=target_dtype)

Using gpu device 0: GeForce GTX 580


In [15]:
# Adding two scalars
# Declare two symbolic float64 scalars, build their sum and compile it.
x, y = T.dscalars('x', 'y')
z = x + y
f = function(inputs=[x, y], outputs=z)
f(2, 3)

array(5.0)

In [16]:
# Logistic function (for matrices only)
# X is a symbolic float64 matrix; the expression below is applied elementwise.
X = T.dmatrix('X')
s = 1 / (1 + T.exp(-X))
logit = function(inputs=[X], outputs=s)
logit([[1.2, -0.3], [3.4, 1.1]])

array([[ 0.76852478,  0.42555748],
       [ 0.96770454,  0.75026011]])

In [5]:
# Using default parameters
x = T.dscalar('x')
y = T.dscalar('y')
z = x + y
# Wrapping y in Param attaches a default value, so the caller may leave it out.
# NOTE(review): later Theano releases reportedly replace Param with theano.In - confirm before upgrading.
f = function(inputs=[x, Param(y, default=3)], outputs=z)
f(2)  # 'y' is not passed here, so its default of 3 is used

array(5.0)

In [6]:
# Using shared variables
state = shared(0)
inc = T.iscalar('inc')  # symbolic integer scalar
# 'updates' is a list of (shared-variable, new-expression) pairs;
# here each call to acc replaces state with state + inc.
acc = function(inputs=[inc], outputs=state, updates=[(state, state + inc)])

In [7]:
# Testing functionality: read the value currently held by the shared variable
state.get_value()

array(0)

In [8]:
# Run the accumulator with inc=1, then read the shared value back
acc(1)
state.get_value()

array(1)

In [9]:
# Run the accumulator with inc=300, then read the shared value back
acc(300)
state.get_value()

array(301)

In [11]:
# Any number of compiled functions can operate on the same 'state';
# this one subtracts inc instead of adding it.
dec = function(inputs=[inc], outputs=state, updates=[(state, state - inc)])
dec(5)
state.get_value()

array(291)

In [12]:
# The "givens" parameter of function
fn_of_state = state*2 + inc

# 'foo' stands in for the shared variable through 'givens', so it is created
# with the same dtype as 'state'.
foo = T.scalar(dtype=state.dtype)
skip_shared = function(
    inputs=[inc, foo],
    outputs=fn_of_state,
    givens=[(state, foo)],
)
skip_shared(1, 3)

array(7)

In [13]:
# skip_shared has no 'updates', so the shared value itself is unchanged
state.get_value()

array(291)

In [17]:
# Using random streams

srng = RandomStreams(seed=234)
rv_u = srng.uniform(size=(2, 2))
rv_n = srng.normal(size=(2, 2))
f = function(inputs=[], outputs=rv_u)
# no_default_updates=True: rv_n.rng is not updated by calls to g
g = function(inputs=[], outputs=rv_n, no_default_updates=True)
nearly_zeros = function(inputs=[], outputs=rv_u + rv_u - 2*rv_u)

In [19]:
# First call: returns a 2x2 sample of rv_u
f()

array([[ 0.31971415,  0.47584377],
       [ 0.24129163,  0.42046081]])

In [20]:
# Second call to the same compiled function
f() # should be different from the last call

array([[ 0.44078224,  0.26993381],
       [ 0.14317277,  0.43571539]])

In [21]:
# g samples rv_n and was compiled with no_default_updates=True
g()

array([[ 0.37328447, -0.65746672],
       [-0.36302373, -0.97484625]])

In [22]:
# Repeat the call to compare against the previous output
g() # should be the same as before, since we have "no_default_updates" set to True

array([[ 0.37328447, -0.65746672],
       [-0.36302373, -0.97484625]])

In [23]:
# Evaluates rv_u + rv_u - 2*rv_u
nearly_zeros() # despite using 'rv_u' three times, it's the same value for each occurrence

array([[ 0.,  0.],
       [ 0.,  0.]])

In [26]:
# Copying random state between theano graphs

class Graph():
    """Holds a RandomStreams instance and one symbolic uniform draw made from it."""
    def __init__(self,seed=123):
        # rng: the stream built from `seed`; y: a uniform sample of size (1,)
        stream = RandomStreams(seed)
        self.rng = stream
        self.y = stream.uniform(size=(1,))

g1 = Graph(seed=123)
f1 = function([],g1.y)

g2 = Graph(seed=987)
f2 = function([],g2.y)

print "By default, the two functions are out of sync."
print "f1() returns",f1()
print "f2() returns",f2()

def copy_random_state(g1,g2):
    """Copy the random-stream state of g1.rng into g2.rng.

    For MRG_RandomStreams the `rstate` attribute is copied as well. The value
    held by the first element of every entry in `state_updates` is then
    copied pairwise from g1 to g2.
    """
    src, dst = g1.rng, g2.rng
    if isinstance(src, MRG_RandomStreams):
        dst.rstate = src.rstate
    for src_update, dst_update in zip(src.state_updates, dst.state_updates):
        dst_update[0].set_value(src_update[0].get_value())

print "We now copy the state of the theano random number generators."
copy_random_state(g1,g2)
print "f1() returns",f1()
print "f2() retruns",f2()        

By default, the two functions are out of sync.
f1() returns [ 0.72803009]
f2() returns [ 0.55056769]
We now copy the state of the theano random number generators.
f1() returns [ 0.59044123]
f2() retruns [ 0.59044123]


In [37]:
# Matrix operations - summing over axis 0 (one total per column)
Z = T.dmatrix('Z')
f = function(inputs=[Z], outputs=Z.sum(axis=0))
f([[1.2, 0.3], [3.3, 5.1]])

array([ 4.5,  5.4])

In [38]:
# Matrix operations - summing all values in a matrix (no axis given)
Z = T.dmatrix('Z')
f = function(inputs=[Z], outputs=Z.sum())
f([[1.2, 0.3], [3.3, 5.1]])

array(9.899999999999999)

In [5]:
# Softmax of a matrix k x m, normalised along axis 0

Z = T.dmatrix("Z")
max_v = Z.max(axis=0)
# The per-column max is subtracted before exponentiating and added back
# after the log.
log_sum = T.log(T.exp(Z - max_v).sum(axis=0)) + max_v
softmax_fn = T.exp(Z - log_sum)
softmax = function(inputs=[Z], outputs=softmax_fn)
softmax(np.random.randn(4, 2))

array([[ 0.18832867,  0.04545864],
       [ 0.2801004 ,  0.25171751],
       [ 0.12795057,  0.31715648],
       [ 0.40362036,  0.38566736]])

In [5]:
# Broadcasting
# A is a float64 matrix, so b is declared float64 too. A floatX-typed
# T.vector would refuse the integer array passed below whenever floatX is
# float32, because that conversion is not allowed without an explicit downcast.
A = T.dmatrix('A')
b = T.dvector('b')
C = A + b  # the vector b is added to every row of A
f = function([A,b],C)
f(np.array([[1,2],[2,3]]),np.array([1,2]))

array([[ 2.,  4.],
       [ 3.,  5.]])

In [6]:
# Broadcasting, Part 2
# W is (10,5) and b is (10,1); b is added to the result of dot(W, x).

W = floatX(np.random.rand(10, 5))
b = floatX(np.random.rand(10, 1))
x = T.matrix('x')
z = T.dot(W, x) + b
f = function(inputs=[x], outputs=z)
X = floatX(np.random.rand(5, 2))
f(X)

array([[ 1.09562664,  1.11004694],
       [ 1.11954532,  1.48119477],
       [ 1.61607887,  2.23158286],
       [ 0.83928055,  1.31128506],
       [ 1.73383586,  2.62231855],
       [ 2.01856293,  2.22257043],
       [ 1.71298641,  2.11270827],
       [ 2.06697041,  2.29986275],
       [ 1.401576  ,  1.88210006],
       [ 0.95474975,  0.91113089]])

In [14]:
# Row-wise argmax of a matrix.
# NOTE(review): y1, y2 and b are created here but not used within this cell,
# and f is compiled but never called - confirm whether a call was removed.
y1 = floatX(np.random.rand(5, 5))
y2 = floatX(np.random.rand(5, 5))
a = T.matrix()
b = T.matrix()
f = function(inputs=[a], outputs=T.argmax(a, axis=1))

<type 'numpy.ndarray'>


In [39]:
# Flattening and concatenating
w = shared(floatX(np.random.rand(4,3)))
b = shared(floatX(np.random.rand(2,6)))
#wv = w.get_value().flatten()
v = floatX(np.array([]))
w_flat = T.concatenate((v,T.flatten(w)))
x = T.matrix()
f_flatten = theano.function(inputs=[x],outputs=T.flatten(x)+w_flat)
#f_flatten = theano.function(inputs=[x,y],outputs=T.concatenate((T.flatten(x),T.flatten(y))))
print f_flatten(b.get_value())

[ 0.36394914  1.18667072  0.59008706  1.65606705  0.91737431  1.02576937
  1.72797192  0.97047948  0.90174296  1.24189907  1.50950989  0.74876958]


In [30]:
# Hypothesis: functions written using numpy functions are equivalent to those using theano

# Two random 5x5 operands, fed to the compiled function further down.
v, w = np.random.rand(5,5), np.random.rand(5,5)

def some_function(a,b):
    try:
        print 'Trying to use a numpy method within a theano function...'
        return np.dot(a,b)
    except ValueError:
        print '...raises a ValueError. Compiling theano function...'
        return T.dot(a,b)

x = T.matrix()
y = T.matrix()
f = theano.function(inputs=[x,y],outputs=some_function(x,y))
print '...and all seems well!\n'

print f(v,w)

# Conclusion: you can't just use numpy methods en route to constructing an expression graph in theano. While the data structures
# themselves can be numpy based, the methods applied to them prior to compiling the theano 'function' MUST be from theano!

Trying to use a numpy method within a theano function...
...raises a ValueError. Compiling theano function...
...and all seems well!

[[ 0.65775892  1.54983853  1.2175288   1.70021737  1.4490416 ]
 [ 0.73635439  1.89111086  1.10033952  1.97005004  1.44693019]
 [ 0.35617496  1.4472465   1.11938475  1.42085344  1.00723842]
 [ 0.63104755  1.99118292  1.18530693  2.04810242  1.34770615]
 [ 0.52160606  1.12934379  0.63217456  1.20370576  0.95544632]]


In [57]:
# Random permutation of range(10), requested with size (1,); the stream is unseeded.
srng = RandomStreams()
f = theano.function(inputs=[], outputs=srng.permutation(size=(1,), n=10))
f()

array([[8, 7, 2, 4, 3, 1, 5, 0, 9, 6]])

In [21]:
# sum of all entries of a matrix (a 5x5 matrix of ones gives 25)
X = T.matrix('X')
f = theano.function(inputs=[X], outputs=X.sum())
f(np.ones((5, 5)))

array(25.0)

In [35]:
# Testing theories about the theano graph using two versions of "fprop"
#----------------------------------------------------------------------
def reLU(z):
    """Rectifier written as 0.5*(z + |z|): equals z where z is positive, 0 elsewhere."""
    z_plus_magnitude = z + abs(z)
    return 0.5*z_plus_magnitude

# this function keeps track of all the intermediate values..
def fprop(X,wts,bs):
    """Forward pass that stores every layer's activation and returns the last one.

    Each layer computes reLU(dot(previous, w) + b), starting from X with
    wts[0] and bs[0].
    """
    act = [reLU(T.dot(X,wts[0]) + bs[0])]
    for w,b in zip(wts[1:],bs[1:]):
        act.append(reLU(T.dot(act[-1],w) + b))
    return act[-1]

# ...while this one just computes the final activation, renaming the same variable
def fprop_v2(X,wts,bs):
    """Forward pass that keeps only the current activation, rebinding one name per layer.

    Each layer computes reLU(dot(previous, w) + b), starting from X with
    wts[0] and bs[0]; the final activation is returned.
    """
    act = reLU(T.dot(X,wts[0]) + bs[0])
    for w,b in zip(wts[1:],bs[1:]):
        act = reLU(T.dot(act,w) + b)
    return act

def cross_entropy(y,y_prob):
    """Mean over the leading axis of sum(-y * log(y_prob)) taken along axis 1."""
    per_row = T.sum(-1.0*y*T.log(y_prob),axis=1)
    return T.mean(per_row)

X = T.matrix('X')
y = T.matrix('y')

# define model parameters
wts = [theano.shared(floatX(np.random.rand(5,3))), theano.shared(floatX(np.random.rand(3,2)))]
bs =[theano.shared(floatX(np.random.rand(3,))),theano.shared(floatX(np.random.rand(2,)))]
wts2 = [theano.shared(floatX(w.get_value())) for w in wts]
bs2 =[theano.shared(floatX(b.get_value())) for b in bs]

params = [p for param in [wts,bs] for p in param]
params2 = [p for param in [wts2,bs2] for p in param]

# run fprop
act = fprop(X,wts,bs)
cost = cross_entropy(y,act)
act2 = fprop_v2(X,wts2,bs2)
cost2 = cross_entropy(y,act2)
grads = [T.grad(cost,param) for param in params]
grads2 = [T.grad(cost2,param2) for param2 in params2]

learn_rate = 0.01

# updates
updates = []
for param,grad in zip(params,grads):
    updates.append((param,param-learn_rate*grad))

    updates2 = []
for param2,grad2 in zip(params2,grads2):
    updates2.append((param2,param2-learn_rate2*grad2))
    
f = theano.function(inputs=[X,y],updates=updates)
f2 = theano.function(inputs=[X,y],updates=updates2)

X = np.random.rand(128,5)
y = np.random.rand(128,2)

for i in range(10):
    f(X,y)
    f2(X,y)

print 'First weight matrix',wts[0].get_value()
print 'First weight matrix 2',wts2[0].get_value()
print 'Second weight matrix',wts[1].get_value()
print 'Second weight matrix 2',wts2[1].get_value()
print 'First bias vector',bs[0].get_value()
print 'First bias vector 2',bs2[0].get_value()
print 'Second bias vector',bs[1].get_value()
print 'Second bias vector 2',bs2[1].get_value()

# Conclusion - it doesn't matter how you do it! apparently you can re-use variables like in fprop2, but theano is smart enough to 
# know that how to use the intermediate values to compute the gradient correctly

First weight matrix [[ 0.49290292  0.86104279  0.72448243]
 [ 0.9216381   0.87832127  0.90038202]
 [ 0.57868246  0.51204569  0.06803047]
 [ 0.05886483  0.57402348  0.98231581]
 [ 0.22575257  0.83450836  0.49191782]]
First weight matrix 2 [[ 0.4991539   0.86575163  0.72952199]
 [ 0.92874495  0.88335227  0.90574462]
 [ 0.5852498   0.51700593  0.07333998]
 [ 0.06580044  0.57895633  0.98757538]
 [ 0.2323965   0.83949635  0.49725499]]
Second weight matrix [[ 0.08045335  0.94660693]
 [ 0.32219125  0.25755443]
 [ 0.36880502  0.24883897]]
Second weight matrix 2 [[ 0.12262823  0.97303686]
 [ 0.37519848  0.29068712]
 [ 0.40800009  0.27375599]]
First bias vector [ 0.97551361  0.85460113  0.42164453]
First bias vector 2 [ 0.98962078  0.86494958  0.43270087]
Second bias vector [ 0.76316393  0.68158803]
Second bias vector 2 [ 0.78352474  0.69433626]


In [9]:
# gaussian corruption
srng = RandomStreams(seed=234)
X = T.matrix('X')
X_in = np.random.rand(3,3)
W = srng.normal(X.shape,avg=0.0,std=0.1,dtype=theano.config.floatX)
f = theano.function(inputs=[X],outputs=[X*W],allow_input_downcast=True)

print X_in
print f(X_in)


[[ 0.11628806  0.86925939  0.82625485]
 [ 0.50952383  0.9265722   0.55308302]
 [ 0.21753194  0.99640684  0.00409901]]
[array([[ -3.10584973e-03,  -1.73271507e-01,  -3.80449258e-02],
       [ -7.63091370e-02,  -1.54815465e-01,  -2.37861723e-02],
       [ -4.37300233e-03,  -1.11184537e-01,   1.13230250e-04]], dtype=float32)]


In [6]:
# salt-and-pepper noise
srng = RandomStreams(seed=234)
X = T.imatrix('X')
p = srng.binomial(X.shape,n=1,p=0.2,dtype='int32')
X_in = np.array([[1,0,1],[0,0,0],[1,0,0]])
fr = theano.function(inputs=[X],outputs=[p,T.bitwise_xor(X,p)],allow_input_downcast=True)

print 'X = ',X_in
print 'p, f(X,p) = ',fr(X_in)

X =  [[1 0 1]
 [0 0 0]
 [1 0 0]]
p, f(X,p) =  [array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 0]], dtype=int32), array([[1, 1, 1],
       [1, 0, 0],
       [1, 0, 0]], dtype=int32)]


In [6]:
# dimshuffle(0,'x') is the theano equivalent to [:,np.newaxis]
X = T.matrix('X')
f = theano.function(inputs=[X],outputs=X/T.sum(X**2,axis=1).dimshuffle(0,'x'),allow_input_downcast=True)
W = np.random.randn(10,5)
print f(W)

[[-0.11006484 -0.6218524   0.23201133  0.48564717  0.0775805 ]
 [-0.10214885 -0.09367319  0.36326545 -0.33506241  0.2087519 ]
 [ 0.16348355  0.20904741 -0.22144511 -0.08653351  0.29669082]
 [ 0.39871928  0.15851337 -0.07745841  0.08962058 -0.12501797]
 [ 0.24523439  0.09288102 -0.12199248  0.44029674  0.36417735]
 [ 0.14614885  0.0818184  -0.3689642   0.20411411 -0.26736882]
 [-0.26809549  0.07385527  0.24456078 -0.30386543  0.24661809]
 [ 0.10035934  0.16503571 -0.48407948  0.4038437   0.29841065]
 [ 0.0428631  -0.08297379 -0.05714754  0.08651479 -0.38258266]
 [-0.4535656  -0.07160203  0.23475783 -0.28408083  0.1543303 ]]


In [4]:
# Sparsity term

act = np.abs(np.random.rand(100,50))
X = T.matrix()
beta = 0.5
rho = 0.01
avg_act = T.mean(X,axis=0)
avg_act_np = np.mean(act,axis=0)
sparse_loss = beta*T.sum(rho*T.log(rho/avg_act) + (1-rho)*T.log((1-rho)/(1-avg_act)))
sparse_term = theano.function(inputs=[X],outputs=[sparse_loss],mode='FAST_RUN',allow_input_downcast=True)
print sparse_term(act)
sparse_term_np =beta*np.sum(rho*np.log(rho/avg_act_np) + (1-rho)*np.log((1-rho)/(1-avg_act_np)))
print sparse_term_np


[array(16.057996924982266)]
16.057996925


In [6]:
# Using tensor.clip
X = T.vector()
clip_value = theano.function(inputs=[X],outputs=[T.clip(X,1e-5,1-1e-5)],allow_input_downcast=True)
# A = np.asarray([0,0.5,1],dtype='float32')
A = np.array([0,0.5,1])
B = clip_value(A)
print B

[array([  9.99999975e-06,   5.00000000e-01,   9.99989986e-01], dtype=float32)]
