# Introduction to Theano

In [1]:
import theano
import theano.tensor as T
import numpy as np
import pandas as pd
import numpy.random as rd

import tempfile
import pathlib
from IPython.display import display, Image

## Three kinds of sums

### Standard Python

In [2]:
a + b # doesn't work 

NameError: name 'a' is not defined

In [3]:
a = 1
b = 2
a + b

3

### Standard Numpy

In [4]:
a = np.array([1, 0.1, 100])
b = np.array([2, 0.2, 200])

a+b

array([  3. ,   0.3, 300. ])

### Theano

In [5]:
a = T.dscalar()  # scalar of type 'd': decimal/double
b = T.dscalar()

a + b

Elemwise{add,no_inplace}.0

Theano doesn't _immediately_ do computation. It builds up a program ("computation graph"). If you provide the input values, you can execute that program.

E.g. by having Theano make a function

In [6]:
f = theano.function(inputs=[a, b], outputs=a + b)

In [7]:
f(3, 4)

array(7.)

Or by directly calling `eval()`

In [8]:
(a + b).eval(inputs_to_values={a:3, b:4})

array(7.)

Note how this was slow the first time, then fast after that? 

That's because the first time we called this, Theano compiled this into code, and all subsequent times it reused that compiled code.

#### Theano Types

In [9]:
a

<TensorType(float64, scalar)>

In [10]:
T.iscalar()

<TensorType(int32, scalar)>

In [11]:
T.scalar()

<TensorType(float64, scalar)>

#### Full list of Theano types

* **byte**: bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5, btensor6, btensor7
* **16-bit integers**: wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5, wtensor6, wtensor7
* **32-bit integers**: iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5, itensor6, itensor7
* **64-bit integers**: lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5, ltensor6, ltensor7
* **float**: fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5, ftensor6, ftensor7
* **double**: dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5, dtensor6, dtensor7
* **complex**: cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5, ctensor6, ctensor7

#### Functions with Multiple Outputs

In [12]:
f = theano.function(inputs=[a, b],
                    outputs=[a + b, a * b, a / b])

In [13]:
f(3,4)

[array(7.), array(12.), array(0.75)]

#### Functions with default values and multiple outputs

Using the `theano.In()` class

In [14]:
f = theano.function(
        inputs=[
            a, 
            theano.In(b, value=3)], 
        outputs=[a + b, a * b, a / b]
)

In [15]:
f(3,4)

[array(7.), array(12.), array(0.75)]

In [16]:
f(3)

[array(6.), array(9.), array(1.)]

#### Pretty printing

In [17]:
theano.pp((a + b))

'(<TensorType(float64, scalar)> + <TensorType(float64, scalar)>)'

In [18]:
a = T.dscalar('a')
b = T.dscalar('b')

theano.pp((a + b))

'(a + b)'

We can also show the parse tree

In [19]:
theano.printing.debugprint(a+b)

Elemwise{add,no_inplace} [id A] ''   
 |a [id B]
 |b [id C]


We can further create a graph of the parse tree with `theano.printing.pydotprint`, though you need to use an intermediary file since ultimately this uses the `dot` program on the command-line.

In [20]:
_, fname = tempfile.mkstemp(suffix='.png', prefix='a_plus_b-')

theano.printing.pydotprint(a+b, 
                           outfile=fname, 
                           print_output_file=False)

display(Image(filename=fname))
pathlib.Path(fname).unlink()

TypeError: Node() takes no arguments

## So what about speed?

In [21]:
l = list(range(10_000_000))
r = list(range(10_000_000))

In [22]:
%time res = [l + r for l,r in zip(l,r)];

CPU times: user 791 ms, sys: 126 ms, total: 917 ms
Wall time: 916 ms


In [23]:
npl = np.array(l, dtype=np.int32).reshape((len(l), 1))
npr = np.array(r, dtype=np.int32).reshape((len(l), 1))

In [24]:
%timeit -r 5 -n 10  rs = npl + npr

28.3 ms ± 14.1 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [25]:
# Just to prove this is really getting compiled down to C....
theano.config.gcc.cxxflags = "-Wno-c++11-narrowing"

In [26]:
tl = T.dmatrix('l')
tr = T.dmatrix('r')

f = theano.function(inputs=[tl, tr], outputs=tl + tr)

In [27]:
%timeit -r 5 -n 10 res = f(npl, npr)

107 ms ± 55.2 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


# Derivatives with Theano

Since Theano is actually building up mathematical expressions internally, it can compute derivatives

For example, it can figure out that $\frac{d}{dx} x^2 = 2x$

In [28]:
x = T.dscalar('x')
(x ** 2).eval(inputs_to_values={x: 4})

array(16.)

In [29]:
theano.pp(T.grad(x ** 2, wrt=x))

'((fill((x ** TensorConstant{2}), TensorConstant{1.0}) * TensorConstant{2}) * (x ** (TensorConstant{2} - TensorConstant{1})))'

This is not "optimised", but does it work?

In [30]:
T.grad(x ** 2, wrt=x).eval(inputs_to_values={x: 4})

array(8.)

It works! If we create a function from the gradient, it gets optimised

In [31]:
f = theano.function(inputs=[x], outputs=T.grad(x ** 2, wrt=x))
theano.pp(f.maker.fgraph.outputs[0])

'(TensorConstant{2.0} * x)'

In fact, _every time_ you create a function, Theano will optimise it for you, not just at the machine-level, but also using mathematical simplifications where possible

#### A complex example
Can Theano tell use how to solve linear regression?

$$\boldsymbol{y} = X \boldsymbol{w}$$

The squared error function is

$$
\begin{align*}
SSE(w) & = \sum_i (y_i - \boldsymbol{w}^\top\boldsymbol{x}_i)^2 \\
 & = (\boldsymbol{y} - X\boldsymbol{w})^\top(\boldsymbol{y} - X\boldsymbol{w})
\end{align*}
$$

It's gradient with respect to $\boldsymbol{w}$ is:

$$
2X^\top X \boldsymbol{w} -2 X^\top y = 2X^\top(X \boldsymbol{w} - \boldsymbol{y})
$$

In [32]:
X = T.dmatrix('X')
w = T.dvector('w')

y = T.dvector('y')
two = T.constant(2)

errors = (y - T.dot(X, w))
sse = T.dot(errors, errors)

In [33]:
theano.pp(sse)

'((y - (X \\dot w)) \\dot (y - (X \\dot w)))'

In [34]:
gsse = theano.function(inputs=[X, y, w], outputs=T.grad(sse, wrt=w))
theano.pp(gsse.maker.fgraph.outputs[0])

"CGemv{inplace}(AllocEmpty{dtype='float64'}(Shape_i{1}(X)), TensorConstant{1.0}, X.T, Elemwise{Composite{(-(i0 + i0))}}[(0, 0)](CGemv{no_inplace}(y, TensorConstant{-1.0}, X, w, TensorConstant{1.0})), TensorConstant{0.0})"

CGemv is one of the BLAS core functions

$$
\text{CGEMV}(\alpha, A, \boldsymbol{x}, \beta, \boldsymbol{y}) = \alpha\cdot A\boldsymbol{x} + \beta\cdot\boldsymbol{y}
$$

Which means we'd expect to see 

$$
\begin{align*}
\nabla_\boldsymbol{w} \text{SSE}(\boldsymbol{w}) = 2X^\top(X \boldsymbol{w} - \boldsymbol{y}) & =2 X^\top \left(1 \cdot X \boldsymbol{w} + -1 \cdot \boldsymbol{y}\right) \\
 & = 2 \cdot X^\top \left(1 \cdot X \boldsymbol{w} + -1 \cdot \boldsymbol{y}\right) + 0 \cdot \boldsymbol{0}
\end{align*}
$$

In [35]:
_, fname = tempfile.mkstemp(suffix='.png', prefix='gsse-')

theano.printing.pydotprint(gsse, 
                           outfile=fname, 
                           print_output_file=False)

display(Image(filename=fname))
pathlib.Path(fname).unlink()

TypeError: Node() takes no arguments

Gradient descent states that you can find the value of _any_ function $f(\boldsymbol{x}; \boldsymbol{w})$ by starting with some value and then following the negative gradient until you hit a saddle-point

$$
\boldsymbol{w} \leftarrow \boldsymbol{w} - \eta \nabla_{\boldsymbol{w}} f(\boldsymbol{x}; \boldsymbol{w})
$$

where $\eta$ is the step-size. From this it should be clear that this facility allows people to write inference engines on top of Theano that can find the solutions for any function. PyMC is one such engine.

This is the first feature that differentiates Theano from the likes of Cython. Whereas Cython helps things run fast, Theano allows you to perform symbolic mathematical computations _and_ make them run fast.

But first a little more detail

# "Shared" (i.e. Mutable) Variables with Theano

Why create a shared variable? Because

 1. You want to share it between multiple functions
 2. You want to share it between multiple calls to the same function
 
(2) is particularly handy when you're doing an iterative, mutating-in-place, training scheme like gradient descent in a neural network

The fundamental characteristic of shared variables is that they are mutable

Compare these two:

In [36]:
foo = T.dscalar('foo')
bar = theano.shared(2.0, name='bar')

In [37]:
foo.eval()  # compile error

MissingInputError: Undeclared input
 
Backtrace when that variable is created:

  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2854, in run_cell
    result = self._run_cell(
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2900, in _run_cell
    return runner(coro)
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3098, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3301, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/home/bfeeney/anaconda3/envs/pymc3-theano-presentation/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3361, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_486973/2200973877.py", line 1, in <cell line: 1>
    foo = T.dscalar('foo')


In [38]:
foo.eval(inputs_to_values={foo: 2})

array(2.)

In [39]:
bar.eval()

array(2.)

In [40]:
foo.set_value(3.0)  # compile error

AttributeError: 'TensorVariable' object has no attribute 'set_value'

In [41]:
bar.set_value(3.0)

In [42]:
bar.eval()

array(3.)

In [43]:
bar.get_value()

array(3.)

# Random Numbers

In [44]:
from theano.tensor.shared_randomstreams import RandomStreams

ModuleNotFoundError: No module named 'theano.tensor.shared_randomstreams'

In [None]:
srng = RandomStreams(seed=0xC0FFEE)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal()

In [None]:
rv_u.eval()

In [None]:
rv_n.eval()

The random number generator above only works on CPUs, to work on a GPU you need to use a special random number generator that can work within the GPU computation constratins (e.g. limited switching)

In [None]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# If-Statements

You can use `switch` or `ifelse`

In [45]:
psgn = -1 if np.random.randn() < 0 else 1
psgn

1

In [46]:
sgn = T.switch(rv_n < 0, -1, 1) 

NameError: name 'rv_n' is not defined

In [None]:
sgn.eval()

In [None]:
from theano.ifelse import ifelse

sgn = ifelse(rv_n < 0, -1, 1) 

In [None]:
sgn.eval()

So what's the difference?

 1. Laziness: 
     * `ifelse(True, a+b, a/b)` only evaluates `a+b`
     * `T.switch(True, a+b, a/b)` evaluates `a+b` and `a/b`
 2. Vectorization
     * `ifelse` can only take a _single_ boolean value as the first parameter
     * `T.switch` can take an array of booleans as the first parameter, and arrays of values for true and false cases. 


In [None]:
b = T.bvector()
trues = T.ivector()
falses = T.ivector()

In [None]:
(T.switch(b, trues, falses)).eval(inputs_to_values={
    b: np.array([True, True, False]),
    trues: np.array([1, 2, 3], dtype=np.int32),
    falses: np.array([-1, -2, -3], dtype=np.int32)
})

# Loops

See documentation on the [scan function](http://deeplearning.net/software/theano/tutorial/loop.html)

# Broadcasting

## Numpy

In numpy we use `np.newaxis` to broadcast (repeat) values across rows/columns

In [None]:
A = np.linspace(1, 9, 9).reshape((3,3))
b = np.array([2, 4, 6])

A

In [None]:
A + b

In [None]:
A + b[np.newaxis, :]

In [None]:
A + b[:, np.newaxis]

## Broadcasting

In [None]:
tA = T.dmatrix()
tb = T.dvector()
tb_col = T.dcol()
tb_row = T.row()

In [None]:
tb.broadcastable

In [None]:
tb_col.broadcastable  # True indicates it's a broadcast (repeating) column

In [None]:
tb_row.broadcastable

In [None]:
(tA + tb).eval(inputs_to_values={tA: A, tb: b})

In [None]:
(tA + tb_row).eval(inputs_to_values={tA: A, tb_row: b[np.newaxis,:]})

In [None]:
(tA + tb_col).eval(inputs_to_values={tA: A, tb_col: b[:,np.newaxis]})

# Running it on a GPU

In [47]:
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = np.random.RandomState(22)
x = theano.shared(np.asarray(rng.rand(vlen), theano.config.floatX))
f = theano.function([], T.exp(x))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()
print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
if np.any([isinstance(x.op, T.Elemwise) and
              ('Gpu' not in type(x.op).__name__)
              for x in f.maker.fgraph.toposort()]):
    print('Used the cpu')
else:
    print('Used the gpu')

[Elemwise{exp,no_inplace}(<TensorType(float64, vector)>)]
Looping 1000 times took 2.315705 seconds
Result is [1.23178032 1.61879341 1.52278065 ... 2.20771815 2.29967753 1.62323285]
Used the cpu


Doing any GPU computation on a Mac is hard, so off to Google Colab we go....