# TensorFlow

In [None]:
import tensorflow as tf

import numpy as np

from sklearn import datasets

from load_cifar import load_cifar

TensorFlow is a deep learning framework brought to you by Google. It allows you to build computational graphs from tensors and operations on them and then helps those _tensors flow_.

As in PyTorch, we'll start with the Iris dataset.

In [None]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# NOTE: we train on the whole dataset, with no held-out split. Don't ever do
# that in real work - but for illustrating behaviour it's good enough!
X_train, y_train = iris['data'], iris['target']

A computational graph is made of:
* placeholders (inputs to the graph)
* variables
* operations on them and their results

In [None]:
# Of course, you can also define your own operations - TensorFlow's syntax is in many ways similar to NumPy's.
def relu(activation):
    """Apply the rectified linear unit element-wise: keep positive values, zero the rest.

    Args:
        activation: a numeric tensor of pre-activation values.

    Returns:
        A tensor with the same shape and dtype as `activation`.
    """
    # Select instead of multiplying by a 0/1 mask: the result keeps the input
    # dtype (no hard-coded float32), and -inf maps to 0 rather than NaN
    # (-inf * 0 is NaN). Non-positive entries still receive a zero gradient.
    return tf.where(activation > 0, activation, tf.zeros_like(activation))

In [None]:
# Layer sizes: input features, hidden units, output classes.
D_in, H, D_out = 4, 10, 3

# Placeholders are the inputs of the graph; a None dimension leaves the batch size open.
X = tf.placeholder(tf.float32, shape=(None, D_in))
# Mind the trailing comma: `(None,)` is a rank-1 shape, whereas `(None)` is just
# `None` and would leave even the rank of the labels unknown.
y = tf.placeholder(tf.int32, shape=(None,))

# One-hot encoded labels with D_out columns.
L = tf.one_hot(y, D_out)
W1 = tf.Variable(tf.random_uniform((D_in, H)))
W2 = tf.Variable(tf.random_uniform((H, D_out)))

# and our graph is built here:
y_pred = relu(X @ W1) @ W2

print(L.shape)

# y_pred holds unnormalized logits; the softmax is applied inside the loss op.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=L, logits=y_pred))


# This is a more explicit, but also highly impractical, way of performing
# gradient descent: ask for the gradients and write the update rule by hand.
grad_W1, grad_W2 = tf.gradients(loss, [W1, W2])

lr = 1e-2

new_W1 = W1.assign(W1 - lr * grad_W1)
new_W2 = W2.assign(W2 - lr * grad_W2)
# Running `updates` executes both assignments in one go.
updates = tf.group(new_W1, new_W2)


In TensorFlow, as opposed to PyTorch, graphs are constructed statically - which means you need to define them up front in your code and cannot change them later.

An interesting thing to note is that a graph can have many inputs and many outputs, such as `y_pred` and `loss` here

Another important thing to know is that nothing has been calculated or initialized yet!

How does the training go?

In [None]:
# Values fed into the graph's placeholders on every run.
train_dict = {X: X_train, y: y_train}

num_iterations = 500

# allow_soft_placement lets TensorFlow fall back to an available device (e.g.
# the CPU) when the requested "/gpu:0" does not exist, instead of raising.
session_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=session_config) as sess:
    # tf.device only applies to ops created inside this block (here: the
    # initializer); ops created before it keep their own placement.
    with tf.device("/gpu:0"): #"/cpu:0" or "/gpu:0"
        # Nothing holds a value until the variables are explicitly initialized.
        tf.global_variables_initializer().run()

        for i in range(num_iterations):
            # One run fetches the loss and applies the hand-written update step.
            loss_val, _ = sess.run([loss, updates], feed_dict=train_dict)
            if i % 50 == 0:
                print(loss_val)
        

Let's train again, but without performing gradient descent manually.

In [None]:
# Let a built-in optimizer create the gradient and update ops for us.
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# Values fed into the graph's placeholders on every run.
train_dict = {X: X_train, y: y_train}

num_iterations = 500

# allow_soft_placement lets TensorFlow fall back to an available device (e.g.
# the CPU) when the requested "/gpu:0" does not exist, instead of raising.
session_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=session_config) as sess:
    # tf.device only applies to ops created inside this block (here: the
    # initializer); ops created before it keep their own placement.
    with tf.device("/gpu:0"): #"/cpu:0" or "/gpu:0"
        tf.global_variables_initializer().run()
        for i in range(num_iterations):
            train_step.run(feed_dict=train_dict)
            # Evaluating the loss is an extra forward pass, so only do it on
            # the iterations whose value is actually printed.
            if i % 50 == 0:
                loss_val = loss.eval(feed_dict=train_dict)
                print(loss_val)

## As for CIFAR-10

Let's now try to train a network on a more serious dataset!

In [None]:
# Fetch the CIFAR-10 train/test splits through the project's helper.
X_train, y_train, X_test, y_test = load_cifar()

# View both image arrays as (num_images, 32, 32, 3).
# NOTE(review): this assumes load_cifar returns pixels already laid out in
# height-width-channel order; if each row is channel-major, a plain reshape
# would scramble the images - confirm against load_cifar.
X_train, X_test = (images.reshape(-1, 32, 32, 3) for images in (X_train, X_test))

TensorFlow provides implementations of layers in its `layers` module. However, as opposed to PyTorch, it doesn't have one single way to minimize the amount of written code when creating the model.

Some of the high-level wrappers include:

* `tf.layers`
* `TFLearn`
* `Estimator API`
* `Pretty Tensor`
* `Keras`

First, let's take a look at the Estimators

In [None]:
def my_cnn(features, labels, mode):
    """Model function for `tf.estimator.Estimator`: a CNN classifier with 10 output classes.

    Architecture: three stacks of three 5x5 'same'-padded ReLU convolutions
    (32, 64 and 128 filters), with 2x2 max pooling after the first two stacks
    and batch normalization after every stack, followed by dense layers of
    2048 and 1024 units (each batch-normalized) and a 10-unit logits layer.
    All convolution and dense kernels carry an L2 regularizer (scale 0.01).

    Args:
        features: dict whose "x" entry is the input batch; it is cast to
            float32. Assumed to be NHWC images, since the 2-D convolutions
            need a 4-D input.
        labels: class indices, one-hot encoded here with depth 10. Not used in
            PREDICT mode.
        mode: one of `tf.estimator.ModeKeys`.

    Returns:
        A `tf.estimator.EstimatorSpec` carrying:
        * PREDICT - the predictions dict ("classes" and "probabilities");
        * TRAIN   - the loss and the training op;
        * otherwise (evaluation) - the loss and an "accuracy" metric.
    """
    # Batch normalization must normalize with batch statistics (and update its
    # moving averages) only while training; otherwise it uses the averages.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    X = tf.cast(features["x"], tf.float32)
    reg = tf.contrib.layers.l2_regularizer(scale=0.01)

    def conv(inputs, filters):
        # 5x5 convolution, ReLU, L2-regularized kernel, spatial size preserved.
        return tf.layers.conv2d(inputs, filters, 5, activation=tf.nn.relu,
                                kernel_regularizer=reg, padding='same')

    def batch_norm(inputs):
        # Without `training=...` the layer would always run in inference mode.
        return tf.layers.batch_normalization(inputs, training=is_training)

    # convolutional layers
    # 3 - 5 - 7  (NOTE(review): meaning of this original note is unclear - confirm)
    h1 = conv(X, 32)
    h1_1 = conv(h1, 32)
    h1_2 = conv(h1_1, 32)

    h1_pool = tf.layers.max_pooling2d(h1_2, (2, 2), (2, 2))
    h1_batchnorm = batch_norm(h1_pool)

    h2 = conv(h1_batchnorm, 64)
    h2_1 = conv(h2, 64)
    h2_2 = conv(h2_1, 64)

    h2_pool = tf.layers.max_pooling2d(h2_2, (2, 2), (2, 2))
    h2_batchnorm = batch_norm(h2_pool)

    h3 = conv(h2_batchnorm, 128)
    h3_1 = conv(h3, 128)
    h3_2 = conv(h3_1, 128)
    h3_batchnorm = batch_norm(h3_2)

    # dense layers
    d0 = tf.layers.flatten(h3_batchnorm)

    d1 = tf.layers.dense(d0, 2048, activation=tf.nn.relu, kernel_regularizer=reg)
    d1_batchnorm = batch_norm(d1)

    d2 = tf.layers.dense(d1_batchnorm, 1024, activation=tf.nn.relu, kernel_regularizer=reg)
    d2_batchnorm = batch_norm(d2)

    # Final layer has no activation: it produces the logits.
    d3 = tf.layers.dense(d2_batchnorm, 10, kernel_regularizer=reg)

    y_out = d3

    predictions = {
        "classes": tf.argmax(input=y_out, axis=1),
        "probabilities": tf.nn.softmax(y_out, name="softmax_tensor")
    }

    # Prediction needs neither labels nor a loss, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Total loss = L2 penalties collected from the layers + cross-entropy.
    l2_loss = tf.losses.get_regularization_loss()
    onehot_labels = tf.one_hot(indices=labels, depth=10)
    softmax_loss = tf.losses.softmax_cross_entropy(logits=y_out, onehot_labels=onehot_labels)
    loss = l2_loss + softmax_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()
        start_lr = 3e-4
        # Multiply the learning rate by 0.9 every 500 steps (staircase decay).
        lr = tf.train.exponential_decay(start_lr, global_step, 500, 0.9, staircase=True)
        optimizer = tf.train.AdamOptimizer(lr)
        # The moving-average updates of batch normalization live in UPDATE_OPS
        # and are not run unless the train op explicitly depends on them.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss=loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    eval_metric = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric)

For the estimator to work, we must provide input functions for it:

In [None]:
# Training input: shuffled batches of 256. With num_epochs=None the function
# keeps returning data for as long as we want it to, which suits training.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": X_train}, y=y_train,
    batch_size=256, num_epochs=None, shuffle=True,
)

# Test input: a single, unshuffled pass over the test split
# (no batch_size given, so the library default is used).
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": X_test}, y=y_test,
    num_epochs=1, shuffle=False,
)

# Same data as train_input_fn, but limited to one epoch so that an evaluation
# over the training split terminates.
train_test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": X_train}, y=y_train,
    batch_size=256, num_epochs=1, shuffle=True,
)

In [None]:
# Wrap my_cnn in an Estimator. model_dir is where the Estimator keeps its
# checkpoints, so re-running with the same directory picks up existing state.
model = tf.estimator.Estimator(model_fn=my_cnn, model_dir='/tmp/my_cnn')

In [None]:
# Display the shape of the training image array (the cell echoes its last expression).
X_train.shape

In [None]:
# Run 4000 training steps, drawing batches from train_input_fn.
model.train(input_fn=train_input_fn, steps=4000)

In [None]:
# Evaluate on the test split first, then on one pass over the training split.
test_results = model.evaluate(input_fn=test_input_fn)
train_results = model.evaluate(input_fn=train_test_input_fn)

# Report the training metrics before the test metrics.
for split_name, split_metrics in (('train', train_results), ('test', test_results)):
    print(split_name, split_metrics)


## That's a lot of boilerplate code! Can it be simplified?

In [None]:
from tensorflow import keras

Yup.

Keras lets you define and train models much more simply. It uses Tensorflow (or Theano if you want, but Theano is dead) as a backend.

In [None]:
# Three blocks of three 5x5 'same'-padded ReLU convolutions (32, 64 and 128
# filters), each followed by 2x2 max pooling and batch normalization, then a
# 10-way softmax classifier on the flattened features.
model = keras.models.Sequential([
    # Only the first layer needs input_shape; every later shape is inferred.
    keras.layers.Conv2D(32, 5, activation='relu', padding='same', input_shape=(32, 32, 3)),
    keras.layers.Conv2D(32, 5, activation='relu', padding='same'),
    keras.layers.Conv2D(32, 5, activation='relu', padding='same'),

    keras.layers.MaxPooling2D(2, 2),
    keras.layers.BatchNormalization(),

    keras.layers.Conv2D(64, 5, activation='relu', padding='same'),
    keras.layers.Conv2D(64, 5, activation='relu', padding='same'),
    keras.layers.Conv2D(64, 5, activation='relu', padding='same'),

    keras.layers.MaxPooling2D(2, 2),
    keras.layers.BatchNormalization(),

    keras.layers.Conv2D(128, 5, activation='relu', padding='same'),
    keras.layers.Conv2D(128, 5, activation='relu', padding='same'),
    keras.layers.Conv2D(128, 5, activation='relu', padding='same'),

    keras.layers.MaxPooling2D(2, 2),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),

    # categorical_crossentropy expects a probability distribution over the
    # classes, so the output layer must be a softmax. A relu output is not
    # normalized and can be zero for every class.
    keras.layers.Dense(10, activation='softmax')
])
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

Training is much easier to call and also more verbose

In [None]:
# The model is compiled with categorical cross-entropy, so turn the integer
# class labels into one-hot rows with 10 columns before fitting.
y_train_onehot = keras.utils.to_categorical(y_train, num_classes=10)

model.fit(x=X_train, y=y_train_onehot, epochs=10)

In [None]:
# One-hot encode the test labels the same way as the training labels, then
# compute the loss and the compiled metrics on the test split.
y_test_onehot = keras.utils.to_categorical(y_test, num_classes=10)

model.evaluate(x=X_test, y=y_test_onehot)

As you can see, on the outside the philosophy is similar to PyTorch. However, computational graphs here are still static - you cannot edit them that easily after creation!

Funnily enough, implementing your own layers in Keras is similar to PyTorch, with one exception - for shape inference to work, you need to implement the function which will, well, infer it :P

Read more here!

https://keras.io/layers/writing-your-own-keras-layers/