In [60]:
from keras.models import Sequential, Model
from keras import layers
from keras import Input
import numpy as np

In [2]:
# Build the sequential model first
seq_model = Sequential()
seq_model.add(layers.Dense(32, activation='relu', input_shape=(64,)))
seq_model.add(layers.Dense(32, activation='relu'))
seq_model.add(layers.Dense(10, activation='softmax'))
seq_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                330       
Total params: 3,466
Trainable params: 3,466
Non-trainable params: 0
_________________________________________________________________


In [3]:
# The same model using functional APIs
input_tensor = Input(shape=(64,))
x = layers.Dense(32, activation='relu')(input_tensor)
x = layers.Dense(32, activation='relu')(x)
output_tensor = layers.Dense(10, activation='softmax')(x)
model = Model(input_tensor, output_tensor)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 10)                330       
Total params: 3,466
Trainable params: 3,466
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Generate dummy numpy data to train on
x_train = np.random.random((1000, 64))
y_train = np.random.random((1000, 10))

# Train the model for 10 epochs
model.fit(x_train, y_train, epochs=10, batch_size=128)

# Evaluate the model
score = model.evaluate(x_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
  32/1000 [..............................] - ETA: 0s

# Multi-Input Models

## Motivating Example
A typical question-answering model has two inputs: a natural language question, and a text snippet (such as a news article) providing information to be used for answering the question. The model must then produce a answer: in the simplest possible setup, this is simply a one-word answer.

![arch](images/multi_input_arch.png)
 

In [21]:
from keras.models import Model
from keras import layers
from keras import Input

text_vocab_size = 10000
question_vocab_size = 10000
answer_vocab_size = 500

# Text input is a variable-length sequence of integers
text_input = Input(shape=(200,), dtype='int32', name='text')

# Which is embedded into a sequence of vectors of size 64
embedded_text = layers.Embedding(text_vocab_size, 64)(text_input)

# Which we encode in a single vector via LSTM
encoded_text = layers.LSTM(32)(embedded_text)

# Same process with the question
question_input = Input(shape=(200,), dtype='int32', name='question')
embedded_question = layers.Embedding(question_vocab_size, 32)(question_input)
encoded_question = layers.LSTM(16)(embedded_question)

# We then concatenate the encoded question and the encoded text
concatenated = layers.concatenate([encoded_text, encoded_question], axis=-1)

# And we add a softmax classifier on top
answer = layers.Dense(answer_vocab_size, activation='softmax')(concatenated)

# At the model instantiation we specify the two inputs and the single output
model = Model([text_input, question_input], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
text (InputLayer)                (None, 200)           0                                            
____________________________________________________________________________________________________
question (InputLayer)            (None, 200)           0                                            
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 200, 64)       640000      text[0][0]                       
____________________________________________________________________________________________________
embedding_15 (Embedding)         (None, 200, 32)       320000      question[0][0]                   
___________________________________________________________________________________________

In [22]:
num_samples = 1000
max_length = 200

# Lets generate some random numpy data
text = np.random.randint(1, text_vocab_size, size=(num_samples, max_length))
question = np.random.randint(1, question_vocab_size, size=(num_samples, max_length))

# Answers are one-hot encoded, not integers
answers = np.random.randint(0, 1, size=(num_samples, answer_vocab_size))

# Fit using a list of inputs
model.fit([text, question], answers, epochs=10, batch_size=128)

# Fit using a dict of inputs provided the inputs were named
# model.fit({'text': text, 'question': question}, answers, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13afde6a0>

# Multi-Output Models

## Motivating Example
A network that attempts to simultaneously predict different properties of the data: lets say, a network that as input a series of social media posts from one single anonymous person, and tries to predict attributes of that person such as age, gender, and income.

![arch](images/multi_output_arch.png)

In [26]:
from keras import layers
from keras import Input
from keras.models import Model

vocab_size = 50000
num_income_groups = 10

posts_input = Input(shape=(None,), dtype='int32', name='posts')
embedded_posts = layers.Embedding(vocab_size, 256)(posts_input)
x = layers.Conv1D(128, 5, activation='relu')(embedded_posts)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation='relu')(x)

# Note we are giving names to output layers
age_prediction = layers.Dense(1, name='age')(x)
income_prediction = layers.Dense(num_income_groups, activation='softmax', name='income')(x)
gender_prediction = layers.Dense(1, activation='sigmoid', name='gender')(x)

model = Model(posts_input, [age_prediction, income_prediction, gender_prediction])

# Even though we are specifying the losses for each head, when optimizing
# they all get summed up and then minimized.
model.compile(
    optimizer='rmsprop', 
    loss={
        'age': 'mse', 
        'income': 'categorical_crossentropy', 
        'gender': 'binary_crossentropy'
    }
)

# Can also specify loss weights. Otherwise the head that has the loss
# with the greatest magnitude will get optimized better in preference
# to the other heads
model.compile(
    optimizer='rmsprop', 
    loss={
        'age': 'mse', 
        'income': 'categorical_crossentropy', 
        'gender': 'binary_crossentropy'
    },
    loss_weights={
        'age': 0.25,
        'income': 1,
        'gender': 10.
    }
)

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
posts (InputLayer)               (None, None)          0                                            
____________________________________________________________________________________________________
embedding_19 (Embedding)         (None, None, 256)     12800000    posts[0][0]                      
____________________________________________________________________________________________________
conv1d_16 (Conv1D)               (None, None, 128)     163968      embedding_19[0][0]               
____________________________________________________________________________________________________
max_pooling1d_7 (MaxPooling1D)   (None, None, 128)     0           conv1d_16[0][0]                  
___________________________________________________________________________________________

In [31]:
num_samples = 100
maxlen = 200
posts = np.random.randint(1, vocab_size, size=(num_samples, maxlen))
age_targets = np.random.randint(20, 75, num_samples)
income_targets = np.random.randint(0, 1, size=(num_samples, num_income_groups))
gender_targets = np.random.randint(0, 1, num_samples)

# Can specify the targets positionally
# model.fit(posts, [age_targets, income_targets, gender_targets], epochs=10, batch_size=128)

# Or use a dict if the target layers were named in the model
model.fit(
    posts, 
    {
        'age': age_targets,
        'income': income_targets,
        'gender': gender_targets
    },
    epochs=10,
    batch_size=128
)

Epoch 1/10


InvalidArgumentError: computed output size would be negative
	 [[Node: conv1d_20/convolution/Conv2D = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/cpu:0"](conv1d_20/convolution/ExpandDims, conv1d_20/convolution/ExpandDims_1)]]

Caused by op 'conv1d_20/convolution/Conv2D', defined at:
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-d3892ccc088c>", line 16, in <module>
    x = layers.Conv1D(256, 5, activation='relu')(x)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py", line 602, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/keras/layers/convolutional.py", line 156, in call
    dilation_rate=self.dilation_rate[0])
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 3124, in conv1d
    data_format=tf_data_format)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 672, in convolution
    op=op)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 664, in op
    name=name)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 116, in _non_atrous_convolution
    name=scope)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 2013, in conv1d
    data_format=data_format)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 397, in conv2d
    data_format=data_format, name=name)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/avilay/anaconda/envs/dl/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): computed output size would be negative
	 [[Node: conv1d_20/convolution/Conv2D = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/cpu:0"](conv1d_20/convolution/ExpandDims, conv1d_20/convolution/ExpandDims_1)]]


# DAGs as Model
Inception module

![arch](images/dag_arch.png)

In [None]:
from keras import layers

# We assume the existance of a 4D input tensor 'x'

# Every branch has the same stride value (2), which is necessary to keep 
# all branch outputs the same size, for concatenation later on.
branch_a = layers.Conv2D(128, 1, activation='relu', strides=2)(x)

# In this branch, the striding occurs in the spatial convolutional layer
branch_b = layers.Conv2D(128, 1, activation='relu')(x)
branch_b = layers.Conv2D(128, 3, activation='relu', strides=2)(branch_b)

# In this branch, the striding occurs in the average pooling layer
branch_c = layers.AveragePooling2D(3, strides=2, activation='relu')(x)
branch_c = layers.Conv2D(128, 3, activation='relu')(branch_c)

branch_d = layers.Conv2D(128, 1, activation='relu')(x)
branch_d = layers.Conv2D(128, 3, activation='relu')(branch_d)
branch_d = layers.Conv2D(128, 3, activation='relu', strides=2)(branch_d)

output = layers.concatenate([branch_a, branch_b, branch_c, branch_d], axis=-1)

# Residual Connections

A residual connection simply consist of making the output of an earlier layer available as input to a later layer, effectively creating a shortcut in a sequential network. Rather than being concatenated to the later activation, the earlier output is summed with the later activation, which assumes that both activations have the same size. In case of differing sizes, use some sort of linear transformation to resize.

In [39]:
# When the layers being added are the same shape
from keras import layers

# We assume the existance of a 4D input tensor 'x'
x = Input(shape=(32, 32, 3))

# We apply some transformation to x
y = layers.Conv2D(3, 5, activation='relu', padding='same')(x)
y = layers.Conv2D(3, 5, activation='relu', padding='same')(y)
y = layers.Conv2D(3, 5, activation='relu', padding='same')(y)

# We add the original x back to the output features
y = layers.add([y, x])

model = Model(x, y)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_12 (InputLayer)            (None, 32, 32, 3)     0                                            
____________________________________________________________________________________________________
conv2d_21 (Conv2D)               (None, 32, 32, 3)     228         input_12[0][0]                   
____________________________________________________________________________________________________
conv2d_22 (Conv2D)               (None, 32, 32, 3)     228         conv2d_21[0][0]                  
____________________________________________________________________________________________________
conv2d_23 (Conv2D)               (None, 32, 32, 3)     228         conv2d_22[0][0]                  
___________________________________________________________________________________________

In [61]:
# When the layers being added are of different sizes
x = Input(shape=(32, 32, 3))

y = layers.Conv2D(3, 5, activation='relu')(x)  # y.shape = 28, 28, 3
y = layers.Conv2D(3, 5, activation='relu')(y)  # y.shape = 24, 24, 3
y = layers.Conv2D(3, 3, activation='relu')(y)  # y.shape = 22, 22, 3
y = layers.MaxPooling2D(2, strides=2)(y)       # y.shape = 11, 11, 3

# Downsample x using a 1x1 convolution
residual = layers.Conv2D(3, 1, strides=3)(x)   # residual.shape = 11, 11, 3

y = layers.add([y, residual])
model = Model(x, y)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_22 (InputLayer)            (None, 32, 32, 3)     0                                            
____________________________________________________________________________________________________
conv2d_59 (Conv2D)               (None, 28, 28, 3)     228         input_22[0][0]                   
____________________________________________________________________________________________________
conv2d_60 (Conv2D)               (None, 24, 24, 3)     228         conv2d_59[0][0]                  
____________________________________________________________________________________________________
conv2d_61 (Conv2D)               (None, 22, 22, 3)     84          conv2d_60[0][0]                  
___________________________________________________________________________________________

# Siamese Networks

The main property of these networks is that different layers share the same weights. These networks start out with two different branches and then merge later on. To look at they look similar to the mult-input models. However, the fundamental difference is that both branches share the same weights. When they are being trained, the gradients coming down both branches are added up before being subtracted from the common weights so that both the branches continue to have the same weights throughout the training.

## Motivating Examples
The main application is to learn some sort of a similarity metric (or a discriminative metric) that can distinguish between two similar but different samples. Another defining characteristic is that the samples within the pair are interchangeable, i.e., if I run sample1 on branch1 and sample2 on branch2 and get two vector representations - vector1 and vector2 which are then compared and found to be same (or different), then if I swap the samples and provide sample1 to branch2 and sample2 to branch1, it will produce vectors vector2 and vector1 which will found to have the same distance (or any other similarity metric).

### Face Verification
A Siamese network that learns to distinguish between images of people, when the two images are of the same person, the network outputs a small number, when the two images are of different people, the network outputs a big number. A concrete example is to use face scans at turnstiles instead of ID cards to allow access to a building. One way to speed up inference is to store the vector representation of all known people who are allowed access in a database. Then when a person scans their face, only the branch of the network is deployed which outputs the vector representation of this person's face. This vector representation is compared with all other vector reprsentation and the closest one (within a certain threshold) is chosen as the "owner" of that face.

### One-Shot Image Recognition
A Siamese network that is first trained on a few images, some of the same object, some of different objects. Once the network has learnt the similarity metric, it can then be given images that belong to completely different classes from anything seen in the training set, and can still discriminate between images of the same object vs. images of different objects.

### Signature Verfication
A Siamese network that is trained on some pairs of signatures that may or may not belong to the same person. The branches learn a vector reprsentation (a.k.a the metric) and the "head" or the end layers discriminate between the two signatures. 

### Semantic Similarity
A model that attempts to assess the semantic similarity between two sentances. The model would have two inputs (the two sentances to compare) and would output a score between 0 and 1 depending on whether they  are different or similar. This could be used for deduping natural language queries in a dialog system. As usual, the two sentances are interchangeable, because semantic similarity is a symmetrical relationship: the similarity of A to B is identical to the similarity between B to A.

In [None]:
from keras import layers, Input
from keras.models import Model

# Instantiate a single LSTM layer
lstm = layers.LSTM(32)

# Build the left branch of the model - inputs are variable length sequences of vectors of size 128
left_input = Input(shape=(None, 128))
left_output = lstm(left_input)

# Buildng the right branch of the model
right_input = Input(shape=(None, 128))
right_output = lstm(right_input)

merged = layers.concatenate([left_output, right_output], axis=-1)
predictions = layers.Dense(1, activation='sigmoid')(merged)

model = Model([left_input, right_input], predictions)

# When this model is learnt, the weights of the lstm layers are updated based on both inputs
model.fit([left_data, right_data], targets)

In addition to having common layers, entire models, i.e., `Model` objects themselves can also act as "layer" and can be used in a Siamese network. Here is another motivating example.

A vision model that uses a dual camera as its inputs: two parallel camersa, a few centimeters apart from each other. Such a model could be capable of preceiving depth, which can be useful in many applications. You shouldn't need two independant models for extracting visual features from the two cameras before merging the feeds. Such low-leve processing can be shared across the two inputs, i.e., done via layers that use the same weights and thus share the same representations.

In [None]:
from keras import layers, applications, Input

# Our base image processing model will be the Xception network
# But only its convolution base, not its final  classification layer.
xception_base = applications.Xception(weights=None, include_top=False)

# Inputs are 250x250 RGB images
left_input = Input(shape=(250, 250, 3))
right_input = Input(shape=(250, 250, 3))

# We call the same vision model twice
left_features = xception_base(left_input)
right_features = xception_base(right_input)

# The merged features contain information from both feeds
merged_features = layers.concatenate([left_features, right_features], axis=-1)
