In [1]:
import os
# Restrict this process to GPU 0. This also limits which GPU TensorFlow will
# allocate memory on. It is set here, before TensorFlow is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Show the value through Python so it appears in the notebook cell output;
# os.system('echo ...') writes to the kernel's terminal, not to the notebook.
print(os.environ['CUDA_VISIBLE_DEVICES'])

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Ask TensorFlow to grow its GPU memory allocation on demand instead of
# reserving all GPU memory up front, then register the session with Keras.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.layers import (Convolution2D, Dense, Dropout, Flatten, Input,
                          MaxPooling2D, add, merge)
from keras.models import Model

<img src="./two_stream.jpg">

In [11]:
# Spatial stream: takes one 224x224 RGB frame and outputs 10 class scores.
spatial_input = Input(shape=(224, 224, 3))

# conv1: 96 filters, 7x7, followed by 2x2 max pooling
x = Convolution2D(filters=96, kernel_size=7, padding='same',
                  activation='relu', name='spatial_conv1')(spatial_input)
x = MaxPooling2D(pool_size=2)(x)

# conv2: 256 filters, 5x5, followed by 2x2 max pooling
x = Convolution2D(filters=256, kernel_size=5, padding='same',
                  activation='relu', name='spatial_conv2')(x)
x = MaxPooling2D(pool_size=2)(x)

# conv3 - conv5: three 512-filter 3x3 convolutions, pooled once after conv5
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='spatial_conv3')(x)
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='spatial_conv4')(x)
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='spatial_conv5')(x)
x = MaxPooling2D(pool_size=2)(x)

# Flatten the feature maps for the fully connected head
x = Flatten()(x)

# FC6: 4096 units; rate=0.9 drops 90% of the units during training
x = Dense(units=4096, activation='relu', name='spatial_full6')(x)
x = Dropout(rate=0.9)(x)

# FC7: 2048 units; rate=0.7 drops 70% of the units during training
x = Dense(units=2048, activation='relu', name='spatial_full7')(x)
x = Dropout(rate=0.7)(x)

# Classifier: softmax over 10 classes
y = Dense(units=10, activation='softmax')(x)

In [12]:
# Wrap the spatial-stream graph (input tensor -> softmax output) in a Keras Model
spatial_model = Model(inputs=spatial_input, outputs=y)

In [13]:
# Print the per-layer table (layer type, output shape, parameter count) of the spatial stream
spatial_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
spatial_conv1 (Conv2D)       (None, 224, 224, 96)      14208     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 112, 112, 96)      0         
_________________________________________________________________
spatial_conv2 (Conv2D)       (None, 112, 112, 256)     614656    
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 56, 56, 256)       0         
_________________________________________________________________
spatial_conv3 (Conv2D)       (None, 56, 56, 512)       1180160   
_________________________________________________________________
spatial_conv4 (Conv2D)       (None, 56, 56, 512)       2359808   
__________

In [14]:
# Temporal stream: the input is a stack of optical-flow fields.
# 10 frames are selected and each contributes two channels (u and v),
# which gives the 20 input channels.
temporal_input = Input(shape=(224, 224, 20))

# conv1: 96 filters, 7x7, followed by 2x2 max pooling
x = Convolution2D(filters=96, kernel_size=7, padding='same',
                  activation='relu', name='temporal_conv1')(temporal_input)
x = MaxPooling2D(pool_size=2)(x)

# conv2: 256 filters, 5x5, followed by 2x2 max pooling
x = Convolution2D(filters=256, kernel_size=5, padding='same',
                  activation='relu', name='temporal_conv2')(x)
x = MaxPooling2D(pool_size=2)(x)

# conv3 - conv5: three 512-filter 3x3 convolutions, pooled once after conv5
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='temporal_conv3')(x)
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='temporal_conv4')(x)
x = Convolution2D(filters=512, kernel_size=3, padding='same',
                  activation='relu', name='temporal_conv5')(x)
x = MaxPooling2D(pool_size=2)(x)

# Flatten the feature maps for the fully connected head
x = Flatten()(x)

# FC6: 4096 units; rate=0.9 drops 90% of the units during training
x = Dense(units=4096, activation='relu', name='temporal_full6')(x)
x = Dropout(rate=0.9)(x)

# FC7: 2048 units; rate=0.7 drops 70% of the units during training
x = Dense(units=2048, activation='relu', name='temporal_full7')(x)
x = Dropout(rate=0.7)(x)

# Classifier: softmax over 10 classes
y = Dense(units=10, activation='softmax')(x)

In [15]:
# Wrap the temporal-stream graph (optical-flow input -> softmax output) in a Keras Model
temporal_model = Model(inputs=temporal_input, outputs=y)

In [16]:
# Print the per-layer table (layer type, output shape, parameter count) of the temporal stream
temporal_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 224, 224, 20)      0         
_________________________________________________________________
temporal_conv1 (Conv2D)      (None, 224, 224, 96)      94176     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 112, 112, 96)      0         
_________________________________________________________________
temporal_conv2 (Conv2D)      (None, 112, 112, 256)     614656    
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 56, 56, 256)       0         
_________________________________________________________________
temporal_conv3 (Conv2D)      (None, 56, 56, 512)       1180160   
_________________________________________________________________
temporal_conv4 (Conv2D)      (None, 56, 56, 512)       2359808   
__________

In [3]:
# Two ways to fuse the two streams:
# 1. Fusion at the decision layer (not powerful)
# 2. Fusion at the middle layer  <-- implemented in this cell
#
# Each stream runs conv1-conv4 on its own input; the two conv4 feature maps are
# summed element-wise, and a shared conv5 + fully connected head classifies
# the fused features into 10 classes.

# spatial stream
spatial_input = Input((224,224,3))
# conv1
x = Convolution2D(filters=96,kernel_size=(7,7), activation='relu', 
                  padding='same', name='spatial_conv1')(spatial_input)
x = MaxPooling2D(pool_size=(2,2))(x)
# conv2
x = Convolution2D(filters=256,kernel_size=(5,5), activation='relu', 
                  padding='same', name='spatial_conv2')(x)
x = MaxPooling2D(pool_size=(2,2))(x)
# conv3
x = Convolution2D(filters=512,kernel_size=(3,3), activation='relu', 
                  padding='same', name='spatial_conv3')(x)
# conv4: tap off the spatial branch here for fusion
spatial_output = Convolution2D(filters=512,kernel_size=(3,3), activation='relu', 
                               padding='same', name='spatial_conv4')(x)

# temporal stream
# select 10 frames for optical flow, for each frame, there are two channels: u & v
temporal_input = Input((224,224,20)) # input is optical flow
# conv1
x = Convolution2D(filters=96,kernel_size=(7,7), activation='relu', 
                  padding='same', name='temporal_conv1')(temporal_input)
x = MaxPooling2D(pool_size=(2,2))(x)
# conv2
x = Convolution2D(filters=256,kernel_size=(5,5), activation='relu', 
                  padding='same', name='temporal_conv2')(x)
x = MaxPooling2D(pool_size=(2,2))(x)
# conv3
x = Convolution2D(filters=512,kernel_size=(3,3), activation='relu', 
                  padding='same', name='temporal_conv3')(x)
# conv4: tap off the temporal branch here for fusion
temporal_output = Convolution2D(filters=512,kernel_size=(3,3), activation='relu', 
                                padding='same', name='temporal_conv4')(x)

# Fuse the two branches by element-wise sum. Both branches use 'same' padding,
# two 2x2 poolings and 512 filters, so their feature maps have matching shapes.
# The Keras 2 functional `add` replaces the deprecated `merge(..., mode='sum')`,
# which emits a deprecation warning and is removed in later Keras 2 releases.
fusion_output = add([spatial_output, temporal_output])

# conv5 (shared, operates on the fused feature map)
x = Convolution2D(filters=512,kernel_size=(3,3), activation='relu', 
                  padding='same', name='conv5')(fusion_output)
x = MaxPooling2D(pool_size=(2,2))(x)
# Flatten
x = Flatten()(x)
# FC6
x = Dense(4096, activation='relu', name='full6')(x)
x = Dropout(0.9)(x) # 90% of the units will be dropped out.
# FC7
x = Dense(2048, activation='relu', name='full7')(x)
x = Dropout(0.7)(x) # 70% of the units will be dropped out.
# final layer: softmax over 10 classes
y = Dense(10, activation='softmax')(x)

  name=name)


In [4]:
# Build the two-input fused model: [RGB frame, optical-flow stack] -> class scores
model = Model(inputs=[spatial_input, temporal_input], outputs=y)

In [5]:
# Print the per-layer table of the fused model, including which layer each one is connected to
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 224, 224, 3)   0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 224, 224, 20)  0                                            
____________________________________________________________________________________________________
spatial_conv1 (Conv2D)           (None, 224, 224, 96)  14208       input_1[0][0]                    
____________________________________________________________________________________________________
temporal_conv1 (Conv2D)          (None, 224, 224, 96)  94176       input_2[0][0]                    
___________________________________________________________________________________________

In [25]:
!pip install graphviz

Collecting graphviz
  Using cached https://files.pythonhosted.org/packages/84/44/21a7fdd50841aaaef224b943f7d10df87e476e181bb926ccf859bcb53d48/graphviz-0.8.3-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.8.3
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
from keras.utils import plot_model

In [7]:
# Render the fused model's layer graph to a PNG next to the notebook,
# annotating each node with its layer name and input/output shapes.
from keras.utils import plot_model
plot_model(
    model,
    to_file='./two_stream_model.png',
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# 3. Fusion with residual

In [8]:
# import resnet50 model
from keras.applications.resnet50 import conv_block, identity_block