In [1]:
import glob
import pickle
import os
import numpy as np
from threading import Thread
from multiprocessing import Process
from time import sleep
from keras.models import load_model
import tensorflow as tf

Using TensorFlow backend.


In [2]:
def read_from_file(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

In [3]:
def read_train(path):
    train_data = []
    labels = []
    for file in glob.glob(os.path.join(path, 'data_batch*')):
        data = read_from_file(file)
        labels.append(data[b'labels'])
        train_data.append(data[b'data'])
    
    train_data = np.concatenate(train_data)
    labels = np.concatenate(labels)
    
    return train_data, labels

In [4]:
x_train, y_train = read_train('Cifar10')
y_labels = np.eye(10)[y_train]
x_train = x_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

In [5]:
import math
def split_in_batches(data, batch_size):
    ret = []
    start = 0
    end = 0
    num_batches = math.ceil(data.shape[0] / batch_size)
    for batch_num in range(num_batches):
        end += batch_size
        if end > data.shape[0]:  #last batch
            ret.append(data[-batch_size:])
        else:
            ret.append(data[start:end])
        start = end
    return np.array(ret)

In [6]:
batches = split_in_batches(x_train, 1024)

Loads the two models from a previously trained model. 

Note that this is the only usage of Keras (for building and training the net). Keras uses tensors to build its net, so once we get the last tensor there would be no difference between keras and a pure tensorflow graph.

In [7]:
with tf.device('/gpu:0'):
    model_gpu = load_model('model1.h5')
    x_gpu = tf.placeholder(tf.float32, shape=(None, 32,32,3))
with tf.device('/cpu:0'):
    model_cpu = load_model('model1.h5')
    x_cpu = tf.placeholder(tf.float32, shape=(None, 32,32,3))

Use the same example as defined in the tensorflow tutorial but placing one CPU and one GPU instead of two GPUs

In [8]:
model = load_model('model1.h5')
x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))

In [9]:
c_2 = []   # Concat the result of the two tensor but with the same input tensor
for dev in ['/gpu:0', '/cpu:0']:
    with tf.device(dev):
        c_2.append(model(x))
with tf.device('/cpu:0'):
    predictions_2 = tf.concat(c_2, 0)
    

We thought that maybe one bottleneck was the placement of the placeholder. So we also tried to instantiate two different placeholders for the two models, each of them is declared in CPU and GPU respectively

In [10]:
c = []   # Concat the result of the two tensors with diffent input tensor
with tf.device('/gpu:0'):
    c.append(model_gpu(x_gpu))
with tf.device('/cpu:0'):
    c.append(model_cpu(x_cpu))

with tf.device('/cpu:0'):
    predictions = tf.concat(c, 0)

Initialize a session

In [11]:
# Initialize all variables
session = tf.Session(config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
session.run(tf.global_variables_initializer())

As explained in the tensorflow tutorial, this tensor (predictions_2) should automatically split the workload across both the devices, if the devices were both gpu cards. Here we put one GPU and one CPU.

In [12]:
%%timeit -n 10
for b in batches:
    session.run(predictions_2, feed_dict = {x : b})

2.21 s ± 71.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


If the workload was divided equally between CPU and GPU, I would expect CPU usage to be 100%. Instead for this run it is fixed at about 30%

Here we wanted to try out our model with the different placeholders.

In [13]:
# Run all on the GPU
%timeit -n 10 ret = session.run(predictions, feed_dict={x_cpu : x_train[:1], x_gpu : x_train[1:1024]})

57.2 ms ± 5.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Run 50% on CPU and 50% on GPU
%timeit -n 10 ret = session.run(predictions, feed_dict={x_cpu : x_train[:512], x_gpu : x_train[512:1024]})

276 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Run 100% on CPU
%timeit -n 10 ret = session.run(predictions, feed_dict={x_cpu : x_train[1:1024], x_gpu : x_train[:1]})

558 ms ± 6.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Run most on GPU, few instances on CPU
%timeit -n 10 ret = session.run(predictions, feed_dict={x_cpu : x_train[:2], x_gpu : x_train[2:1024]})

62.7 ms ± 9.89 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Here the time of the last run, with only one more batch for CPU, I would expect it to be less than the time of using only the GPU (here the GPU is clearly the bottleneck)

## Using thread alternative

In [43]:
def predict_on_device(session, predict_tensor, batches):
    #session = tf.Session()
    #session.run(tf.global_variables_initializer())
    for batch in batches:
        session.run(predict_tensor, feed_dict={x : batch})
        

In [44]:
def split_cpu_gpu(batches, num_batches_cpu, tensor_cpu, tensor_gpu):
    session = tf.Session(config=tf.ConfigProto(log_device_placement=True, intra_op_parallelism_threads=8))
    session.run(tf.global_variables_initializer())
    
    coord = tf.train.Coordinator()
    
    threads = []
    #threads += [Thread(target=predict_on_device, args=(session, tensor_cpu, batches))]
    threads += [Thread(target=predict_on_device, args=(session, tensor_gpu, batches))]
    
    for t in threads:
        t.start()
    
    coord.join(threads)
    
    session.close()
    

Reload all the two models

In [46]:
with tf.device('/gpu:0'):
    x = tf.placeholder(tf.float32, shape=(None, 32,32,3))
    model_gpu = load_model('model1.h5')
    tensor_gpu = model_gpu(x)
    
with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, shape=(None, 32,32,3))
    model_cpu = load_model('model1.h5')
    tensor_cpu = model_cpu(x)


Run all the batches, first only on GPU, then 20 on CPU and other in GPU and finally 40 on CPU and the other in GPU

In [45]:
%%time
split_cpu_gpu(batches[:10], 0, tensor_cpu, tensor_gpu)

Exception in thread Thread-42:
Traceback (most recent call last):
  File "c:\users\andre\anaconda3\envs\mlenv\lib\site-packages\tensorflow\python\client\session.py", line 1022, in _do_call
    return fn(*args)
  File "c:\users\andre\anaconda3\envs\mlenv\lib\site-packages\tensorflow\python\client\session.py", line 1004, in _run_fn
    status, run_metadata)
  File "c:\users\andre\anaconda3\envs\mlenv\lib\contextlib.py", line 66, in __exit__
    next(self.gen)
  File "c:\users\andre\anaconda3\envs\mlenv\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder_285' with dtype float
	 [[Node: Placeholder_285 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
	 [[Node: sequential_14/dense_2/Softmax/_9 = _Recv[client_terminated=false, recv_d

Wall time: 6.86 s


In [21]:
%%timeit -c 10
split_cpu_gpu(batches, 20, tensor_cpu, tensor_gpu)

14 s ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit -c 10
split_cpu_gpu(batches, 40, tensor_cpu, tensor_gpu)

24.2 s ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Notice that this 'thread' thing takes up much more time than the one did before (around 2 sec). I honestly can't explain this difference (maybe the overhead for creating the thread is relevant, but it doesn't explain the whole 2 seconds delay, because there shouldn't be any conflicts among the two threads (all the variables are different)

# Prediction only on GPU

In [29]:
with tf.device('/gpu:0'):
    pred_only_gpu = model_gpu(x_gpu)
session = tf.Session(config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
session.run(tf.global_variables_initializer())

In [30]:
%%timeit -n 10
for b in batches:
    session.run(pred_only_gpu, feed_dict = {x_gpu : b})

2.2 s ± 7.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
session.close()

To compare the previous results, running the prediction on only the GPU takes only 2.2 seconds, with is less than all the other possibilities we have tried so far..