In [155]:
import glob
import pickle
import os
import numpy as np
from threading import Thread
from multiprocessing import Process
from time import sleep

In [2]:
def read_from_file(file):
    """Unpickle and return the object stored in `file`.

    The file is opened in binary mode and decoded with ``encoding='bytes'``,
    so string keys written by Python 2 come back as ``bytes`` (e.g. ``b'data'``).

    SECURITY: ``pickle.load`` can execute arbitrary code; only call this on
    files from a trusted source.

    Parameters
    ----------
    file : str or path-like
        Path of the pickled file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    with open(file, 'rb') as pickled:
        return pickle.load(pickled, encoding='bytes')

In [3]:
def read_train(path):
    """Load and stack every training batch file found directly under `path`.

    Each file whose name matches ``data_batch*`` is read with
    `read_from_file`; its ``b'data'`` and ``b'labels'`` entries are collected
    and concatenated along the first axis.

    Parameters
    ----------
    path : str
        Directory that contains the pickled ``data_batch*`` files.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(train_data, labels)``, concatenated in sorted file-name order.

    Raises
    ------
    ValueError
        If no ``data_batch*`` file exists under `path`.
    """
    # glob.glob yields matches in arbitrary (filesystem-dependent) order;
    # sorting makes the sample order reproducible across machines and runs.
    batch_files = sorted(glob.glob(os.path.join(path, 'data_batch*')))
    if not batch_files:
        # np.concatenate([]) would fail with an unrelated-looking message.
        raise ValueError("no 'data_batch*' files found in {!r}".format(path))

    train_data = []
    labels = []
    for file in batch_files:
        data = read_from_file(file)
        labels.append(data[b'labels'])
        train_data.append(data[b'data'])

    train_data = np.concatenate(train_data)
    labels = np.concatenate(labels)

    return train_data, labels

In [4]:
# Read all training batches stored in the local 'Cifar10' directory.
x_train, y_train = read_train('Cifar10')
# One-hot encode the labels: row k of the 10x10 identity matrix encodes label k.
y_labels = np.eye(10)[y_train]
# View each flat row as (3, 32, 32), then move the size-3 axis last -> (N, 32, 32, 3).
x_train = np.moveaxis(x_train.reshape(-1, 3, 32, 32), 1, -1)

In [5]:
from keras.models import load_model

Using TensorFlow backend.


In [6]:
import tensorflow as tf

In [51]:
# Load the saved Keras model from 'model1.h5' (no explicit device scope here).
model = load_model('model1.h5')

In [46]:
# Load two separate copies of the same saved model, one inside a '/gpu:0'
# device scope and one inside a '/cpu:0' device scope.
# NOTE(review): presumably this pins each copy to that device — confirm
# the actual placement with log_device_placement.
with tf.device('/gpu:0'):
    model_gpu = load_model('model1.h5')
with tf.device('/cpu:0'):
    model_cpu = load_model('model1.h5')

In [52]:
# Input placeholder fed via feed_dict: float32, shape (batch, 32, 32, 3) with an open batch dimension.
x = tf.placeholder(tf.float32, shape=(None, 32,32,3))

In [48]:
# Build one output tensor per model copy, each under its own device scope,
# then concatenate both outputs along axis 0 under the CPU scope.
# NOTE(review): both towers are applied to the same full placeholder `x`, so
# `predictions` holds the output for every input twice (once per tower); the
# batch is not split between the devices.
c = []
with tf.device('/gpu:0'):
    c.append(model_gpu(x))
with tf.device('/cpu:0'):
    c.append(model_cpu(x))

with tf.device('/cpu:0'):
    predictions = tf.concat(c, 0)

In [53]:
# Apply the single `model` once per listed device and concatenate the outputs
# along axis 0 under the CPU scope. This rebinds `c` and `predictions`, so it
# replaces whatever graph an earlier cell stored under those names.
c = []
for dev in ['/gpu:0']: # only the GPU is listed; append e.g. '/cpu:0' to add another tower
    with tf.device(dev):
        c.append(model(x))
with tf.device('/cpu:0'):
    predictions = tf.concat(c, 0)
    

In [54]:
# Reuse the TensorFlow session that Keras already owns.
# `load_model` restores the weights inside Keras's own session. A brand-new
# `tf.Session()` followed by `tf.global_variables_initializer()` would instead
# run the model on freshly initialised (not the saved) weights, so the
# predictions would be meaningless even though the timings look plausible.
# K.get_session() also takes care of initialising any still-uninitialised variables.
from keras import backend as K

session = K.get_session()

In [35]:
# Time session.run on 1024 images (x_train[1000:2024]), 100 loops per run.
# NOTE(review): this identical cell appears several times with very different results;
# which graph `predictions` referred to here depends on the (out-of-order) cell
# execution history — record the configuration next to the timing.
%timeit -n 100 ret = session.run(predictions, feed_dict={x : x_train[1000:2024]})

59.3 ms ± 688 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
# Same measurement as before: 1024 images (x_train[1000:2024]), 100 loops per run.
# NOTE(review): roughly 9x slower than the first run of this cell; the graph behind
# `predictions` at this point cannot be determined from the notebook — TODO document it.
%timeit -n 100 ret = session.run(predictions, feed_dict={x : x_train[1000:2024]})

543 ms ± 8.3 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [50]:
# Same measurement: 1024 images (x_train[1000:2024]), 100 loops per run.
# NOTE(review): judging by the execution counts this presumably timed the graph that
# concatenates the '/gpu:0' and '/cpu:0' towers — confirm.
%timeit -n 100 ret = session.run(predictions, feed_dict={x : x_train[1000:2024]})

544 ms ± 9.61 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [55]:
# Same measurement: 1024 images (x_train[1000:2024]), 100 loops per run.
# NOTE(review): judging by the execution counts this presumably timed the graph built
# from the single `model` on '/gpu:0' only — confirm.
%timeit -n 100 ret = session.run(predictions, feed_dict={x : x_train[1000:2024]})

58.1 ms ± 865 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [97]:
import math
def split_in_batches(data, batch_size):
    """Cut `data` along its first axis into batches of `batch_size` rows.

    When the number of rows is not a multiple of `batch_size`, the final
    batch is taken as the *last* `batch_size` rows, so it overlaps the
    previous batch instead of being shorter. As long as `data` has at least
    `batch_size` rows, every batch therefore has the same shape and the
    result stacks into a regular array.

    Parameters
    ----------
    data : np.ndarray
        Array to split along axis 0.
    batch_size : int
        Number of rows per batch.

    Returns
    -------
    np.ndarray
        The batches stacked with ``np.array``; ``ceil(len(data) / batch_size)``
        entries along the first axis.
    """
    total_rows = data.shape[0]
    batch_count = math.ceil(total_rows / batch_size)

    chunks = []
    for index in range(batch_count):
        lo = index * batch_size
        hi = lo + batch_size
        # Running past the end: reuse the trailing rows to keep the batch full-sized.
        chunk = data[lo:hi] if hi <= total_rows else data[-batch_size:]
        chunks.append(chunk)
    return np.array(chunks)

In [165]:
def predict_on_device(session, predict_tensor, batches, input_tensor=None):
    """Run `predict_tensor` once per batch in the given session.

    The outputs are discarded; the function exists to generate load for
    timing purposes.

    Parameters
    ----------
    session : tf.Session
        Session in which the tensor is evaluated.
    predict_tensor : tf.Tensor
        Tensor to fetch for every batch.
    batches : iterable
        Batches fed one at a time to the input placeholder.
    input_tensor : tf.Tensor, optional
        Placeholder that receives each batch. Defaults to the notebook-level
        placeholder ``x``, which keeps existing three-argument calls working.

    Returns
    -------
    None
    """
    if input_tensor is None:
        # Backward-compatible fallback to the global placeholder defined in the notebook.
        input_tensor = x
    for batch in batches:
        session.run(predict_tensor, feed_dict={input_tensor: batch})
        

In [166]:
def split_cpu_gpu(batches, num_batches_cpu, tensor_cpu, tensor_gpu):
    """Evaluate `batches` with two worker threads, one per output tensor.

    The first `num_batches_cpu` batches are run against `tensor_cpu` and the
    remaining ones against `tensor_gpu`. Each worker uses its own session and
    calls `predict_on_device`. Results are discarded.

    Parameters
    ----------
    batches : sequence
        Batches to process; must support slicing.
    num_batches_cpu : int
        Number of leading batches handed to the `tensor_cpu` worker.
    tensor_cpu, tensor_gpu : tf.Tensor
        Output tensors evaluated by the first and second worker respectively.

    Raises
    ------
    Exception
        The first exception raised inside a worker thread is re-raised here
        after both workers have finished and both sessions are closed.

    Notes
    -----
    NOTE(review): each session is brand new and initialised with
    ``tf.global_variables_initializer()``, so it presumably does not see the
    weights restored by ``load_model`` — acceptable for timing, not for real
    predictions. Confirm before using the outputs.
    """
    sessions = []
    errors = []

    def _worker(session, tensor, chunk):
        # An exception inside a Thread never reaches the caller on its own;
        # collect it so a failed run is not mistaken for a fast one.
        try:
            predict_on_device(session, tensor, chunk)
        except Exception as exc:
            errors.append(exc)

    try:
        for _ in range(2):
            session = tf.Session(config=tf.ConfigProto(log_device_placement=True))
            sessions.append(session)  # registered first so it is closed even if init fails
            session.run(tf.global_variables_initializer())
        session_cpu, session_gpu = sessions

        workers = [
            Thread(target=_worker, args=(session_cpu, tensor_cpu, batches[:num_batches_cpu])),
            Thread(target=_worker, args=(session_gpu, tensor_gpu, batches[num_batches_cpu:])),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    finally:
        # Always release session resources, including on error.
        for session in sessions:
            session.close()

    if errors:
        raise errors[0]
    

In [157]:
# Load one copy of the saved model per device scope and build its output
# tensor on the shared placeholder `x` inside that same scope.
# NOTE(review): this rebinds `model_gpu` / `model_cpu`, which an earlier cell
# also defines; each run of load_model adds another copy to the graph.
with tf.device('/gpu:0'):
    model_gpu = load_model('model1.h5')
    tensor_gpu = model_gpu(x)

with tf.device('/cpu:0'):
    model_cpu = load_model('model1.h5')
    tensor_cpu = model_cpu(x)


In [158]:
# Cut the training images into batches of 128 rows each.
batches = split_in_batches(x_train, 128)

In [148]:
# Number of batches produced by the split (displayed as the cell output).
len(batches)

391

In [133]:
%%timeit -c 10
# Baseline: num_batches_cpu=0, so every batch goes to the worker that evaluates tensor_gpu.
# NOTE(review): `-c 10` does not set a loop/repeat count (the output reports
# "7 runs, 1 loop each"); presumably `-n` or `-r` was intended — confirm.
split_cpu_gpu(batches, 0, tensor_cpu, tensor_gpu)

7.25 s ± 1.01 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [134]:
%%timeit -c 10
# First 20 batches go to the tensor_cpu worker, the rest to the tensor_gpu worker.
# NOTE(review): `-c 10` does not set a loop/repeat count (the output reports
# "7 runs, 1 loop each"); presumably `-n` or `-r` was intended — confirm.
split_cpu_gpu(batches, 20, tensor_cpu, tensor_gpu)

7.8 s ± 576 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [135]:
%%timeit -c 10
# First 40 batches go to the tensor_cpu worker, the rest to the tensor_gpu worker.
# NOTE(review): `-c 10` does not set a loop/repeat count (the output reports
# "7 runs, 1 loop each"); presumably `-n` or `-r` was intended — confirm.
split_cpu_gpu(batches, 40, tensor_cpu, tensor_gpu)

8.08 s ± 548 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [167]:
%%timeit -c 5
# Re-run of the 40-batch CPU split; same call as the earlier cell that used `-c 10`.
# NOTE(review): about twice as slow as that earlier run with identical arguments, so the
# difference presumably comes from changed notebook state (models/tensors were rebuilt
# in between) — confirm before comparing the numbers. `-c 5` does not set a repeat count either.
split_cpu_gpu(batches, 40, tensor_cpu, tensor_gpu)

15.4 s ± 925 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
