In [None]:
import numpy as np
import os, sys
import keras
import keras.backend as K
from PIL import Image
import random
import glob
import pickle
from keras.preprocessing.image
import load_img, img_to_array
from bcolz_array_iterator import BcolzArrayIterator
from tqdm import tqdm
import bcolz
import threading
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
from time import sleep
%matplotlib inline

def limit_mem():
    """Replace the Keras backend session with one that grows GPU memory on demand.

    Closes the session currently held by the Keras backend, then installs a
    new session whose config has ``gpu_options.allow_growth`` enabled.
    """
    # Closing the existing session first lets GPU memory be reset without
    # closing the notebook.
    K.get_session().close()
    growth_config = K.tf.ConfigProto()
    growth_config.gpu_options.allow_growth = True
    fresh_session = K.tf.Session(config=growth_config)
    K.set_session(fresh_session)


limit_mem()

# Data directories used by the rest of the notebook.
DATA_PATH = '/home/bfortuner/workplace/data/imagenet_sample/'
RESIZED_PATH = '/home/bfortuner/workplace/data/image_resizing/'

# Save image filenames.
# Context managers guarantee the pickle file is flushed and closed before it
# is read back (the bare open() calls previously leaked the handles).
fnames = list(glob.iglob(DATA_PATH + '*/*.JPEG'))
with open(DATA_PATH + 'fnamesfullpath.p', 'wb') as fh:
    pickle.dump(fnames, fh)
with open(DATA_PATH + 'fnamesfullpath.p', 'rb') as fh:
    fnames = pickle.load(fh)

#Randomly Order Filenames (important for later steps w bcolz)
#fnames = np.random.permutation(fnames)
#pickle.dump(fnames, open(DATA_PATH+'fnamesfullpath_random.p', 'wb'))
#fnames = pickle.load(open(DATA_PATH+'fnamesfullpath_random.p', 'rb'))

NEW_SIZE = 72  # h x w to resize to
len(fnames)

### Explore Images

def get_paths_to_files(dir_path):
    """Return the paths of all files found under ``dir_path``, recursively.

    Args:
        dir_path: Directory to traverse with ``os.walk``.

    Returns:
        A list of paths, each formed by joining the directory reported by
        ``os.walk`` with a file name it contains. The list is empty when no
        files are found (including when ``dir_path`` does not exist, since
        ``os.walk`` ignores listing errors by default).
    """
    filepaths = []
    # Sub-directory names are unused: os.walk descends into them on its own.
    for dirpath, _dirnames, filenames in os.walk(dir_path):
        filepaths.extend(os.path.join(dirpath, f) for f in filenames)
    return filepaths

def get_random_image_path(dir_path):
    """Return the path of one randomly chosen file under ``dir_path``.

    Every file returned by ``get_paths_to_files`` is a candidate; no
    filtering by file extension is performed.

    Args:
        dir_path: Directory that is searched recursively for files.

    Returns:
        The path of a single file, picked uniformly at random.

    Raises:
        ValueError: If no files are found under ``dir_path``.
    """
    filepaths = get_paths_to_files(dir_path)
    if not filepaths:
        # Fail with a descriptive message instead of an opaque error from
        # the random module when there is nothing to choose from.
        raise ValueError(f'no files found under {dir_path!r}')
    return random.choice(filepaths)
# Show the path of a randomly selected file under DATA_PATH.
print(get_random_image_path(DATA_PATH))


# Pick a random file under RESIZED_PATH and load it; as the last expression
# of the cell, the loaded image is what the notebook displays.
tst_img = get_random_image_path(RESIZED_PATH)
load_img(tst_img)

### Technique 1 - Pillow SIMD
* https://pillow.readthedocs.io/en/4.0.x
* https://github.com/uploadcare/pillow-simd
* https://python-pillow.org/pillow-perf/
* Pillow Original - 15.2 seconds per 4000 images
* Pillow-SIMD - 12.7 secs per 4000 images
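
How much faster is SIMD than the original?
* http://math.stackexchange.com/questions/1227389/what-is-the-difference-between-faster-by-factor-and-faster-by-percent
* (15.2 - 12.7) / 12.7 = ~0.20, i.e. Pillow-SIMD is ~20% faster (its throughput is 15.2 / 12.7 = ~1.2x the original)
* 20% speedup (equivalently, ~16% less wall-clock time: (15.2 - 12.7) / 15.2 = ~0.16)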

# Force uninstall old version, then install pillow-simd\n#$ pip uninstall pillow
#$ CC=\"cc -mavx2\" pip install -U --force-reinstall pillow-simd
#12 seconds per 4000 images
import PIL
print (PIL.PILLOW_VERSION)
%time resize_images(fnames[:4000],NEW_SIZE)


### Technique 2 - Bcolz Array
# Root directory of the bcolz array, built with an f-string (Python 3.6+).
bcolz_file_path = f'{DATA_PATH}trn_resized_{NEW_SIZE}.bc'
bcolz_file_path

# Start from a zero-length float32 array of NEW_SIZE x NEW_SIZE x 3 elements;
# mode='w' together with rootdir ties the array to bcolz_file_path.
arr = bcolz.carray(
    np.empty((0, NEW_SIZE, NEW_SIZE, 3), dtype='float32'),
    chunklen=16,
    mode='w',
    rootdir=bcolz_file_path,
)

# Pre-allocate memory.
# NOTE(review): attributes of threading.local() are per-thread, so `place`
# set here is only visible to the thread running this cell - confirm that
# worker threads create their own buffer.
tl = threading.local()
tl.place = np.zeros((NEW_SIZE, NEW_SIZE, 3), dtype='uint8')

#https://github.com/noamraph/tqdm
def resize_images_bcolz(bc_arr, fnames, new_s):
    """Resize every image in ``fnames`` and append the result to ``bc_arr``.

    Args:
        bc_arr: Object exposing ``append`` (a ``bcolz.carray`` in this
            notebook); it is extended in place.
        fnames: Iterable of image file paths, processed in order.
        new_s: Target size forwarded to ``resize_img`` and ``app_img``.

    Returns:
        ``bc_arr``, after one element per file name has been appended.
    """
    # Iterate the file names directly; tqdm wraps the iterable to display a
    # progress bar.
    for fname in tqdm(fnames):
        img = resize_img(fname, new_s)
        elem = app_img(img, new_s)
        bc_arr.append(elem)
    return bc_arr

#Serial Version
%time resize_images_bcolz(arr,fnames[:4000],NEW_SIZE)
arr.flush()

### Technique 3 - Parallel Processing
#http://masnun.com/2016/03/29/python-a-quick-introduction-to-the-concurrent-futures-module.html
#Executor.map() example
def add_two(num):
    """Return ``num`` plus two (a trivial task used to exercise the executor)."""
    return num + 2

def parallel(nums, workers):
    """Apply ``add_two`` to every item of ``nums`` using a thread pool.

    Args:
        nums: Iterable of numbers to process.
        workers: Maximum number of worker threads in the pool.

    Returns:
        A list with the results, in the same order as ``nums``.

    Raises:
        ValueError: If ``workers`` is not greater than zero (raised by
            ``ThreadPoolExecutor``).
    """
    # Honour the caller's `workers` value; a hard-coded pool size would make
    # benchmarks with different worker counts measure the same thing.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        res = executor.map(add_two, nums)
        return list(res)
nums = [i for i in range(10000)]
%timeit res = parallel(nums,10)"}
%timeit parallel(nums,100)
