In [None]:
import warnings; warnings.filterwarnings('ignore')
from keras.preprocessing.image import load_img, save_img, img_to_array, array_to_img
from keras.applications import Xception, VGG19, InceptionV3, imagenet_utils
from collections import defaultdict
import matplotlib.pyplot as plt
from annoy import AnnoyIndex
import keras.backend as K
import numpy as np
import skimage, matplotlib, scipy, glob, json, shears, random, six, hashlib, sys, functools, math, uuid

# Build Model

In [None]:
# Index into `model.layers` of the layer whose output is sampled as the image vector;
# used as the default value of fit_transform's `sample_layer_index` argument.
SAMPLE_LAYER_INDEX = -2 # default index into Xception that will determine the image vector shape

# VGG16, VGG19, and ResNet take 224×224 images; InceptionV3 and Xception take 299×299 inputs
def resize_array(arr, shape=(299,299)):
  '''
  Convert an image array to an image object, resize it to `shape`, and
  return the resized image as an array divided by 255.
  '''
  resized_img = array_to_img(arr).resize(shape)
  pixels = img_to_array(resized_img)
  return pixels/255.0


def get_uuid(*args, dtype='str', max_size=sys.maxsize, right_pad=True):
  '''
  Return an identifier as a UUID string (default) or as an integer.

  When the first positional argument is a string the identifier is derived
  deterministically from that string; otherwise a new identifier is generated.
  @args
    *args: optional; if args[0] is a string it seeds the identifier
    dtype str: 'str' to return a UUID string, 'int' to return an integer
    max_size int: upper bound given to random.randint for unseeded integers; its
      digit count is also the length integers are padded to when `right_pad` is set
    right_pad bool: for dtype='int', append zeros until the integer has as many
      digits as `max_size`
  @returns
    str or int, depending on `dtype`
  @raises
    ValueError if `dtype` is neither 'str' nor 'int'
  '''
  if dtype not in ('str', 'int'):
    raise ValueError("dtype must be 'str' or 'int', got {!r}".format(dtype))
  # a string in first position (even an empty one) makes the result deterministic
  seed = args[0] if args and isinstance(args[0], str) else None
  if dtype == 'str':
    if seed is not None:
      return str(uuid.uuid3(uuid.NAMESPACE_URL, seed))
    return str(uuid.uuid1())
  if seed is not None:
    digits = str(get_int_hash(seed))
  else:
    digits = str(random.randint(0, max_size))
  if right_pad:
    # pad on the right so every integer has at least as many digits as max_size
    digits = digits.ljust(len(str(max_size)), '0')
  return int(digits)


def get_int_hash(s):
  '''
  Given a string (or bytes) return an integer hash of its content.
  @args
    s str|bytes: the content to hash; strings are encoded as UTF-8 first
  @returns
    int: the SHA-1 digest of `s` interpreted as a base-16 integer
  '''
  # hashlib only accepts bytes-like input, so encode text before hashing
  data = s.encode('utf8') if isinstance(s, str) else s
  return int(hashlib.sha1(data).hexdigest(), 16)

  
def get_image_vectors(img_path):
  '''Given the path to an image, return the model vectors for the figures extracted from that image'''
  figures = get_image_figures(img_path)
  return model.fit_transform(figures)


def fit_transform(obj, sample_layer_index=SAMPLE_LAYER_INDEX):
  '''
  Return the output of one layer of `model` for one or more images.
  @args
    obj: an array holding a single image (3 dimensions), an array holding a
      batch of images (4 dimensions), or an image object exposing `.resize`
    sample_layer_index int: index into model.layers of the layer to sample
  @returns
    ndarray: the sampled layer's output, with the images along the first axis
  '''
  sampled = model.layers[sample_layer_index].output
  if hasattr(obj, 'shape') and 0 in tuple(obj.shape):
    # an empty array (e.g. a page on which no figures were found) has nothing to
    # feed the model; return an empty result shaped like the sampled layer's output
    return np.zeros((0,) + tuple(K.int_shape(sampled)[1:]))
  # input shape must be n_images, h, w, colors: https://keras.io/preprocessing/image/
  if hasattr(obj, 'shape') and len(obj.shape) == 3:
    # this is an array of a single image
    arr = resize_array(obj, shape=(299,299))
    arr = np.expand_dims(arr, axis=0)
  elif hasattr(obj, 'shape') and len(obj.shape) == 4:
    # this is an array of images
    arr = np.array([resize_array(i, shape=(299,299)) for i in obj])
  else:
    # this is assumed to be a single image
    arr = img_to_array(obj.resize((299,299)))/255.0 # see SO 47697622
    # add the leading n_images axis, as the single-array branch above does
    arr = np.expand_dims(arr, axis=0)
  # only Xception requires preprocessing
  # NOTE(review): preprocess_input runs with its default arguments on values
  # already divided by 255 -- confirm this is the scaling Xception expects
  arr = imagenet_utils.preprocess_input(arr)
  # extract the sampled layer's output (by default the layer at SAMPLE_LAYER_INDEX)
  out = K.function([model.input], [sampled])([arr])
  # the k dim vectors (k is determined by the size of the layer that's sampled from)
  return out[0]


# load Xception with ImageNet weights and expose fit_transform as an attribute of the model
model = Xception(weights='imagenet')
model.fit_transform = fit_transform

# Image Cleaning

In [None]:
@functools.lru_cache(maxsize=1024)
def load_image(path, color_mode='rgb'):
  '''
  Given the path to an image return that image as a three-channel array
  resized by resize_image.
  @args
    path str: location of the image on disk
    color_mode str: forwarded to load_img; 'grayscale' input is expanded to
      three identical channels
  @returns
    ndarray: the resized image array
  Results are cached per (path, color_mode), so the returned array is shared
  between callers and should not be modified in place.
  '''
  # nb all image loading should call this method rather than load_img
  im = img_to_array(load_img(path, color_mode=color_mode))/255.0
  if color_mode == 'grayscale':
    # drop the trailing channel axis if present to get the 2-D intensity plane
    # (presumably img_to_array returns (h, w, 1) here -- verify)
    gray = im[:, :, 0] if im.ndim == 3 else im
    # replicate identical color information across all channels
    im = np.stack((gray, gray, gray), axis=-1)
  return resize_image(im)

def resize_image(im, width=1024):
  '''
  Resize an image array to `width` pixels wide, scaling the height by the
  same ratio, and return the result as an array divided by 255.
  '''
  rows, cols = im.shape[:2]
  target_height = rows/cols*width
  target_size = (int(width), int(target_height))
  scaled = array_to_img(im).resize(target_size)
  return img_to_array(scaled)/255.0
  
def clean_image(im):
  '''
  Resize an image array to the standard width, then set to 1 every pixel that
  the shears dominant-color / size filters mark with 1.
  '''
  cleaned = resize_image(im)
  mask = shears.remove_dominant_colors(cleaned, mask_size=0.05, n_colors_to_remove=2)
  mask = shears.filter_img(mask, min_size=1200, connectivity=60) # binarized mask-like
  # positions where the filtered mask equals 1 are overwritten with 1 in the resized image
  hits = np.where(mask==1)
  cleaned[hits] = 1
  return cleaned

# Image Partitioning Methods

In [None]:
@functools.lru_cache(maxsize=1024)
def get_image_windows(img_path, color_mode='rgb'):
  '''
  Given the path to an image, return an array of the sliding windows within that image.
  @args
    img_path str: location of the image on disk
    color_mode str: forwarded to load_image
  @returns
    ndarray: the windows produced by get_windows with its default settings
  '''
  # load_image already returns a float array divided by 255, so it is used as is
  arr = load_image(img_path, color_mode=color_mode)
  return get_windows(arr)


def get_windows(im, step=20, win=100, plot=False):
  '''
  Given a np array `im` of shape (height, width, 3), pass a sliding window of
  size (`win`, `win`) over it and return the windows.
  @args
    im ndarray: the image to cut into windows
    step int: the number of pixels between the origins of consecutive windows
    win int: the side length of each square window
    plot bool: whether to display each retained window
  @returns
    ndarray: shape (n_windows, win, win, 3); windows are ordered by x offset,
      then by y offset. Empty when the image is smaller than `win` or does
      not have three channels.
  This function is not memoized: ndarray arguments are not hashable.
  '''
  height, width = im.shape[:2]
  windows = []
  # only offsets at which a full window fits inside the image are visited
  for dx in range(0, width - win + 1, step): # x axis pass
    for dy in range(0, height - win + 1, step): # y axis pass
      w = im[dy:dy+win, dx:dx+win]
      if w.shape != (win, win, 3):
        continue
      windows.append(w)
      if plot:
        plt.imshow(w)
        plt.title('{}-{}'.format(dx, dy))
        plt.show()
  return np.array(windows)

@functools.lru_cache(maxsize=1024)
def get_image_figures(img_path):
  '''
  Given the path to an image, return a numpy array holding one resized crop
  per "figure" found in the image, where a figure is a single depicted
  object (like a plant, or planet).
  '''
  page = resize_image(load_image(img_path)) # get image
  luminance = skimage.color.rgb2gray(page) # grayscale
  # binarize: pixels brighter than the Otsu threshold become 1, the rest stay 0
  binary = np.zeros(luminance.shape)
  binary[np.where(luminance > skimage.filters.threshold_otsu(luminance))] = 1
  binary = shears.filter_img(binary, min_size=500, connectivity=60) # remove text
  binary = 1-binary # reverse figure / ground
  labelled = skimage.measure.label(binary, connectivity=2)
  figures = [] # crops of the original page, one per retained region
  for region in skimage.measure.regionprops(labelled): # find figures
    if region.area < 2000:
      continue
    top, left, bottom, right = region.bbox
    width = right-left
    height = bottom-top
    # skip patches whose aspect ratio is extreme in either direction
    if (width/height<0.2) or (height/width<0.2):
      continue
    figures.append(page[top:bottom, left:right])
  return np.array([resize_array(fig) for fig in figures])

# Vectorize Images

In [None]:
import traceback, os

meta_d = {} # d[guid] = {'path': 'collection': 'idx_in_collection'}

# every collection to vectorize: collection name -> glob pattern of its page images.
# all collections are processed in one run so that queries can match across collections.
collections = {
  'uvm-italian-herbal': 'data/uvm-italian-herbal/images/*.jpg',
  'voynichese': 'utils/voynichese/images/*.jpg',
}

out_dir = 'npy'

os.makedirs(out_dir, exist_ok=True)

# resume from the metadata written by an earlier run, if any
if os.path.exists('meta_d.json'):
  with open('meta_d.json') as f:
    meta_d = json.load(f)

for collection, pattern in collections.items():
  # sort so idx_in_collection does not depend on the order glob happens to return
  img_paths = sorted(glob.glob(pattern))
  for idx, i in enumerate(img_paths):
    guid = get_uuid(i)
    windows_path = os.path.join(out_dir, '{}-windows'.format(guid))
    vectors_path = os.path.join(out_dir, '{}-vectors'.format(guid))
    # np.save appends '.npy', so check for the suffixed files; skip work already on disk
    if not os.path.exists(windows_path + '.npy') or not os.path.exists(vectors_path + '.npy'):
      print(' * processing image {} of {} in {}'.format(idx+1, len(img_paths), collection))
      windows = get_image_figures(i)
      vectors = model.fit_transform(windows)
      np.save(windows_path, windows)
      np.save(vectors_path, vectors)
    # record the page even when its arrays were already saved, so the metadata
    # stays complete if meta_d.json was removed between runs
    meta_d[guid] = {'path': i, 'collection': collection, 'idx_in_collection': idx, 'image_type': 'complete'}

with open('meta_d.json', 'w') as out:
  json.dump(meta_d, out)

# Index Image Vectors

In [None]:
# reload the metadata from disk, closing the file once it has been read
with open('meta_d.json') as f:
  meta_d = json.load(f)
n_dims = 2048 # length of the vectors held by the annoy index
n_trees = 1024 # number of trees passed to the annoy index build step

In [None]:
from copy import deepcopy

insertion_idx = 0

# drop window records left over from an earlier run of this cell so that
# re-running it does not accumulate entries with outdated insertion indices
meta_d = {k: v for k, v in meta_d.items() if v.get('image_type') != 'window'}

# todo quantize vectors for faster i/o
t = AnnoyIndex(n_dims, 'angular')
vectors_suffix = '-vectors.npy'
# sort so insertion indices do not depend on the order glob happens to return
for vector_path in sorted(glob.glob(os.path.join(out_dir, '*' + vectors_suffix))):
  vectors = np.load(vector_path)
  # the file name is '<guid>-vectors.npy'; strip the suffix rather than splitting
  # on '-' because the guid strings contain '-' themselves
  guid = os.path.basename(vector_path)[:-len(vectors_suffix)]
  meta = meta_d.get(guid)
  if meta is None:
    print(' * skipping {}: no metadata for guid {}'.format(vector_path, guid))
    continue
  # process the collection - each vec in vectors is a window on the given page
  for vec_idx, vec in enumerate(vectors):
    # seed the id with the page guid and position so it is the same on every run
    sub_guid = get_uuid('{}-{}'.format(guid, vec_idx))
    meta_d[sub_guid] = deepcopy(meta)
    meta_d[sub_guid].update({'vec_idx': vec_idx, 'insertion_idx': insertion_idx, 'image_type': 'window'})
    t.add_item(insertion_idx, vec)
    insertion_idx += 1

t.build(n_trees)
t.save('voynich.ann')

with open('meta_d.json', 'w') as out: json.dump(meta_d, out)

# Query Index

In [None]:
def get_knn(guid, min_sim=0.7, display=True, query_n=100, result_n=5):
  '''
  Given a GUID, find the insertion_idx for that GUID, use that idx to query the index, then return/display knn
  @args
    guid str: the guid for a subregion of an input image
    min_sim float: the minimum similarity matches must possess to be returned
    display bool: whether to show the matches graphically
    query_n int: the number of neighbors requested from the index
    result_n int: the maximum number of filtered matches to return
  @returns
    [knns, sims]: the guids of the retained matches and their similarities,
      in the same order; both lists are empty when `guid` has no insertion_idx
  '''
  m = meta_d[guid]
  insertion_idx = m.get('insertion_idx')
  # compare against None: 0 is a valid insertion index (the first vector added)
  if insertion_idx is None: return [[], []]
  knns, distances = u.get_nns_by_item(insertion_idx, query_n, include_distances=True)
  # NOTE(review): similarity is taken as 1 - distance; annoy documents its angular
  # distance as sqrt(2(1-cos(u,v))), so this is not cosine similarity -- confirm intent
  sims = [1-i for i in distances[1:]] # slice off identity in sims and knns
  knns = [insertion_idx_to_guid[insertion_index] for insertion_index in knns[1:]]
  # keep the matches that are out of the query image's collection and similar enough
  c = m['collection']
  matches = [(k, s) for k, s in zip(knns, sims) if (meta_d[k]['collection'] != c) and (s >= min_sim)]
  matches = matches[:result_n]
  knns = [k for k, _ in matches]
  sims = [s for _, s in matches]
  # plot the matches if requested
  if display and knns:
    guids = [guid] + knns
    # display-only labels; kept separate so the returned sims stay aligned with knns
    shown_sims = [''] + [round(i, 2) for i in sims]
    labels = ['{} {}'.format(shown_sims[idx], guid_to_title(i)) for idx, i in enumerate(guids)]
    imgs = [guid_to_img(i) for i in guids]
    plot_img_grid(imgs, labels=labels)
  return [knns, sims]

def plot_guid(guid, title=None):
  '''Show the image for a GUID, titled with `title` or, when none is given, the title derived from the GUID'''
  img = guid_to_img(guid)
  if not title:
    title = guid_to_title(guid)
  plt.imshow(img)
  plt.title(title)
  plt.show()

def plot_img_grid(img_list, rows=1, size_scalar=2, labels=[]):
  '''
  Show the images in `img_list` on a grid with `rows` rows, filling the grid
  row by row; `labels[i]`, when present, becomes the title of image i.
  '''
  n_imgs = len(img_list)
  cols = math.ceil( n_imgs / rows )
  # one square cell of side `size_scalar` per grid position
  figsize = (cols*size_scalar, rows*size_scalar)
  fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
  for idx, img in enumerate(img_list):
    row, col = divmod(idx, cols)
    ax = axes[row][col]
    ax.imshow(img)
    if idx < len(labels):
      ax.set_title(labels[idx], fontsize=6.5)
  plt.show()
  
def guid_to_img(guid):
  '''Given a GUID return the extracted figure it refers to: entry `vec_idx` of the figures of the image at `path`'''
  record = meta_d[guid]
  figures = get_image_figures(record['path'])
  position = record['vec_idx']
  return figures[position]
  
def guid_to_title(guid):
  '''
  Given a GUID return a title for the image.
  @returns
    str: '<collection> <idx_in_collection>', followed by ' <vec_idx>' when the
      record's image_type is 'window'
  '''
  m = meta_d[guid]
  # every record gets a title; only window records carry a vec_idx to append
  title = '{} {}'.format(m['collection'], m['idx_in_collection'])
  if m.get('image_type') == 'window':
    title = '{} {}'.format(title, m['vec_idx'])
  return title

# open the saved annoy index for querying
u = AnnoyIndex(n_dims, 'angular')
u.load('voynich.ann')
# invert meta_d into insertion idx -> guid; records without an insertion idx
# (guids for full images) are keyed under -1
insertion_idx_to_guid = {record.get('insertion_idx', -1): key for key, record in meta_d.items()}

In [None]:
# guids of every window record that belongs to the voynichese collection
guids = [
  i for i in meta_d
  if meta_d[i]['image_type'] == 'window' and meta_d[i]['collection'] == 'voynichese'
]
print(len(guids))

In [None]:
# query every selected guid and remember the ones that return at least one match
guids_with_matches = []
for idx, i in enumerate(guids):
  knn, distances = get_knn(i, min_sim=0.8)
  if not knn:
    continue
  print(idx, i)
  guids_with_matches.append(i)

In [None]:
# display the metadata record of the guid at position 54 of `guids`
meta_d[guids[54]]