In [None]:
import warnings; warnings.filterwarnings('ignore')
from keras.preprocessing.image import load_img, save_img, img_to_array, array_to_img
from keras.applications import imagenet_utils, VGG19 #Xception, InceptionV3, MobileNetV2
from scipy.misc import imread, imsave, imresize
from uuid import NAMESPACE_URL, uuid1, uuid3
import matplotlib.patches as patches
from keras.models import load_model
from skimage.color import rgb2gray
from traceback import format_exc
import matplotlib.pyplot as plt
from functools import lru_cache
from annoy import AnnoyIndex
from six import string_types
from random import randint
try:
  import keras.backend as K
except:
  from tensorflow.keras import backend as K
from copy import deepcopy
from hashlib import sha1
from sys import maxsize
from math import ceil
import numpy as np
import skimage
import glob, json, shears, os, shutil

This pipeline is founded on the idea that each image is composed of 0 or more "figures",
where each figure is some semantic unit within the image (e.g. a single plant, or part of a plant).

# Helpers

In [None]:
##
# NN MODEL
##

# VGG16, VGG19, and ResNet take 224×224 images; InceptionV3 and Xception take 299×299 inputs
def resize_array(arr, shape=(304,304)):
  '''
  Resize an image array to `shape` and divide the result by 255.

  `arr` is converted with array_to_img, resized with the image's own
  resize method (which receives `shape` unchanged), converted back with
  img_to_array, and finally divided by 255.0.
  '''
  as_image = array_to_img(arr)
  return img_to_array(as_image.resize(shape)) / 255.0


def get_uuid(*args, dtype='str', max_size=maxsize, right_pad=True):
  '''
  Return an identifier as a string (dtype='str') or an integer (dtype='int').

  If the first positional argument is a string the identifier is derived
  from it (uuid3 for 'str', get_int_hash for 'int'), so the same string
  always yields the same identifier. Otherwise the identifier is random
  (uuid1 for 'str', randint(0, max_size) for 'int').

  When dtype is 'int' and `right_pad` is truthy, the decimal digits are
  right-padded with zeros to the number of digits in `max_size` before
  being converted to an int, so the result may be larger than `max_size`.

  Raises ValueError if `dtype` is neither 'int' nor 'str'.
  '''
  if dtype not in ('int', 'str'):
    raise ValueError("dtype must be 'int' or 'str', got {!r}".format(dtype))
  # a leading string argument seeds a deterministic identifier
  seeded = bool(args) and isinstance(args[0], string_types)
  if seeded:
    i = str(get_int_hash(args[0])) if dtype == 'int' else str(uuid3(NAMESPACE_URL, args[0]))
  else:
    i = str(randint(0, max_size)) if dtype == 'int' else str(uuid1())
  if dtype == 'int':
    if right_pad:
      i = i.ljust(len(str(max_size)), '0')
    return int(i)
  return i


def get_int_hash(s):
  '''
  Given a string return an integer hash of the string's content.

  `s` may be text or bytes; text is encoded as UTF-8 before hashing.
  The result is the SHA-1 digest interpreted as a base-16 integer.
  '''
  data = s if isinstance(s, (bytes, bytearray)) else s.encode('utf8')
  return int(sha1(data).hexdigest(), 16)


##
# IMAGE CLEANING
##

@lru_cache(maxsize=0)
def load_image(path, color_mode='rgb'):
  '''
  Load the image at `path` and return it as an array divided by 255 and
  resized by resize_image().

  When `color_mode` is 'grayscale' the loaded array is additionally passed
  through img_to_grayscale() before resizing.
  '''
  # nb all image loading should call this method rather than load_img
  # NOTE(review): lru_cache(maxsize=0) stores no entries in CPython -- confirm whether maxsize=None was intended
  pixels = img_to_array(load_img(path, color_mode=color_mode)) / 255.0
  if color_mode == 'grayscale':
    pixels = img_to_grayscale(pixels)
  return resize_image(pixels)


def resize_image(im, width=1024):
  '''
  Resize an image array to `width` pixels wide, scaling the height by the
  same factor, and return the result divided by 255.
  '''
  src_h, src_w = im.shape[:2]
  target_size = (int(width), int(src_h / src_w * width))
  pil_resized = array_to_img(im).resize(target_size)
  return img_to_array(pil_resized) / 255.0
  
  
def clean_image(im):
  '''
  Resize an image array with resize_image(), then set to 1 every position
  that the shears dominant-color / size filters mark with 1.
  '''
  resized = resize_image(im)
  mask = shears.remove_dominant_colors(resized, mask_size=0.05, n_colors_to_remove=2)
  mask = shears.filter_img(mask, min_size=1200, connectivity=60) # binarized mask-like
  # wherever the filtered mask equals 1, overwrite the resized image with 1
  resized[np.where(mask == 1)] = 1
  return resized


##
# IMAGE PARTITIONING
##

@lru_cache(maxsize=0)
def get_image_figures(img_path, grayscale=True, resize_figures=True, min_area=10000):
  '''
  Given the path to an image, return a tuple `(bboxes, figures)` with one
  entry for each "figure" in the image, where a figure represents a single
  object that's figured, or represented (like a plant, or planet).

  bboxes: list of [y_min, y_max, x_min, x_max], each expressed as a
    fraction (0:1) of the height / width of the resized image.
  figures: numpy array of the cropped figures. When `resize_figures` is
    truthy each crop is passed through resize_array(); otherwise the raw
    crops are returned.

  Regions smaller than `min_area` pixels, or whose bounding box has a
  width/height (or height/width) ratio below 0.2, are skipped. When
  `grayscale` is truthy the crops come from img_to_grayscale(image).
  '''
  orig = resize_image(load_image(img_path)) # get image
  gray = skimage.color.rgb2gray(orig) # grayscale
  o = np.zeros(gray.shape) # binarize
  o[np.where(gray>skimage.filters.threshold_otsu(gray))] = 1 # binarize
  o = shears.filter_img(o, min_size=500, connectivity=60) # remove text
  o = 1-o # reverse figure / ground
  # the crop source and the mask dimensions do not change between figures,
  # so compute them once rather than once per region
  img = img_to_grayscale(orig) if grayscale else orig
  height, width = o.shape
  l = [] # initialize list that will hold extracted figures
  bboxes = [] # initialize list that will hold bounding boxes
  for region in skimage.measure.regionprops(skimage.measure.label(o, connectivity=2)): # find figures
    if region.area < min_area: continue
    y_min, x_min, y_max, x_max = region.bbox
    # quick gut check to make sure this patch is not abnormally shaped
    w = x_max-x_min
    h = y_max-y_min
    if (w/h<0.2) or (h/w<0.2): continue
    # add the figure to the array of figures
    l.append(img[y_min:y_max, x_min:x_max])
    # get the positions of the bounding box for this figure 0:1 in w,h
    bboxes.append([y_min/height, y_max/height, x_min/width, x_max/width])
  if resize_figures:
    return bboxes, np.array([resize_array(i) for i in l])
  return bboxes, np.array([i for i in l])


@lru_cache(maxsize=0)
def get_image_windows(img_path, color_mode='rgb'):
  '''
  Load the image at `img_path` with load_image() and return the array of
  windows that get_windows() extracts from it.
  '''
  loaded = load_image(img_path, color_mode=color_mode)
  # NOTE(review): load_image() already divides by 255.0, so this divides a second time -- confirm intended
  return get_windows(img_to_array(loaded)/255.0)


def get_windows(im, step=20, win=300, n_per_side=20, plot=False):
  '''
  Given an image array `im`, return a numpy array of square windows of
  shape (`win`, `win`, 3) cut from it with a sliding window.

  The image is first resized with resize_array() to a square of side
  `win + step * n_per_side`. The window then moves in increments of `step`
  pixels along both axes (x in the outer loop, y in the inner loop); only
  windows that lie fully inside the image are retained.

  If `plot` is truthy every candidate window is shown with matplotlib,
  titled with its `dx-dy` offset.
  '''
  s = win+(step*n_per_side)
  im = resize_array(im, shape=(s,s))
  windows = []
  for col in range(im.shape[1]//step): # x axis pass
    dx = col*step
    # the y offset is derived from the row index, so it restarts at 0 for every column
    for row in range(im.shape[0]//step): # y axis pass
      dy = row*step
      w = im[dy:dy+win, dx:dx+win]
      if w.shape == (win, win, 3):
        windows.append(w)
      if plot:
        plt.imshow(w)
        plt.title('{}-{}'.format(dx, dy))
        plt.show()
  return np.array(windows)


@lru_cache(maxsize=0)
def guid_to_vecs(guid):
  '''
  Load `<out_dir>/<guid>-vectors.npy` and return its contents.

  Returns an empty list when the file cannot be loaded (a message is
  printed) or when the loaded array contains no truthy value.
  '''
  npy_file = '{}-vectors.npy'.format(guid)
  vec_path = os.path.join(out_dir, npy_file)
  try:
    loaded = np.load(vec_path)
    if np.any(loaded):
      return loaded
  except:
    print(' ! could not load vecs for guid', guid)
  # reached on load failure, or when the loaded array has no truthy entries
  return []


@lru_cache(maxsize=0)
def guid_to_vec(guid):
  '''
  Return the vector for a figure guid: look up the figure's record in
  `meta_d`, load its parent's vectors, and pick the entry at `vec_idx`.
  '''
  record = meta_d[guid]
  parent_vecs = guid_to_vecs(record['parent_guid'])
  return parent_vecs[record['vec_idx']]

##
# PLOTTING
##

def plot_img_grid(img_list, rows=1, size_scalar=2, labels=[]):
  '''
  Show the images in `img_list` on a grid with `rows` rows.

  The number of columns is ceil(len(img_list) / rows); each cell is
  `size_scalar` inches square. `labels[n]`, when present, becomes the
  title of the n-th image.
  '''
  cols = ceil( len(img_list) / rows )
  figsize = (cols*size_scalar, rows*size_scalar)
  # squeeze=False keeps `axes` two-dimensional even for a single row or column
  fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
  for position, image in enumerate(img_list):
    row, col = divmod(position, cols)
    ax = axes[row][col]
    ax.imshow(image)
    if position < len(labels):
      ax.set_title(labels[position], fontsize=6.5)
  plt.show()

  
def plot_guid(guid, title=None):
  '''
  Show the image for `guid`, titled with `title` or, when no title is
  given, with the result of guid_to_title(guid).
  '''
  image = guid_to_img(guid)
  if not title:
    title = guid_to_title(guid)
  plt.imshow(image)
  plt.title(title)
  plt.show()
  
  
def guid_to_img(guid):
  '''
  Given a GUID return a numpy image.

  If the record for `guid` in `meta_d` has a `parent_guid`, the GUID names
  a figure: the parent's figures are extracted with get_image_figures()
  and the one at the record's `vec_idx` is returned. Otherwise the GUID
  names a full image, which is loaded from the record's `path`.
  '''
  m = meta_d[guid]
  parent_guid = m.get('parent_guid', False)
  if parent_guid:
    # get_image_figures returns (bboxes, figures); only the figures are needed here
    # NOTE(review): assumes vec_idx is the figure's position within its parent, as in guid_to_vec -- confirm
    _, parent_figures = get_image_figures(meta_d[parent_guid]['path'])
    return parent_figures[m['vec_idx']]
  return img_to_array(load_image(m['path']))
  
  
def guid_to_title(guid):
  '''
  Return a plot title for `guid`: "<collection> <idx_in_collection> <vec_idx>"
  when the record has a parent_guid, otherwise the record's path.
  '''
  record = meta_d[guid]
  if record.get('parent_guid', False):
    return '{} {} {}'.format(record['collection'], record['idx_in_collection'], record['vec_idx'])
  return record.get('path')


def img_to_grayscale(im):
  '''
  Convert an image to grayscale with rgb2gray and return it as an
  (h, w, 3) array whose three channels all hold the same gray values.
  '''
  gray = rgb2gray(im).squeeze()
  stacked = np.zeros((gray.shape[0], gray.shape[1], 3))
  # replicate identical intensity information across all channels
  for channel in range(3):
    stacked[:,:,channel] = gray
  return stacked

# Load Images

Each image is a jpg file that may contain 0 or more figures. This pipeline starts by loading all image files to be processed.

In [None]:
# collect the path of every jpg in the voynichese image directory
images = glob.glob('utils/voynichese/images/*.jpg')

In [None]:
# load other collections: append every jpg found in a data/<name>/images directory
images += glob.glob('data/*/images/*.jpg')

In [None]:
# to expedite research efforts, it's helpful to limit the number of images to be processed.
# to do so, set the following conditional to True
# the np.random.seed makes the random image selection reproducible
if False:
  np.random.seed(0)
  # glob returns paths in arbitrary order, so sort first: a seeded shuffle is
  # only reproducible when it starts from the same ordering every run
  images = sorted(images)
  np.random.shuffle(images)
  images = images[:20]

In [None]:
# create a 2D list (list of lists) wherein each sublist contains all figures for the ith image in `images`
# create a 2D list of bounding boxes as well, where each sublist contains the y_min, y_max, x_min, x_max
# coordinates in 0:1 space for the ith image in `images`
figures = []
bboxes = []
for i in images:
  try:
    bbox, figure = get_image_figures(i)
  except Exception:
    print(' ! could not process', i)
    # record empty placeholders in BOTH lists so that `figures`, `bboxes`
    # and `images` stay aligned by index
    bbox, figure = [], []
  bboxes.append(bbox)
  figures.append(figure)

In [None]:
# plot a histogram of the number of figures in each image
figure_counts = [len(image_figures) for image_figures in figures]
plt.hist(figure_counts)
plt.xlabel('Number of Figures')
plt.ylabel('Count')
plt.title('Counts of Figures Per Input Image')
plt.show()

To allow us to map figures back to their parent images in what follows, the next codeblock flattens `figures` to a dictionary that has keys in the form of `a-b`, where `a` denotes index position of a figure's parent image among all images, and `b` denotes the index of the figure within all figures in that image

In [None]:
fig_map = {} # d[figure index among all figures] = '<image index among all images>-<figure index within image>'
path_map = {} # d[figure index among all figures] = path of the figure's parent image
k = 0 # running index of the current figure among all figures
for idx, image_figures in enumerate(figures):
  for jdx in range(len(image_figures)):
    fig_map[k] = '-'.join([str(idx), str(jdx)])
    path_map[k] = images[idx]
    k += 1

In [None]:
# flatten `figures` to an array with shape (n_figures, resized_h, resized_w, 3)
# the final dimension indicates each image has 3 color channels, while resized_h and
# resized_w are dictated by the resize_array() function called by get_image_figures() above
imgs_color = np.array([fig for image_figures in figures for fig in image_figures])

# Save each figure to disk

Each image contains multiple figures -- save each to disk

In [None]:
# create the directories in which output images and resized thumbs will be stored
# nb: `out_dir` remains bound after this loop (to output/thumbs) and is read as
# a global by guid_to_vecs()
for i in ['images', 'thumbs']:
  out_dir = os.path.join('output', i)
  if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# store a map from the index position of each figure among all figures to the
# path of the image file in which that figure is illustrated
fig_idx_to_path = {}

# save the full-sized version of each figure to disk
# this entails drawing a black box around the given figure within its parent image
# some figures will contain the entirety of their parent image...
for idx, i in enumerate(sorted(fig_map.keys())):
  # fig_map values have the form '<image index>-<figure index within image>'
  img_idx, fig_idx = [int(j) for j in fig_map[i].split('-')]
  img_path = images[img_idx]
  bn = os.path.basename(img_path)
  bbox = bboxes[img_idx][fig_idx]
  # read the image
  im = imread(img_path)
  # get image h, w
  scale_h, scale_w = im.shape[:2]
  # create a plot that will use the image to fill the entire frame
  fig = plt.figure(frameon=False)
  # size the figure in inches as pixels/100
  fig.set_size_inches(scale_w/100,scale_h/100)
  # make the axes fill the figure
  ax = plt.Axes(fig, [0., 0., 1., 1.])
  ax.set_axis_off()
  fig.add_axes(ax)
  # add the image to the plot
  ax.imshow(im)
  # determine the bounding box size for this figure
  # bbox holds fractions of the image size, so scale them back to pixels
  y1, y2, x1, x2 = bbox
  w = (x2-x1) * scale_w
  h = (y2-y1) * scale_h
  x = x1 * scale_w
  y = y1 * scale_h
  # draw a rectangle indicating figure size on parent image
  rect = patches.Rectangle((x,y),w,h,linewidth=1,edgecolor='black',facecolor='none')
  # add the patch to the Axes
  ax.add_patch(rect)
  # save the resulting image with box to disk as <parent basename>-<figure index>.jpg
  out_file = bn.replace('.jpg', '-' + str(fig_idx) + '.jpg')
  out_path = os.path.join('output', 'images', out_file)
  plt.savefig(out_path)
  print(' * saved', out_path)
  # close the figure so open figures do not accumulate across iterations
  plt.close()
  # store the mapping from this figure's index position to the figure's representation
  # in a file on disk
  fig_idx_to_path[idx] = out_path

In [None]:
def resize_img(image_path, min_width=200, min_height=160):
  '''
  Read the image at `image_path` and return a resized copy.

  Images taller than they are wide are scaled so their width equals
  `min_width`; all others are scaled so their height equals `min_height`.
  '''
  img = imread(image_path)
  dims = img.shape
  if len(dims) == 2:
    # single-channel image: add an explicit channel count so the unpack below works
    dims = (dims[0], dims[1], 1)
  h, w, _channels = dims # height, width, color
  portrait = h > w
  if portrait:
    target = (int(h * (min_width / w)), min_width)
  else:
    target = (min_height, int(w * (min_height / h)))
  return imresize(img, target)

# resize each image we just created in output/images to a smaller size within output/thumbs
for full_path in glob.glob(os.path.join('output', 'images', '*')):
  thumb_path = os.path.join('output', 'thumbs', os.path.basename(full_path))
  imsave(thumb_path, resize_img(full_path))

# Clear up memory

In [None]:
import gc

# to free up some memory, we can delete objects that are no longer used,
# then evoke the garbage collector `gc`
# (`imgs_color` holds the flattened figures that later cells use)
del figures

# one can also clear the cache on lru decorated functions to free up memory
# NOTE(review): these functions are declared with lru_cache(maxsize=0), which
# stores no entries in CPython -- confirm whether caching was intended
load_image.cache_clear()
get_image_figures.cache_clear()
get_image_windows.cache_clear()

gc.collect()

# Image Vectorization

Next let's transform each image into a vector. This notebook contains three methods we can use to achieve this goal:

1. We can flatten each input image into a vector
2. We can sample from a hidden layer in a pretrained network
3. We can build our own network and vectorize images with that network

In [None]:
# identify the vectorization method to use. Valid options are 'raw', 'pretrained', or 'autoencoder'
# each vectorization cell below runs only when this value names its method
vectorization_method = 'pretrained'

### Raw Image Vectorization

In [None]:
# simply use the raw image objects as image vectors
# this will mean using pixel-level deltas between images
# to identify similar images
# (this is the least sophisticated approach to image vectorization, but
# it clarifies what is happening when we use more complex methods to vectorize
# images below)
if vectorization_method == 'raw':
  X = np.array([fig.flatten() for fig in imgs_color])
  # the vectors are the flattened pixels themselves, so z and X are the same array
  z = X

### Vectorize with Pretrained Network

In [None]:
if vectorization_method == 'pretrained':
  # specify the pretrained model to use and identify the size of its internal layers
  # note that different models are optimized for different input image shapes
  #model = Xception()
  #model = MobileNetV2()
  model = VGG19()

  # indicate the size of each internal hidden layer in the selected network
  # (the leading batch dimension is dropped before multiplying the sizes)
  for layer_idx, layer in enumerate(model.layers):
    print(layer_idx, np.prod(layer.output_shape[1:]))

  # VGG16, VGG19, and ResNet take 224×224 images; InceptionV3 and Xception take 299×299 inputs
  X = np.array([resize_array(fig, shape=(224,224)) for fig in imgs_color])
  # only xception requires img preprocessing
  if model.name == 'xception':
    X = imagenet_utils.preprocess_input(X)

In [None]:
if vectorization_method == 'pretrained':
  # select the index of the layer to use when vectorizing images
  # nb: different layers have different shapes! The layer from which one
  # samples has a huge influence on the kinds of image similarities detected
  # by the model
  sample_layer_index = 6

  # create a vectorization function that extracts the output from the `sample_layer_index` layer
  vectorize = K.function([model.input], [model.layers[sample_layer_index].output])

  # to process the entire dataframe in one shot, set the following to True
  # and flip the boolean immediately below to False
  if False:
    # `vectorize` takes a list of inputs and returns a list of outputs, so wrap
    # X in a list and take the first (only) output before flattening per figure
    z = vectorize([X])[0]
    z = np.array([i.flatten() for i in z])

  # to process the dataframe figures one by one and log the progress as you go,
  # set the following to True and the boolean above to False
  if True:
    vecs = []
    for idx, i in enumerate(X):
      # add a leading batch dimension of 1 to the single figure
      query_img = np.expand_dims(i, 0)
      vec = vectorize([query_img])[0]
      vecs.append(vec)
      print(' * vectorized', idx+1, 'of', len(X), 'frames')
    z = np.array([i.flatten() for i in vecs])

  # scale X 0:1 so images can be visualized below
  X = rgb2gray(X)
  X = X-np.min(X)
  X /= np.max(X)

In [None]:
# optionally, to avoid having to recompute z, X, images, and fig_map, one can cache
# these data objects by setting the following boolean to True
if vectorization_method == 'pretrained' and False:
  np.save('pretrained-z',z)
  np.save('pretrained-X',X)
  np.save('pretrained-images',images)
  # use a context manager so the file is flushed and closed once written
  with open('pretrained-fig-map.json', 'w') as out:
    json.dump(fig_map, out)

### Vectorize with a Custom CNN

In [None]:
# Here we'll compose a custom Convolutional Autoencoder to vectorize our input images
from keras.models import Model
from keras.layers import Input, Reshape, Dense, Flatten, Conv2D, MaxPooling2D, UpSampling2D

# kernel sizes of the four convolution stages; the Autoencoder class below
# uses conv1..conv4 in its encoder and conv4..conv1 in its decoder
conv1 = (6,6)
conv2 = (5,5)
conv3 = (4,4)
conv4 = (4,4)

# pool sizes of the four pooling stages; the decoder upsamples by the same
# factors in the opposite order
pool1 = (2,2)
pool2 = (2,2)
pool3 = (2,2)
pool4 = (2,2)

class Autoencoder:
  '''
  Convolutional autoencoder exposing three Keras models:
    encoder: image -> latent feature map
    decoder: latent feature map -> image
    model: encoder followed by decoder, compiled with mse loss and adam
  '''
  def __init__(self, img_shape=(304, 304, 1)):
    '''
    Build the encoder, decoder and combined model.

    img_shape: (height, width, channels) of the input images. The decoder
      upsamples by the same factors the encoder pools by, so its output only
      matches the input size when height and width are divisible by the
      product of the pool sizes (16 with the module-level defaults).
    '''
    if not img_shape: raise Exception('Please provide img_shape (height, width) in px')

    # create the encoder
    i = h = Input(img_shape) # the encoder takes as input images
    h = Conv2D(16, conv1, activation='relu', padding='same')(i)
    h = MaxPooling2D(pool1, padding='same')(h)
    h = Conv2D(8, conv2, activation='relu', padding='same')(h)
    h = MaxPooling2D(pool2, padding='same')(h)
    h = Conv2D(8, conv3, activation='relu', padding='same')(h)
    h = MaxPooling2D(pool3, padding='same')(h)
    h = Conv2D(4, conv4, activation='relu', padding='same')(h)
    h = MaxPooling2D(pool4, padding='same')(h)
    self.encoder = Model(inputs=[i], outputs=[h])

    # derive the latent height / width from img_shape instead of hard-coding
    # them: each 'same'-padded pooling stage maps n to ceil(n / pool size)
    # (the default 304 px input gives 19)
    latent_h, latent_w = img_shape[0], img_shape[1]
    for pool in (pool1, pool2, pool3, pool4):
      latent_h = ceil(latent_h / pool[0])
      latent_w = ceil(latent_w / pool[1])

    # create the decoder; 4 is the filter count of the encoder's last convolution
    i = h = Input((latent_h, latent_w, 4))
    h = Conv2D(4, conv4, activation='relu', padding='same')(h)
    h = UpSampling2D(pool4)(h)
    h = Conv2D(8, conv3, activation='relu', padding='same')(h)
    h = UpSampling2D(pool3)(h)
    h = Conv2D(8, conv2, activation='relu', padding='same')(h)
    h = UpSampling2D(pool2)(h)
    h = Conv2D(16, conv1, activation='relu', padding='same')(h)
    h = UpSampling2D(pool1)(h)
    # reconstruct as many channels as the input has (1 for the default shape)
    h = Conv2D(img_shape[-1], (3, 3), activation='sigmoid', padding='same')(h)
    self.decoder = Model(inputs=[i], outputs=[h])

    # combine the encoder and decoder into a full autoencoder
    i = Input(img_shape) # take as input image vectors
    z = self.encoder(i) # push observations into latent space
    o = self.decoder(z) # project from latent space to feature space
    self.model = Model(inputs=[i], outputs=[o])
    self.model.compile(loss='mse', optimizer='adam')

# build the autoencoder with its default input shape
# nb: this runs regardless of the selected vectorization_method
autoencoder = Autoencoder()

# inspect the model
autoencoder.encoder.summary()
autoencoder.decoder.summary()
autoencoder.model.summary()

In [None]:
if vectorization_method == 'autoencoder':
  # this autoencoder expects a single color channel (i.e. grayscale inputs)
  gray_figures = np.array([rgb2gray(fig) for fig in imgs_color])
  # append a trailing axis to hold the single channel
  X = np.expand_dims(gray_figures, 3)

In [None]:
if vectorization_method == 'autoencoder':
  # train the autoencoder - for production purposes run more epochs, ~100 is a decent start.
  # once the loss term ceases to fall much over many iterations,
  # you can lower the learning rate and train more...
  # the inputs double as the targets: the model learns to reproduce X
  autoencoder.model.fit(X, X, batch_size=256, epochs=100)

In [None]:
# nb: this rebinds `model` to the encoder regardless of the selected vectorization_method
model = autoencoder.encoder

if vectorization_method == 'autoencoder':
  # optionally, save or load a model -- this is useful if you run lots of training
  if True:
    autoencoder.model.save('voynich.hdf5')
  if False:
    model = load_model('voynich.hdf5')
    # the figures are encoded with autoencoder.encoder, so copy the loaded
    # weights into `autoencoder` as well; without this the loaded model is
    # bound to `model` but never used for encoding
    autoencoder.model.set_weights(model.get_weights())

In [None]:
if vectorization_method == 'autoencoder':
  # test the autoencoder's reconstruction of sample inputs
  # the closer the outputs are to the input, the better the model
  # has learned the data's features

  # select the index of the figure to test (clamped to the last available figure)
  sample_idx = min(131, len(X)-1)

  # plot the figure
  plt.imshow(X[sample_idx].squeeze())
  plt.show()

  # plot the model's reconstruction of the figure
  # `model` may still be bound to the encoder, whose output is the latent
  # feature map rather than an image; reconstruct with the full autoencoder
  # in that case, otherwise use the (loaded) full model bound to `model`
  reconstructor = autoencoder.model if model is autoencoder.encoder else model
  o = reconstructor.predict(np.expand_dims(X[sample_idx], 0))
  plt.imshow(o.squeeze())

In [None]:
# encode each figure - i.e. push each into the latent space
if vectorization_method == 'autoencoder':
  latent_maps = autoencoder.encoder.predict(X)
  # flatten each figure's latent feature map into a single vector
  z = np.array([latent.flatten() for latent in latent_maps])

In [None]:
# this is another opportunity to cache model outputs to save compute time in future runs
if vectorization_method == 'autoencoder':
  np.save('autoencoder-z',z)
  np.save('autoencoder-X',X)
  np.save('autoencoder-images',images)
  # use a context manager so the file is flushed and closed once written
  with open('autoencoder-fig-map.json', 'w') as out:
    json.dump(fig_map, out)

# Load data artifacts from cache

In [None]:
# optionally, one can load saved data objects that were saved above
vectorization_method = 'pretrained'

if False:
  z = np.load(vectorization_method + '-z.npy')
  X = np.load(vectorization_method + '-X.npy')
  images = np.load(vectorization_method + '-images.npy')
  # use a context manager so the file handle is closed after reading
  with open(vectorization_method + '-fig-map.json') as f:
    fig_map = json.load(f)
  # JSON object keys are always strings; restore the integer figure indices
  fig_map = {int(k): v for k,v in fig_map.items()}

# Metadata Prep

In what follows, we'll create the MongoDB database tables for the Neural Neighbors application. This database uses two tables. The records in those tables take the forms illustrated below:

<b>metadata</b>:
```
"permalink" : "http://cdi.uvm.edu/.../datastream/JPG/view",
"collection" : "UVM Italian Herbal",
"date" : "1550",
"title" : "http://cdi.uvm.edu/.../datastream/TN/view",
"location" : "UVM",
"artist" : "Artist's name would go here",
"has_knn" : 1, # bool showing whether image has knn from another collection
"image" : "015ff675-de31-11e9-a0eb-f45c89b66fa9.jpg",
"text" : "Metadata for this page would go here"
```

<b>knn</b>:
```
"image" : "015ff675-de31-11e9-a0eb-f45c89b66fa9.jpg",
"knn": [list of metadata objects, each augmented with a "similarity" float value]
```

In [None]:
# let's load all extant metadata to prepare to save data in this format.
# each image's metadata is expected in a sibling `metadata` directory, in a
# json file that shares the image's basename
img_to_meta = {} # d[image basename] = {metadata for image}
for i in images:
  bn = os.path.basename(i)
  meta_path = i.replace('/images/', '/metadata/').replace('.jpg', '.json')
  # use a context manager so each file handle is closed after reading
  with open(meta_path) as f:
    img_to_meta[ bn ] = json.load(f)

In [None]:
from copy import deepcopy

# map each figure to the metadata for its parent image
fig_to_meta = {} # d[figure image basename] = {metadata for figure}
for fig_path in glob.glob('output/images/*.jpg'):
  bn = os.path.basename(fig_path)
  # figure files are named <parent name>-<figure index>.jpg, so dropping the
  # final dash-delimited piece recovers the parent's basename
  parent_bn = '-'.join(bn.split('-')[:-1]) + '.jpg'
  # copy the parent's metadata so per-figure fields don't leak between figures
  meta = deepcopy(img_to_meta[parent_bn])
  meta['image'] = bn
  meta['title'] = bn
  fig_to_meta[bn] = meta

# Identify KNN & Store in DB

Whichever vectorization method we use, let's now identify the k-nearest neighbors for each of our figures.

In [None]:
# z.shape = (number of figures, dimensions in each figure)
# as the last expression in the cell, the shape is displayed as the cell's output
z.shape

In [None]:
# Build a KNN index
from annoy import AnnoyIndex

# if True will build a new KNN index, else we'll load from cache
build = True

# increasing n_trees takes more time and memory but produces better results
n_trees = 1000

if build:
  # the index dimensionality is the length of one flattened figure vector
  t = AnnoyIndex(len(z[0].flatten()), 'angular')
  for idx, i in enumerate(z):
    t.add_item(idx, i.flatten())
  _ = t.build(n_trees)
  t.save('voynich.ann')

else:
  t = AnnoyIndex(len(z[0].flatten()), 'angular')
  # load the saved index into `t`; the return value is not a model, so it is
  # deliberately not bound to `model`
  t.load('voynich.ann')

In [None]:
# number of nearest neighbors to find (nb: the first retrieved knn should be the query image itself)
k = 100

# bool indicating whether to plot the knn for each image
plot = False

# minimum similarity two images must have to be called a match for out of collection knn identification
min_sim = 0.8

# d[img_basename] = [[img_basename, sim], [img_basename, sim], ...]
# only out of collection matches above `min_sim` are retained (at most 20 per figure)
knn_map = {}

for i in fig_idx_to_path:
  bn = os.path.basename(fig_idx_to_path[i])
  # store knn mapping data
  knn, dist = t.get_nns_by_item(i, k, include_distances=True)
  # convert distances to similarities
  sims = [round(1-d, 4) for d in dist]
  # get the figure basenames of the matching knn
  bns = [os.path.basename(fig_idx_to_path[j]) for j in knn]
  # determine whether there are any out of collection matches with sufficiently high similarity
  collections = [fig_to_meta[j]['collection'] for j in bns]
  q_collection = fig_to_meta[bn]['collection']
  # only retain out of collection matches above a certain threshold in the knn
  # matches are stored by basename, which is the key the database seeding
  # cell uses to look each match up in fig_to_meta
  filtered_knn = []
  filtered_sims = []
  for idx, match_bn in enumerate(bns):
    if sims[idx]>min_sim and collections[idx] != q_collection:
      filtered_knn.append(match_bn)
      filtered_sims.append(sims[idx])
  # test for a non-empty list rather than any(): a lone falsy entry must still count as a match
  ooc_matches = 1 if filtered_knn else 0
  # update the fig_to_meta map to indicate whether this img has ooc matches
  # NOTE(review): 'has_ooc_math' looks like a typo for 'has_ooc_match'; left as-is in case a consumer reads this key
  fig_to_meta[bn].update({'has_ooc_math': ooc_matches})
  # update the fig_to_meta map to indicate whether this image has knn
  fig_to_meta[bn].update({'has_knn': ooc_matches})
  # augment the knn map to indicate nearest matches and sims for `i`
  knn_map[bn] = list(zip(filtered_knn[:20], filtered_sims[:20]))
  # plot the knn if requested
  if plot:
    # plot the knn for `i`
    img_list = [X[j].squeeze() for j in knn] # show the most similar image to the input img
    # recover the parent image index and in-image figure index for each match
    img_idx, fig_idx = zip(*[[int(part) for part in fig_map[j].split('-')] for j in knn])
    # display the figure offset, similarity, and image path for all matches
    print('\n'.join(['{} {} {}'.format(knn[idx], sims[idx], images[img]) for idx, img in enumerate(img_idx)]))
    plot_img_grid(img_list, labels=sims)

# Seed Database

In [None]:
from pymongo import MongoClient

# when using the cluster, save to local disk, else save to mongo
using_cluster = True

if not using_cluster:
  # identify the name of the database in which to save the resulting data
  db = MongoClient().neuralneighbors
  # remove all records from the extant db tables
  print(db.knn.remove({}))
  print(db.metadata.remove({}))
else:
  # create the local directories that will hold one json file per figure
  for d in ['knn', 'metadata']:
    if not os.path.exists(d): os.makedirs(d)

# save each figure that has matches to the database
saved = 0
for i in knn_map:
  if not knn_map[i]: continue
  bns, sims = zip(*knn_map[i])
  # build a fresh dict per match so the similarity for this pairing is not
  # written into (and later overwritten in) the shared fig_to_meta records
  knn_vals = [dict(fig_to_meta[j], similarity=sims[jdx]) for jdx, j in enumerate(bns)]
  knn_data = {'image': i, 'knn': knn_vals}
  # copy the record so saving it cannot modify fig_to_meta
  metadata_data = dict(fig_to_meta[i])
  # save to mongo if not using cluster
  if not using_cluster:
    db.knn.insert(knn_data)
    db.metadata.insert(metadata_data)
  else:
    with open(os.path.join('knn', i), 'w') as out: json.dump(knn_data, out)
    with open(os.path.join('metadata', i), 'w') as out: json.dump(metadata_data, out)
  saved += 1

# indicate the total number of matches found
print(' * found', saved, 'matches')

# create indices for the tables to expedite query times
# (`db` only exists when saving straight to mongo)
if not using_cluster:
  db.knn.create_index('image')
  db.metadata.create_index('image')

In [None]:
# once the data is saved to local disk on cluster, we can use the following
# to port the data into a mongo db (e.g. on Cuda / Heroku)

db = MongoClient().neuralneighbors
# remove all records from the extant db tables
print(db.knn.remove({}))
print(db.metadata.remove({}))

# insert each saved json record into its table, closing every file after reading
for i in glob.glob(os.path.join('knn', '*')):
  with open(i) as f: db.knn.insert(json.load(f))
for i in glob.glob(os.path.join('metadata', '*')):
  with open(i) as f: db.metadata.insert(json.load(f))

# create indices for the tables to expedite query times
db.knn.create_index('image')
db.metadata.create_index('image')

In [None]:
# now just move ./output to the location of the neural-neighbors root directory
# and run the Docker / Heroku commands to deploy the application!