In [None]:
import warnings; warnings.filterwarnings('ignore')
from keras.preprocessing.image import load_img, save_img, img_to_array, array_to_img
from keras.applications import Xception, VGG19, InceptionV3, imagenet_utils
from collections import defaultdict
import matplotlib.pyplot as plt
from annoy import AnnoyIndex
import keras.backend as K
import numpy as np
import glob, json

# Build Model

In [None]:
SAMPLE_LAYER_INDEX = -2 # default index into model.layers (Xception below); the sampled layer's output determines the image vector shape

# VGG16, VGG19, and ResNet take 224×224 images; InceptionV3 and Xception take 299×299 inputs
def resize_array(arr, shape=(299,299)):
  '''Return `arr` resampled to `shape` by round-tripping it through an image object'''
  # array -> image -> resized image -> array, in a single expression
  return img_to_array(array_to_img(arr).resize(shape))


def fit_transform(obj, sample_layer_index=SAMPLE_LAYER_INDEX):
  '''Return the activations of one layer of the global `model` for the given image(s).

  Args:
    obj: one of
      * an array with 3 axes, treated as a single image (h, w, colors);
      * an array with 4 axes, treated as a batch (n_images, h, w, colors);
      * an object without a `shape` attribute, assumed to be a PIL-style image
        exposing `.resize()`.
    sample_layer_index: index into `model.layers` of the layer to sample from.

  Returns:
    The output array of the sampled layer; its first axis indexes the input
    images and the remaining axes are determined by the sampled layer.

  Raises:
    ValueError: if `obj` is an array that has neither 3 nor 4 axes (for example
      an empty array of windows).
  '''
  # input shape must be n_images, h, w, colors: https://keras.io/preprocessing/image/
  n_axes = len(obj.shape) if hasattr(obj, 'shape') else None
  if n_axes == 3:
    # this is an array of a single image; add the batch axis
    arr = np.expand_dims(resize_array(obj), axis=0)
  elif n_axes == 4:
    # this is an array of images
    arr = np.array([resize_array(i) for i in obj])
  elif n_axes is None:
    # this is assumed to be a single image object; the original code referenced an
    # undefined name `img` here and omitted the batch axis (see SO 47697622)
    arr = np.expand_dims(img_to_array(obj.resize((299,299))), axis=0)
  else:
    raise ValueError('expected an image array with 3 or 4 axes, got shape {}'.format(obj.shape))
  # Xception weights expect inputs scaled to [-1, 1] ("tf" mode); the default mode of
  # imagenet_utils.preprocess_input is "caffe" (BGR + mean subtraction), which does not
  # match. NOTE: vectors cached with the previous preprocessing must be regenerated.
  arr = imagenet_utils.preprocess_input(arr, mode='tf')
  # run the model up to the requested layer and read that layer's output
  out = K.function([model.input], [model.layers[sample_layer_index].output])([arr])
  # K.function returns one array per requested output; only one was requested
  return out[0]


# Load Xception with its ImageNet weights and expose the helper above as an
# attribute so call sites can write `model.fit_transform(...)`
model = Xception(weights='imagenet')
model.fit_transform = fit_transform

# Model I/O Helpers

In [None]:
def get_windows(im, step=20, win=100, plot=False):
  '''Slide a square window over an image array and collect every full RGB crop.

  Args:
    im: ndarray indexed as [row, column, ...].
    step: stride in pixels between consecutive window origins, on both axes.
    win: side length in pixels of each window.
    plot: if True, show every crop with matplotlib, titled '{dx}-{dy}'.

  Returns:
    ndarray of the crops whose shape is exactly (win, win, 3), ordered column by
    column (dx outer loop, dy inner loop). Crops with any other shape are
    dropped, so the result is an empty array when nothing qualifies.
  '''
  windows = []
  # axis 1 holds columns (dx) and axis 0 holds rows (dy); the original bounded each
  # loop by the other axis, which only works for square images
  n_cols = im.shape[1]//step
  n_rows = im.shape[0]//step
  for col in range(n_cols): # x axis pass
    dx = col*step
    for row in range(n_rows): # y axis pass
      # derive dy from the loop index so it restarts at 0 for every column; the
      # original kept accumulating dy and so only ever covered the first column
      dy = row*step
      dim = im[dy:dy+win, dx:dx+win]
      if dim.shape == (win, win, 3):
        windows.append(dim)
      if plot:
        plt.imshow(dim)
        plt.title('{}-{}'.format(dx, dy))
        plt.show()
  return np.array(windows)

def get_image_windows(img_path, color_mode='rgb'):
  '''Load the image at `img_path` and return the windows cut from its pixel array'''
  pixels = img_to_array(load_img(img_path, color_mode=color_mode))
  # divide by 255 before windowing
  return get_windows(pixels/255)

def get_image_vectors(img_path):
  '''Given the path to an image, return the model vectors for that image's windows.

  The first axis of the result indexes the windows; the remaining shape is set by
  the layer that `model.fit_transform` samples from by default.
  '''
  img_windows = get_image_windows(img_path)
  return model.fit_transform(img_windows)

# Investigate feature separation given sample layer index

In [None]:
import scipy
from itertools import combinations
from multiprocessing import Pool

# glob returns paths in arbitrary order; sort so the sampled image is reproducible
img_paths = sorted(glob.glob('utils/voynichese/images/*.jpg'))
im = img_paths[0] # path of the image used for the layer comparison
n = 40 # number of window vectors to scatter-plot per layer
columns = 10 # subplot grid width
rows = -(-n // columns) # ceil(n / columns), so the grid always has room for n plots
size_scalar = 4 # inches per subplot cell

# get this image's windows
windows = get_image_windows(im)

def process_layer_index(layer_index):
  '''Plot and summarise how well one model layer separates the sample windows.

  Reads the globals `windows`, `model`, `n`, `rows`, `columns` and `size_scalar`.
  Writes 'layer-index-{layer_index}-scatter.png' (raw activations of the first
  `n` windows) and 'layer-index-{layer_index}-hist.png' (histogram of pairwise
  cosine distances between all window vectors).

  Args:
    layer_index: index into `model.layers` of the layer to sample.

  Returns:
    [vector_length, aggregate_distance], where vector_length is the flattened
    size of one window's vector and aggregate_distance is the sum of all
    pairwise cosine distances.

  Raises:
    ValueError: if the model returns no vectors for the windows.
  '''
  # `import scipy` alone does not guarantee that the scipy.spatial submodule is
  # loaded, so import the function that is actually needed
  from scipy.spatial.distance import cosine

  print(' * processing layer', layer_index)
  # get this image's vectors
  vectors = model.fit_transform(windows, layer_index)
  if len(vectors) == 0:
    raise ValueError('no vectors were produced for layer index {}'.format(layer_index))
  # one flat row of features per window
  flat = np.reshape(vectors, (len(vectors), -1))

  # squeeze=False keeps `axes` two-dimensional even when rows == 1
  fig, axes = plt.subplots(rows, columns, squeeze=False, figsize=(columns*size_scalar,rows*size_scalar))
  for odx, o in enumerate(flat[:n]):
    axes[odx//columns][odx%columns].scatter(range(len(o)), o, s=0.01)
  fig.savefig('layer-index-{}-scatter.png'.format(layer_index))
  # close (not just clear) the figure so repeated calls do not accumulate figures
  plt.close(fig)

  # get the histogram of vector similarities
  distances = [cosine(i, j) for i, j in combinations(flat, 2)]
  fig, ax = plt.subplots(figsize=(columns*size_scalar,rows*size_scalar))
  ax.hist(distances, bins=50)
  fig.savefig('layer-index-{}-hist.png'.format(layer_index))
  plt.close(fig)

  # find the aggregate distance of all windows
  d = float(np.sum(distances))
  vector_shape = flat[0].shape
  print(' * aggregate distance', layer_index, d)
  print(' * dims size', layer_index, vector_shape)
  return [vector_shape[0], d]
  
if False: # disabled by default; flip to True to run the layer sweep
  distances = {}
  shapes = {}
  for i in range(1, 21):
    try:
      shape, distance = process_layer_index(i)
    except Exception as exc:
      # keep sweeping on failure, but say why, and let KeyboardInterrupt through
      print(' * could not process idx', i, '-', repr(exc))
      continue
    distances[i] = distance
    shapes[i] = shape

  # persist the per-layer results keyed by layer index
  with open('analysis-results.json', 'w') as out:
    json.dump({
      'distances': distances,
      'shapes': shapes,
    }, out)

# Vectorize Images

In [None]:
import traceback, os

# sort so that the positional idx baked into the cache filenames is stable across runs
img_paths = sorted(glob.glob('utils/voynichese/images/*.jpg'))
# get_windows only keeps (win, win, 3) crops, so single-channel 'grayscale' input
# produced no windows at all and every image failed; load three channels instead
color_mode = 'rgb'

out_dir = 'npy'
os.makedirs(out_dir, exist_ok=True)

for idx, i in enumerate(img_paths[:3]):
  try:
    # cache file stem: the image path with separators replaced, plus its position
    stem = i.replace('/', '#') + '#' + str(idx)
    windows_path = os.path.join(out_dir, 'windows#' + stem)
    vectors_path = os.path.join(out_dir, 'vectors#' + stem)
    # np.save appends '.npy'; skip images whose two outputs already exist
    if os.path.exists(windows_path + '.npy') and os.path.exists(vectors_path + '.npy'):
      continue
    print(' * processing image', idx+1)
    windows = get_image_windows(i, color_mode=color_mode)
    vectors = model.fit_transform(windows)
    np.save(windows_path, windows)
    np.save(vectors_path, vectors)
  except Exception:
    # best effort: report the failure with its traceback and move on to the next image
    # (traceback.print_exc() returns None, so it must not be passed to print)
    print(' * could not parse', i)
    traceback.print_exc()

# Index Image Vectors

In [None]:
# Three-level lookups keyed as [collection_name][image_idx][vec_idx]:
#   vec_d holds the model vector for a window
#   img_d holds the pixel array of that same window
# The two outer levels are created on first access; the innermost level has no
# default factory, so reading a missing vec_idx raises KeyError.
vec_d = defaultdict(lambda: defaultdict(defaultdict))
img_d = defaultdict(lambda: defaultdict(defaultdict))

# flat index item id -> {'collection', 'img_idx', 'vec_idx'}
idx_to_meta = dict()
# next item id to hand out
counter_idx = 0

# index configuration: vector length and number of trees to build
n_dims, n_trees = 2048, 1000

In [None]:
# Start from a clean slate: a fresh AnnoyIndex is created below, so item ids must
# restart at 0 and stale metadata must go, otherwise re-running this cell leaves the
# new index with unused low ids and idx_to_meta with duplicate entries.
counter_idx = 0
idx_to_meta.clear()

t = AnnoyIndex(n_dims, 'angular')
# sorted so that item ids are assigned in a reproducible order
for vector_path in sorted(glob.glob(os.path.join(out_dir, 'vectors*.npy'))):
  # the windows file sits next to the vectors file and differs only in its prefix
  window_path = vector_path.replace('vectors#', 'windows#')
  vectors = np.load(vector_path)
  windows = np.load(window_path)
  # determine the collection and path index for this path
  s = os.path.basename(vector_path).split('#')
  collection = '#'.join(s[:-1])
  image_idx = int(s[-1].split('.')[0])
  # process the collection - each vec in vectors is a window on the given page
  for vec_idx, vec in enumerate(vectors):
    idx_to_meta[counter_idx] = {'collection': collection, 'img_idx': image_idx, 'vec_idx': vec_idx}
    # store plain lists so the dictionaries can be serialised with json later
    vec_d[collection][image_idx][vec_idx] = vec.tolist()
    img_d[collection][image_idx][vec_idx] = windows[vec_idx].tolist()

    t.add_item(counter_idx, vec_d[collection][image_idx][vec_idx])
    counter_idx += 1

t.build(n_trees)
t.save('voynich.ann')

In [None]:
# Persist the three lookup structures as JSON, one file each, in the same order as before
for _json_path, _payload in (
  ('vec_d.json', vec_d),
  ('img_d.json', img_d),
  ('idx_to_meta.json', idx_to_meta),
):
  with open(_json_path, 'w') as out:
    json.dump(_payload, out)

# Query Index

In [None]:
def image_idx_to_vec(idx):
  '''Look up the stored window image for index item `idx` via its metadata record'''
  meta = idx_to_meta[idx]
  pages = img_d[meta['collection']]
  return pages[meta['img_idx']][meta['vec_idx']]
  
def get_knn(idx, display=True, n=10):
  '''Return the nearest neighbours of index item `idx` from the global index `u`.

  Args:
    idx: item id in the index to query with.
    display: if True, show the query window followed by each neighbour window.
    n: number of items requested from the index; the query item itself is
      removed from the result, so at most n-1 neighbours are returned.

  Returns:
    (knn, distances): two parallel lists holding the neighbour item ids and
    their distances to the query item, in the order the index returned them.
  '''
  ids, dists = u.get_nns_by_item(idx, n, include_distances=True)
  # Drop the query item by id rather than by position: when other items are identical
  # to the query (distance 0) the query is not guaranteed to come back first.
  pairs = [(i, d) for i, d in zip(ids, dists) if i != idx][:max(n-1, 0)]
  knn = [i for i, _ in pairs]
  distances = [d for _, d in pairs]
  if display:
    # the query window first, then its neighbours
    for i in [idx] + knn:
      plt.imshow(image_idx_to_vec(i))
      plt.show()
  return knn, distances

In [None]:
# Open a second handle on the file written by t.save('voynich.ann'); the vector
# length and metric passed here mirror the ones used when the index was built.
u = AnnoyIndex(n_dims, 'angular')
u.load('voynich.ann') # super fast, will just mmap the file

In [None]:
knn, distances = get_knn(122, n=10, display=True)
# Annoy's 'angular' distance is sqrt(2 * (1 - cos)), so the cosine similarity is
# 1 - d^2 / 2 rather than 1 - d
sims = [1 - (d ** 2) / 2 for d in distances]
sims