In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Install a blanket 'ignore' filter so library warnings (including deprecation
# notices) are not printed in cell output
warnings.filterwarnings('ignore')

# Helpers

In [None]:
from collections import defaultdict
from skimage.transform import resize
from skimage.util import montage
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
from math import ceil, floor
from scipy import ndimage
import numpy as np
import os, re, glob, json

from helpers import pages

##
# Show a single page image
##

def get_page_image(page_id):
  '''
  Return a numpy array of a voynichese page image for `page_id`
  @arg str page_id: a page identifier (e.g. f1r); the image is read from
    voynichese/images/<page_id>.jpg, relative to the working directory
  @returns: the decoded image array produced by matplotlib's `imread`
  '''
  image_path = os.path.join('voynichese', 'images', page_id + '.jpg')
  # scipy.ndimage.imread no longer exists in current SciPy releases, so decode
  # the file with matplotlib (already imported by this notebook) instead
  return plt.imread(image_path)

def show_page(page_id, figsize=(6, 14)):
  '''
  Draw the image of one page in a new figure
  @arg str page_id: the page identifier (e.g. f1r)
  @arg tuple figsize: the plot size
  '''
  plt.figure(figsize=figsize)
  page_image = get_page_image(page_id)
  plt.imshow(page_image)

##
# Show frequency of string over course of text
##
  
def plot_string_freq(s, figsize=(14, 1.4), clean=True):
  '''
  Draw a bar chart of how often the string `s` occurs on each page
  @arg str s: the substring to count in each page's text
  @arg tuple figsize: the plot size
  @arg bool clean: forwarded to `get_page_string`; when truthy the page text is
    passed through `clean_string` before counting
  '''
  # NOTE(review): `page_order` is a global that is not defined in the visible
  # cells -- confirm where it comes from before running on a fresh kernel
  counts = []
  for page_id in page_order:
    page_text = get_page_string(page_id, clean=clean)
    counts.append(page_text.count(s))
  positions = list(range(len(page_order)))
  plt.figure(figsize=figsize)
  plt.bar(positions, counts, color='#9ab19d')
  plt.title(s)

def get_page_string(page_id, clean=True):
  '''
  Build the text of page `page_id` (e.g. f1r) as a single string
  @arg str page_id: key to look up in `pages`; an unknown key gives an empty page
  @arg bool clean: when truthy, return the text after `clean_string`
  '''
  # the page's entries in `pages` are joined with single spaces
  raw_text = ' '.join(pages.get(page_id, []))
  return clean_string(raw_text) if clean else raw_text

def flatten(arr):
  '''Remove one level of nesting: return a list of every item of every member of `arr`'''
  flat = []
  for inner in arr:
    flat.extend(inner)
  return flat

def get_words(clean=True, unique=False):
  '''
  Collect the words of every page in `page_order`, splitting page text on `.`
  @arg bool clean: forwarded to `get_page_string`
  @arg bool unique: when truthy return a set of distinct words, otherwise a
    list in page order with repeats kept
  '''
  words = []
  for page_id in page_order:
    for token in get_page_string(page_id, clean=clean).split('.'):
      if token: words.append(token) # skip the empty strings left by adjacent breaks
  return set(words) if unique else words

def clean_string(s):
  '''
  Normalise a voynich transcription string so that `.` is the only word break
  @arg str s: the raw transcription text
  @returns str: the text with `!` and `%` removed, with `-`, `=` and spaces
    turned into `.`, and with non-empty `{...}` groups dropped
  '''
  char_table = str.maketrans({
    '!': None, # needless character
    '%': None, # needless character
    '-': '.',  # line break, assumed to also be a word break
    '=': '.',  # end of line comment
    ' ': '.',  # whitespace
  })
  translated = s.translate(char_table)
  # drop content between {} github.com/viking-sudo-rm/voynich2vec/vms_tokenize.py
  return re.sub(r'\{[^\{\}]+?\}', '', translated)

##
# Show all occurrences of a word
##

def get_word_map():
  '''
  Find all occurrences of a word and return a map from:
    d[word][page_id] = [numpy_array_of_word_image, numpy_array_of_word_image]
  For every page id in `pages` this loads the page image with `get_page_image`
  and parses voynichese/coords/<page_id>.js as JSON holding a `[words, coords]`
  pair. Crops with a zero-length dimension are skipped.
  '''
  word_map = defaultdict(lambda: defaultdict(list))
  for page_id in pages:
    page_image = get_page_image(page_id)
    # only `os` is imported in this notebook (there is no bare `join`), so build
    # the path with os.path.join
    with open(os.path.join('voynichese', 'coords', page_id + '.js')) as f:
      words, coords = json.load(f)
    # each `words` entry has four fields; the first is the word string
    word_list = [word for word, _, _, _ in words]
    # each `coords` entry is (index into word_list, x, y, width, height)
    for word_idx, x, y, w, h in coords:
      cropped = page_image[y:y+h, x:x+w]
      if 0 in cropped.shape: continue # the box fell outside the image or is empty
      word_map[word_list[word_idx]][page_id].append(cropped)
  return word_map

def resize_img(arr, size=(100, 20), anti_aliasing=False):
  '''
  Resize the image array `arr` with skimage's `resize`
  @arg arr: the image array to resize
  @arg tuple size: the target shape handed to `resize`
  @arg bool anti_aliasing: forwarded unchanged to `resize`
  '''
  resized = resize(arr, size, anti_aliasing=anti_aliasing)
  return resized

def show_word_occurrences(word, figsize=(12, 8), grid_shape=None, grayscale=False, skip_verticals=True):
  '''
  Draw one montage holding the cropped images of `word` found in the global `word_map`
  @arg str word: the word whose images should be shown
  @arg tuple figsize: the plot size; a falsy value draws into the current figure
  @arg tuple grid_shape: the number of rows, cols to include in the montage
  @arg bool grayscale: whether to plot words in grayscale
  @arg bool skip_verticals: skip words with vertical orientation
  Raises an Exception if no images remain to plot.
  '''
  images_by_page = word_map.get(word, [])
  imgs = flatten([images_by_page[page_id] for page_id in images_by_page])
  if skip_verticals:
    # keep only crops that are wider than they are tall
    imgs = [img for img in imgs if img.shape[0] < img.shape[1]]
  if not imgs:
    raise Exception(' ! word has no images')
  # every crop is resized to the shape of the first one so they can be stacked
  target_shape = imgs[0].shape
  resized = np.array([resize_img(img, size=target_shape) for img in imgs])
  if figsize:
    plt.figure(figsize=figsize)
  # NOTE(review): newer scikit-image releases appear to replace `multichannel`
  # with `channel_axis` -- confirm against the installed version
  composite = montage(resized, multichannel=True, grid_shape=grid_shape)
  if not grayscale:
    plt.imshow(composite)
    return
  plt.imshow(rgb2gray(composite), cmap=plt.cm.binary)

##
# Label each word occurrence
##

def label_word_occurrences(word):
  '''
  Plot each occurrence of `word` in its own small figure, titled with the page
  id and the occurrence's index on that page
  @arg str word: the word to look up in the global `word_map`; a word with no
    entry plots nothing
  '''
  # look the requested word up with .get (not a hard-coded key, and not [] which
  # would insert an empty entry into the defaultdict for unknown words)
  for page_id, images in word_map.get(word, {}).items():
    for idx, img in enumerate(images):
      plt.figure(figsize=(4,1.4))
      plt.title(page_id + ' ' + str(idx))
      plt.imshow(img)
      plt.show()

# Data Exploration

In [None]:
# Display the image of page f1v at a large figure size
show_page('f1v', figsize=(16,30))

In [None]:
from collections import Counter

# Collect every word of the corpus (cleaned text, repeats kept) and count them
words = get_words()
word_counts = Counter(words)
# Build the word -> page -> cropped images lookup. show_word_occurrences and
# label_word_occurrences read this global, so this cell must run before them.
# get_word_map loads an image for every page in `pages`.
word_map = get_word_map()

# Plot the per-page frequency of each of the five most common words
for word, _ in word_counts.most_common(5):
  plot_string_freq(word)

In [None]:
# Montage of the non-vertical cropped images stored for the word 'daiin', in color
show_word_occurrences('daiin', figsize=(30, 30), grayscale=False)

In [None]:
# Plot individually labelled (page id + index) word images, requesting the word 'daiin'
label_word_occurrences('daiin')