# The Discombibleator
## Putting Ancient Measurements Into Context

In [399]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

pd.options.display.max_colwidth = 508
asv_corpus = pd.read_csv("data/t_asv.csv")
asv_corpus = asv_corpus[["id", "t"]]

ModuleNotFoundError: No module named 'nltk.tokenize.moses'

In [382]:
asv_corpus.tail()

Unnamed: 0,id,t
31098,66022017,"And the Spirit and the bride say, Come. And he that heareth, let him say, Come. And he that is athirst, let him come: he that will, let him take the water of life freely."
31099,66022018,"I testify unto every man that heareth the words of the prophecy of this book, if any man shall add unto them, God shall add unto him the plagues which are written in this book:"
31100,66022019,"and if any man shall take away from the words of the book of this prophecy, God shall take away his part from the tree of life, and out of the holy city, which are written in this book."
31101,66022020,"He who testifieth these things saith, Yea: I come quickly. Amen: come, Lord Jesus."
31102,66022021,The grace of the Lord Jesus be with the saints. Amen.


In [383]:
# Code from Andrew at Stack Overflow: https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
def text2int (textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        #numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

In [384]:
book_key = pd.read_csv("data/key_english.csv")
book_key = book_key[["b", "n"]]
book_key.tail()

Unnamed: 0,b,n
61,62,1 John
62,63,2 John
63,64,3 John
64,65,Jude
65,66,Revelation


In [385]:
measures = pd.read_csv("data/measures.csv", header = 0, index_col = 0, squeeze=True).to_dict()
measures

{'imperial': {'finger': 'inches',
  'handbreadth': 'inches',
  'span': 'inches',
  'cubit': 'inches',
  'long cubit': 'inches',
  'fathom': 'feet',
  'reed': 'feet',
  'furlong': 'feet',
  'stadion': 'feet',
  "sabbath day's journey": 'miles',
  "day's journey": 'miles',
  'gerah': 'ounces',
  'bekah': 'ounces',
  'pim': 'ounces',
  'shekel': 'ounces',
  'mina': 'pounds',
  'talent': 'USD',
  'log': 'pints',
  'kab': 'pints',
  'hin': 'gallons',
  'bath': 'gallons',
  'homer': 'gallons',
  'kor': 'gallons',
  'metretes': 'gallons',
  'omer': 'quarts',
  'seah': 'quarts',
  'ephah': 'gallons',
  'lethech': 'gallons',
  'denarius': 'USD',
  'drachma': 'USD',
  'didrachma': 'USD',
  'the third hour': 'AM',
  'the sixth hour': 'noon',
  'the seventh hour': 'PM',
  'the ninth hour': 'PM',
  'the tenth hour': 'PM',
  'the eleventh hour': 'PM',
  'the second watch': 'midnight',
  'the third watch': '3:00 AM',
  'the fourth watch': '6:00 AM'},
 'imperial_multiplier': {'finger': '0.73',
  'hand

In [386]:
measurement_roots = pd.read_csv('data/measurement_roots.csv', header=None, index_col=1, squeeze=True).to_dict()
measurement_roots

{'alternate': 'root',
 'fingers': 'finger',
 'handbreadths': 'handbreadth',
 'spans': 'span',
 'cubits': 'cubit',
 'long cubits': 'long cubit',
 'fathoms': 'fathom',
 'reeds': 'reed',
 'furlongs': 'furlong',
 'stadia': 'stadion',
 "sabbath day's walk": "sabbath day's journey",
 "days' journey": "day's journey",
 'gerahs': 'gerah',
 'bekahs': 'bekah',
 'pims': 'pim',
 'shekels': 'shekel',
 'minas': 'mina',
 'talents': 'talent',
 'logs': 'log',
 'kabs': 'kab',
 'hins': 'hin',
 'baths': 'bath',
 'homers': 'homer',
 'kors': 'kor',
 'metretas': 'metretes',
 'metrete': 'metretes',
 'omers': 'omer',
 'seahs': 'seah',
 'ephahs': 'ephah',
 'lethechs': 'lethech',
 'denarii': 'denarius',
 'drachmae': 'drachma',
 'didrachmae': 'didrachma',
 'the third hour': 'the third hour',
 'the sixth hour': 'the sixth hour',
 'the seventh hour': 'the seventh hour',
 'the ninth hour': 'the ninth hour',
 'the tenth hour': 'the tenth hour',
 'the eleventh hour': 'the eleventh hour',
 'the second watch': 'the seco

In [387]:
class Book:
    
    def __init__(self, name):
        self.book = name

In [388]:
class Chapter:
    
    def __init__(self, ch):
        self.ch = ch
        
class Verse:
    
    def _init(self, verse):
        self.verse = verse

In [389]:
class Unit_id:
    
    def __init__(self, Book, Chapter, Verse):
        self.book = str(Book)
        self.chapter = str(Chapter).zfill(3)
        self.verse = str(Verse).zfill(3)
        self.book_id_row = book_key.loc[book_key['n'] == Book]
        self.book_id = str(int(self.book_id_row['b']))
        self.id_list = [self.book_id, self.chapter, self.verse]
        self.id_concat = ''.join(self.id_list) 

In [390]:
class Id_to_raw_text:
    
    def __init__(self, id_number):
        self.raw_text_row = asv_corpus.loc[asv_corpus['id'] == int(id_number)]
        self.raw_text = text2int(self.raw_text_row['t'].to_string(index = False))
        self.text_list = word_tokenize(self.raw_text)
        

In [391]:
v = Unit_id('Jeremiah', 52, 21)
r = Id_to_raw_text(v.id_concat)
t = r.raw_text
l = r.text_list
l

['And',
 'as',
 'for',
 'the',
 'pillars',
 ',',
 'the',
 'height',
 'of',
 'the',
 '1',
 'pillar',
 'was',
 '18',
 'cubits',
 ';',
 'and',
 'a',
 'line',
 'of',
 '12',
 'cubits',
 'did',
 'compass',
 'it',
 ';',
 'and',
 'the',
 'thickness',
 'thereof',
 'was',
 '4',
 'fingers',
 ':',
 'it',
 'was',
 'hollow',
 '.']

In [392]:
def pull_out_measurements(tokenized_words): 
    if any(s in tokenized_words for s in measurement_roots.keys()):
        m_words = set(tokenized_words).intersection(measurement_roots.keys())
        for word in m_words:
            tokenized_words[:] = [measurement_roots[word] if x == word else x for x in tokenized_words]

In [393]:
pull_out_measurements(l)
l

['And',
 'as',
 'for',
 'the',
 'pillars',
 ',',
 'the',
 'height',
 'of',
 'the',
 '1',
 'pillar',
 'was',
 '18',
 'cubit',
 ';',
 'and',
 'a',
 'line',
 'of',
 '12',
 'cubit',
 'did',
 'compass',
 'it',
 ';',
 'and',
 'the',
 'thickness',
 'thereof',
 'was',
 '4',
 'finger',
 ':',
 'it',
 'was',
 'hollow',
 '.']

In [394]:
def convert_numbers(tokenized_words):
    if any(s in tokenized_words for s in measurement_roots.values()):
        m_root_words = list(set(tokenized_words).intersection(measurement_roots.values()))
        print(m_root_words)
        for i, j in enumerate(tokenized_words):
            if j in m_root_words:
                bib_num = int(tokenized_words[i-1])
                tokenized_words[i-1] = str(bib_num * float(measures['imperial_multiplier'][j]))
                tokenized_words[i] = (measures['imperial'][j])
    return tokenized_words

In [407]:
convert_numbers(l)
print(l)

['And', 'as', 'for', 'the', 'pillars', ',', 'the', 'height', 'of', 'the', '1', 'pillar', 'was', '324.0', 'inches', ';', 'and', 'a', 'line', 'of', '216.0', 'inches', 'did', 'compass', 'it', ';', 'and', 'the', 'thickness', 'thereof', 'was', '2.92', 'inches', ':', 'it', 'was', 'hollow', '.']


In [425]:
punctuation = [".", ";", ",", "!", "?", "(", ")", ":"]

def compile_sentence(tokenized_words):
    output = "".join([" "+i if not i.startswith("'") and i not in punctuation else i for i in tokenized_words]).strip()
    return output

In [428]:
m = compile_sentence(l)
print(m)

And as for the pillars, the height of the 1 pillar was 324.0 inches; and a line of 216.0 inches did compass it; and the thickness thereof was 2.92 inches: it was hollow.
