# The Discombibleator
## Putting Ancient Measurements Into Context

In [719]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

pd.options.display.max_colwidth = 508
asv_corpus = pd.read_csv("data/t_asv.csv")
asv_corpus = asv_corpus[["id", "t"]]

In [720]:
asv_corpus.tail()

Unnamed: 0,id,t
31098,66022017,"And the Spirit and the bride say, Come. And he that heareth, let him say, Come. And he that is athirst, let him come: he that will, let him take the water of life freely."
31099,66022018,"I testify unto every man that heareth the words of the prophecy of this book, if any man shall add unto them, God shall add unto him the plagues which are written in this book:"
31100,66022019,"and if any man shall take away from the words of the book of this prophecy, God shall take away his part from the tree of life, and out of the holy city, which are written in this book."
31101,66022020,"He who testifieth these things saith, Yea: I come quickly. Amen: come, Lord Jesus."
31102,66022021,The grace of the Lord Jesus be with the saints. Amen.


In [721]:
# Code from Andrew at Stack Overflow: https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
def text2int (textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        #numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    if word.startswith(tuple(units)):
                        word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

In [722]:
book_key = pd.read_csv("data/key_english.csv")
book_key = book_key[["b", "n"]]
book_key.tail()

Unnamed: 0,b,n
61,62,1 John
62,63,2 John
63,64,3 John
64,65,Jude
65,66,Revelation


In [1]:
measures = pd.read_csv("data/measures.csv", header = 0, index_col = 0, squeeze=True).to_dict()

NameError: name 'pd' is not defined

In [2]:
measurement_roots = pd.read_csv('data/measurement_roots.csv', header=0, index_col=1, squeeze=True).to_dict()

NameError: name 'pd' is not defined

In [725]:
punctuation = [".", ";", ",", "!", "?", "(", ")", ":"]

In [726]:
"""
class Book:
    
    def __init__(self, name):
        self.book = name
        
class Chapter:
    
    def __init__(self, ch):
        self.ch = ch
        
class Verse:
    
    def _init(self, verse):
        self.verse = verse
"""

'\nclass Book:\n    \n    def __init__(self, name):\n        self.book = name\n        \nclass Chapter:\n    \n    def __init__(self, ch):\n        self.ch = ch\n        \nclass Verse:\n    \n    def _init(self, verse):\n        self.verse = verse\n'

In [727]:
class Unit_id:
    
    def __init__(self, Book, Chapter, Verse):
        self.book = str(Book)
        self.chapter = str(Chapter).zfill(3)
        self.verse = str(Verse).zfill(3)
        self.book_id_row = book_key.loc[book_key['n'] == Book]
        self.book_id = str(int(self.book_id_row['b']))
        self.id_list = [self.book_id, self.chapter, self.verse]
        self.id_concat = ''.join(self.id_list) 

In [844]:
class Id_to_tokenized_list:
    
    def __init__(self, id_number, units = 'imperial'):
        self.id_number = id_number
        self.units = units
        self.raw_text_row = asv_corpus.loc[asv_corpus['id'] == int(self.id_number)]
        self.raw_text = self.raw_text_row['t'].to_string(index = False)
        self.text_list = word_tokenize(self.raw_text)
        self.has_measurements = False
        self.m_root_words = []
        self.datetime = False
        self.has_or = False
        
    def remove_bad_punctuation(self):
        for i, j in enumerate(self.text_list):
            if j == '`':
                if self.text_list[i+2] == "'":
                    del self.text_list[i+2]
                    del self.text_list[i]
        
    def lemmatize_measurements(self): 
        if any(s in self.text_list for s in measurement_roots.keys()): 
            for word in set(self.text_list).intersection(measurement_roots.keys()):
                self.text_list[:] = [measurement_roots[word] if x == word else x for x in self.text_list]
                
    def join_multiword_measurements(self):            
        for i, j in enumerate(self.text_list):
            if j == 'cubit':
                if self.text_list[i-1] == 'long':
                    self.text_list[i-1:i+1] = [" ".join(self.text_list[i-1:i+1])]
            elif j == 'Sabbath':
                if  self.text_list[i+1] == 'day' and (self.text_list[i+3] == 'journey' or 'walk'):
                    self.text_list[i:i+4] = ["sabbath day's journey"]
            elif j == 'journey':
                if self.text_list[i-2] == 'day' or 'days':
                    self.text_list[i-2:i+1] = ["day's journey"]
            elif j == 'hour' or 'watch':
                if self.text_list[i-1] in ["second", "third", "fourth", "sixth",
                                           "seventh", "ninth", "tenth", "eleventh"]:
                    self.text_list[i-2:i+1] = [" ".join(self.text_list[i-2:i+1])]
        self.text_list = self.text_list
    
    def check_for_measurements(self):
        if any(s in self.text_list for s in measurement_roots.values()):
            self.has_measurements = True
        
        
    def convert_numbers(self):
        if any(s in self.text_list for s in measurement_roots.values()):
            self.m_root_words = list(set(self.text_list).intersection(measurement_roots.values()))
            
    def         
            for i, j in enumerate(self.text_list):
                if j in m_root_words:                    
                    if self.text_list[i-1] in ('a', 'an'):
                        self.text_list[i-1] = 1.0
                    elif j not in ("the third hour", "the sixth hour", "the seventh hour",
                           "the ninth hour", "the tenth hour", "the eleventh hour",
                           "the second watch", "the third watch", "the fourth watch"):                        
                        self.text_list[i-1] = float(text2int(self.text_list[i-1]))
                        if self.text_list[i-2] == 'or':
                            self.has_or = True
                            self.text_list[i-3] = float(text2int(self.text_list[i-3]))
                    else:
                        self.datetime = True
                    
                    if not self.datetime:    
                        if self.units == 'metric':
                            self.text_list[i-1] = str(self.text_list[i-1] * float(measures['metric_multiplier'][j]))
                            self.text_list[i] = (measures['metric'][j])
                            if has_or:
                                self.text_list[i-3] = str(self.text_list[i-1] * float(measures['metric_multiplier'][j]))
                        else:
                            self.text_list[i-1] = str(self.text_list[i-1] * float(measures['imperial_multiplier'][j]))
                            self.text_list[i] = (measures['imperial'][j])
                            if has_or:
                                self.text_list[i-3] = str(self.text_list[i-1] * float(measures['imperial_multiplier'][j]))
                    else:
                        self.text_list[i-1] = 'approximately'
                        self.text_list[i] = (measures['metric'][j])
    
    def compile_sentence(self):
        output = "".join([" "+i if not i.startswith("'") and i not in punctuation else i for i in self.text_list]).strip()
        return output

In [845]:
v = Unit_id('Acts', 3, 1)
r = Id_to_tokenized_list(v.id_concat)
r.remove_bad_punctuation()
r.lemmatize_measurements()
r.convert_numbers()
r.compile_sentence()

'Now Peter and John were going up into the temple at the hour of prayer, approximately 3:00 PM.'

In [846]:
v = Unit_id('John', 2, 6)
r = Id_to_tokenized_list(v.id_concat)
r.lemmatize_measurements()
r.convert_numbers()
r.compile_sentence()

TypeError: can't multiply sequence by non-int of type 'float'

In [836]:
v = Unit_id('Jeremiah', 52, 22)
r = Id_to_tokenized_list(v.id_concat)
r.lemmatize_measurements()
r.convert_numbers()
r.compile_sentence()

'And a capital of brass was upon it; and the height of the one capital was 7.5 feet, with network and pomegranates upon the capital round about, all of brass: and the second pillar also had like unto these, and pomegranates.'

In [837]:
v = Unit_id('1 Kings', 19, 4)
r = Id_to_tokenized_list(v.id_concat)
r.lemmatize_measurements()
r.convert_numbers()
r.compile_sentence()

'But he himself went 20.0 miles into the wilderness, and came and sat down under a juniper-tree: and he requested for himself that he might die, and said, It is enough; now, O Jehovah, take away my life; for I am not better than my fathers.'