# Basic Discombibleator

In [160]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# Conversion table keyed by column name, then by measure word (the first CSV
# column is the index). `read_csv(squeeze=True)` was removed in pandas 2.0;
# `.squeeze("columns")` is the documented replacement and leaves a
# multi-column frame untouched, so the resulting dict is the same.
measures = pd.read_csv("data/measures.csv", header=0, index_col=0).squeeze("columns").to_dict()

In [161]:
# Lookup table indexed by the CSV's second column. The class below uses the
# keys as surface forms and the values as their root measure words.
# `read_csv(squeeze=True)` was removed in pandas 2.0; `.squeeze("columns")`
# is the documented replacement and collapses a single remaining column into
# a Series exactly as the old keyword did.
measurement_roots = pd.read_csv('data/measurement_roots.csv', header=0, index_col=1).squeeze("columns").to_dict()

In [206]:
class Biblical_Measurement:
    """Rewrite the biblical measurements found in a sentence as modern units.

    The processing stages are separate methods, each gated on a flag set by
    an earlier stage, and are expected to be called in this order:
    ``Concat_Multiword``, ``Has_Measure_Words``, ``Lemmatize_Measure_Words``,
    ``Find_Convert_Numbers``, ``Convert_Measure_Words``, ``Join_Elements``.

    The methods read the module-level lookup tables ``measurement_roots``
    (surface form -> root measure word) and ``measures`` (column name ->
    {measure word -> value}).
    """

    def __init__(self, string, units="imperial"):
        """Tokenize *string* and initialise the pipeline state.

        Args:
            string: The sentence to convert.
            units: ``"metric"`` selects the metric multipliers; any other
                value selects the imperial ones. The value is also used
                directly as a column key of ``measures`` when the measure
                word itself is replaced.
        """
        self.string = string
        self.tokenized_string = word_tokenize(self.string)
        self.units = units
        # Stage flags: each later stage is a no-op until its flag is set.
        self.measurement_found = False
        self.lemmatized = False
        self.nums_converted = False
        # Words that may be the last token of a multi-token measurement.
        self.multiword_signifiers = ["hour", "watch", "journey",
                                     "walk", "cubit", "cubits"]
        self.ordinal_times = ["second", "third", "fourth", "sixth",
                              "seventh", "ninth", "tenth", "eleventh"]
        # Tokens that are glued to the preceding token when re-joining.
        self.punctuation = [".", ";", ",", "!", "?", "(", ")", ":"]

    def Concat_Multiword(self):
        """Merge multi-token measurements into single tokens, in place.

        Three patterns are merged:

        * ``<token> <ordinal> <signifier>`` (e.g. "the ninth hour") becomes
          one space-joined token;
        * "journey"/"walk" is merged with the preceding token(s) into
          "day's journey" or "sabbath day's journey";
        * "long cubit(s)" becomes one space-joined token.

        A pattern is only applied when enough tokens precede the signifier;
        otherwise the tokens are left untouched rather than letting negative
        indices wrap around to the end of the list.
        """
        tokens = self.tokenized_string
        i = 0
        # An explicit index is used because the list shrinks on every merge;
        # enumerating while mutating would skip the tokens after a merge.
        while i < len(tokens):
            word = tokens[i]
            start = None
            merged = None
            if word in self.multiword_signifiers:
                if i >= 1 and tokens[i - 1] in self.ordinal_times:
                    if i >= 2:
                        start = i - 2
                        merged = " ".join(tokens[start:i + 1])
                elif word in ("journey", "walk"):
                    # NOTE(review): every "journey"/"walk" absorbs the token
                    # before it, whatever that token is. This mirrors the
                    # original logic; confirm it matches how the tokenizer
                    # splits "day's".
                    if i >= 2 and tokens[i - 2] in ("Sabbath", "sabbath"):
                        start = i - 2
                        merged = "sabbath day's journey"
                    elif i >= 1:
                        start = i - 1
                        merged = "day's journey"
                elif word in ("cubit", "cubits"):
                    if i >= 1 and tokens[i - 1] == "long":
                        start = i - 1
                        merged = " ".join(tokens[start:i + 1])
            if merged is None:
                i += 1
            else:
                tokens[start:i + 1] = [merged]
                # Resume right after the merged token.
                i = start + 1

    def Has_Measure_Words(self):
        """Set ``measurement_found`` if any token is a known measure word.

        A token counts if it is either a key or a value of
        ``measurement_roots``. When nothing is found a message is printed
        and the flag is left unchanged.
        """
        known = set(measurement_roots) | set(measurement_roots.values())
        if any(token in known for token in self.tokenized_string):
            self.measurement_found = True
        else:
            print("Measurement to be converted not found in input text:\n{}".format(self.string))

    def Lemmatize_Measure_Words(self):
        """Replace each token that is a ``measurement_roots`` key by its root.

        Does nothing unless ``measurement_found`` is set. The token list is
        updated in place and ``lemmatized`` is set afterwards.
        """
        if not self.measurement_found:
            return
        # Single pass: each token is mapped at most once.
        self.tokenized_string[:] = [measurement_roots.get(token, token)
                                    for token in self.tokenized_string]
        self.lemmatized = True

    def Represents_Int(self, s):
        """Return True if ``int(s)`` succeeds, False on ``ValueError``."""
        try:
            int(s)
        except ValueError:
            return False
        return True

    def Number_Converter(self, num, measure_word):
        """Scale *num* by the multiplier of *measure_word*; return a string.

        The ``'metric_multiplier'`` column of ``measures`` is used when
        ``self.units == "metric"``, ``'imperial_multiplier'`` otherwise.
        """
        if self.units == "metric":
            column = 'metric_multiplier'
        else:
            column = 'imperial_multiplier'
        return str(float(num) * float(measures[column][measure_word]))

    def Measure_Word_Converter(self, word):
        """Return the replacement for *word* from ``measures[self.units]``."""
        return measures[self.units][word]

    def Find_Convert_Numbers(self):
        """Convert the quantities that precede each root measure word.

        Does nothing unless ``lemmatized`` is set. For every token that is a
        value of ``measurement_roots``, the up-to-four tokens before it are
        inspected: integer tokens are replaced by their converted value, and
        an article directly before the measure word is treated as the
        quantity 1. Positions are tracked by index, so repeated tokens (a
        second "the", a repeated number) are each handled where they occur
        instead of resolving to their first occurrence. ``nums_converted``
        is set afterwards.
        """
        if not self.lemmatized:
            return
        tokens = self.tokenized_string
        roots = set(measurement_roots.values())
        articles = ("a", "an", "the", "A", "An", "The")
        for i, measure in enumerate(tokens):
            if measure not in roots:
                continue
            for k in range(max(0, i - 4), i):
                candidate = tokens[k]
                if self.Represents_Int(candidate):
                    # Converted values are float strings such as "18.0",
                    # which int() rejects, so they are not converted again
                    # by a later measure word.
                    tokens[k] = self.Number_Converter(candidate, measure)
                elif k == i - 1 and candidate in articles:
                    tokens[k] = self.Number_Converter(1, measure)
        self.nums_converted = True

    def Convert_Measure_Words(self):
        """Replace every root measure word by its modern unit name.

        Does nothing unless ``nums_converted`` is set.
        """
        if not self.nums_converted:
            return
        roots = set(measurement_roots.values())
        for i, token in enumerate(self.tokenized_string):
            if token in roots:
                self.tokenized_string[i] = self.Measure_Word_Converter(token)

    def Join_Elements(self):
        """Return the tokens joined back into a sentence.

        Tokens are separated by single spaces, except that punctuation and
        tokens starting with an apostrophe attach to the preceding token.
        An empty token list yields an empty string.
        """
        pieces = [
            token if token.startswith("'") or token in self.punctuation
            else " " + token
            for token in self.tokenized_string
        ]
        return "".join(pieces).strip()
            
class Verse:
    """Minimal holder for the raw text of one verse."""

    def __init__(self, string):
        """Keep *string* unchanged on the instance as ``self.string``."""
        self.string = string

    def Return_String(self):
        """Return the text that was passed to the constructor."""
        text = self.string
        return text

In [204]:
# Wrap the sample sentence in a Verse and hand its text to the converter.
y = Verse("It was 15 spans and we saw it during the second watch.")
x = Biblical_Measurement(y.Return_String())

# Run the pipeline stages in their required order.
for stage in (x.Concat_Multiword,
              x.Has_Measure_Words,
              x.Lemmatize_Measure_Words,
              x.Find_Convert_Numbers,
              x.Convert_Measure_Words):
    stage()

# Last expression of the cell: the converted sentence.
x.Join_Elements()

'It was 135.0 inches and we saw it during between 9 PM and midnight.'

In [172]:
# Method names match the Biblical_Measurement class defined in this file;
# the snake_case spellings used here before are not defined on it.
x = Biblical_Measurement("It was 12 cubits tall and we saw it during the ninth hour.")
x.Concat_Multiword()
x.Has_Measure_Words()
x.Lemmatize_Measure_Words()
x.Find_Convert_Numbers()
x.Convert_Measure_Words()
x.Join_Elements()

'It was 18.0 feet tall and we saw it during 3:00 PM.'

In [173]:
# Method names match the Biblical_Measurement class defined in this file;
# the snake_case spellings used here before are not defined on it.
x = Biblical_Measurement("It was 12 or 13 cubits tall and we saw it during the ninth hour")
x.Concat_Multiword()
x.Has_Measure_Words()
x.Lemmatize_Measure_Words()
x.Find_Convert_Numbers()
x.Convert_Measure_Words()
x.Join_Elements()

'It was 18.0 or 19.5 feet tall and we saw it during 3:00 PM'

In [174]:
# Method names match the Biblical_Measurement class defined in this file;
# the snake_case spellings used here before are not defined on it.
x = Biblical_Measurement("The bath of wine cost 12 or 13 talents of gold and a denarius; and we saw it during the ninth hour.")
x.Concat_Multiword()
x.Has_Measure_Words()
x.Lemmatize_Measure_Words()
x.Find_Convert_Numbers()
x.Convert_Measure_Words()
x.Join_Elements()

'5.9 gallons of wine cost 900.0 or 975.0 pounds of gold and 3.62 USD; and we saw it during 3:00 PM.'