# Basic Discombibleator

In [76]:
import numpy as np
import pandas as pd

# Conversion table keyed by column name, then by biblical unit (e.g.
# measures["imperial"]["cubit"] -> "feet").
# The former `squeeze=True` keyword was removed from read_csv in pandas 2.0.
# This table has several columns, so squeezing never applied to it and the
# keyword is simply dropped.
measures = pd.read_csv("data/measures.csv", header=0, index_col=0).to_dict()
measures

{'imperial': {'finger': 'inches',
  'handbreadth': 'inches',
  'span': 'inches',
  'cubit': 'feet',
  'long cubit': 'inches',
  'fathom': 'feet',
  'reed': 'feet',
  'furlong': 'feet',
  'stadion': 'feet',
  "sabbath day's journey": 'miles',
  "day's journey": 'miles',
  'gerah': 'ounces',
  'bekah': 'ounces',
  'pim': 'ounces',
  'shekel': 'ounces',
  'mina': 'pounds',
  'talent': 'pounds',
  'log': 'pints',
  'kab': 'pints',
  'hin': 'gallons',
  'bath': 'gallons',
  'homer': 'gallons',
  'kor': 'gallons',
  'metretes': 'gallons',
  'omer': 'quarts',
  'seah': 'quarts',
  'ephah': 'gallons',
  'lethech': 'gallons',
  'denarius': 'USD',
  'daric': 'USD',
  'drachma': 'USD',
  'didrachma': 'USD',
  'the third hour': '9:00 AM',
  'the sixth hour': '12:00 noon',
  'the seventh hour': '1:00 PM',
  'the ninth hour': '3:00 PM',
  'the tenth hour': '4:00 PM',
  'the eleventh hour': '5:00 PM',
  'the second watch': 'between 9 PM and midnight',
  'the third watch': 'between midnight and 3 AM',

In [77]:
# Flat lookup from an inflected/alternate unit name to its canonical form
# (e.g. "cubits" -> "cubit"), indexed by the file's second column.
# `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
# resulting single-column frame explicitly yields the same Series, so
# `.to_dict()` still returns a flat {variant: root} mapping.
measurement_roots = (
    pd.read_csv('data/measurement_roots.csv', header=0, index_col=1)
    .squeeze("columns")
    .to_dict()
)
measurement_roots

{'fingers': 'finger',
 'handbreadths': 'handbreadth',
 'spans': 'span',
 'cubits': 'cubit',
 'long cubits': 'long cubit',
 'fathoms': 'fathom',
 'reeds': 'reed',
 'furlongs': 'furlong',
 'stadia': 'stadion',
 "sabbath day's walk": "sabbath day's journey",
 "days' journey": "day's journey",
 'gerahs': 'gerah',
 'bekahs': 'bekah',
 'pims': 'pim',
 'shekels': 'shekel',
 'minas': 'mina',
 'talents': 'talent',
 'logs': 'log',
 'kabs': 'kab',
 'hins': 'hin',
 'baths': 'bath',
 'homers': 'homer',
 'kors': 'kor',
 'firkins': 'metretes',
 'firkin': 'metretes',
 'metretas': 'metretes',
 'metrete': 'metretes',
 'omers': 'omer',
 'seahs': 'seah',
 'ephahs': 'ephah',
 'lethechs': 'lethech',
 'denarii': 'denarius',
 'drachmae': 'drachma',
 'didrachmae': 'didrachma',
 'the third hour': 'the third hour',
 'the sixth hour': 'the sixth hour',
 'the seventh hour': 'the seventh hour',
 'the ninth hour': 'the ninth hour',
 'the tenth hour': 'the tenth hour',
 'the eleventh hour': 'the eleventh hour',
 'the

In [152]:
class Biblical_Measurement:
    """Rewrite biblical units of measure in a sentence as modern equivalents.

    The conversion is a pipeline of small steps that are meant to be called in
    order (or all at once through ``convert``):

    1. ``concat_multiword``        - fuse multi-token units into one token
    2. ``has_measure_words``       - check that a known unit is present
    3. ``lemmatize_measure_words`` - map variants/plurals to canonical names
    4. ``find_convert_numbers``    - scale quantities that precede a unit
    5. ``convert_measure_words``   - swap unit names for modern unit names
    6. ``join_elements``           - rebuild the sentence

    Steps 3-5 silently do nothing unless the preceding step has succeeded.

    The class reads two module-level dictionaries that must exist before the
    methods are called: ``measurement_roots`` ({variant: canonical unit}) and
    ``measures`` ({column: {canonical unit: value}}), where the columns used
    are ``self.units`` plus ``"imperial_multiplier"`` / ``"metric_multiplier"``.

    Parameters
    ----------
    string : str
        The text to convert. It is tokenized by splitting on single spaces.
    units : str, default "imperial"
        Name of the ``measures`` column that holds the target unit names.
        ``"metric"`` selects the metric multiplier; anything else selects the
        imperial multiplier.
    """

    # How many tokens before a measure word are searched for quantities.
    _LOOKBACK_TOKENS = 4
    # Articles that stand for a quantity of one when directly before a unit.
    _ARTICLES = ("a", "an", "the", "A", "An", "The")
    # Words accepted in front of "journey"/"walk" to form "day's journey".
    _DAY_WORDS = ("day's", "days'", "day\u2019s", "days\u2019", "days")

    def __init__(self, string, units = "imperial"):
        self.string = string
        self.tokenized_string = self.string.split(" ")
        self.units = units
        # Progress flags set by the pipeline steps.
        self.measurement_found = False
        self.lemmatized = False
        self.nums_converted = False
        # Final words of the multi-token units handled by concat_multiword.
        self.multiword_signifiers = ["hour", "watch", "journey",
                                    "walk", "cubit", "cubits"]
        self.ordinal_times = ["second", "third", "fourth", "sixth",
                            "seventh", "ninth", "tenth", "eleventh"]

    def concat_multiword(self):
        """Fuse multi-token measures into single tokens, in place.

        Handles "<word> <ordinal> hour/watch" (e.g. "the ninth hour"),
        "Sabbath day's journey/walk", "day's journey" and "long cubit(s)".

        The list is walked with an explicit index so that shrinking it after
        a merge never skips a token, and the look-behind is bounds-checked so
        a signifier at the start of the text cannot wrap around to the end.
        """
        tokens = self.tokenized_string
        i = 0
        while i < len(tokens):
            word = tokens[i]
            if word not in self.multiword_signifiers:
                i += 1
                continue

            prev = tokens[i - 1] if i >= 1 else None
            prev2 = tokens[i - 2] if i >= 2 else None
            start, merged = None, None

            if prev in self.ordinal_times:
                # Include the word before the ordinal (normally "the").
                start = max(i - 2, 0)
                merged = " ".join(tokens[start:i + 1])
            elif word in ("journey", "walk"):
                if prev2 in ("Sabbath", "sabbath"):
                    start, merged = i - 2, "sabbath day's journey"
                elif prev is not None and prev.lower() in self._DAY_WORDS:
                    # Only "day's journey", not e.g. "long journey".
                    start, merged = i - 1, "day's journey"
            elif word in ("cubit", "cubits") and prev == "long":
                start, merged = i - 1, " ".join(tokens[i - 1:i + 1])

            if start is not None:
                tokens[start:i + 1] = [merged]
                i = start  # position of the merged token
            i += 1

    def has_measure_words(self):
        """Set ``measurement_found`` if any token is a known unit name.

        Both the variant spellings (keys of ``measurement_roots``) and the
        canonical names (its values) count. Prints a notice when no unit is
        found; the flag is left untouched in that case.
        """
        known_units = set(measurement_roots.keys()) | set(measurement_roots.values())
        if known_units.intersection(self.tokenized_string):
            self.measurement_found = True
        else:
            print("Measurement to be converted not found in input text:\n{}".format(self.string))

    def lemmatize_measure_words(self):
        """Replace unit variants with their canonical names, in place.

        Does nothing unless ``has_measure_words`` found a unit. Each token is
        looked up once, so the result does not depend on iteration order.
        """
        if self.measurement_found:
            self.tokenized_string[:] = [measurement_roots.get(token, token)
                                        for token in self.tokenized_string]
            self.lemmatized = True

    def Represents_Int(self, s):
        """Return True if ``int(s)`` succeeds, False otherwise."""
        try:
            int(s)
            return True
        except (TypeError, ValueError):
            return False

    def number_converter(self, num, measure_word):
        """Scale ``num`` by the multiplier of ``measure_word``; return a str.

        Uses ``measures['metric_multiplier']`` when ``self.units`` is
        "metric" and ``measures['imperial_multiplier']`` otherwise.
        """
        if self.units == "metric":
            num = float(num) * float(measures['metric_multiplier'][measure_word])
        else:
            num = float(num) * float(measures['imperial_multiplier'][measure_word])
        return str(num)

    def measure_word_converter(self, word):
        """Return the modern unit name for canonical unit ``word``."""
        return (measures[self.units][word])

    def find_convert_numbers(self):
        """Convert the quantities that belong to each measure word, in place.

        For every canonical unit token, the up-to-four tokens before it are
        inspected by position:

        * an integer token is replaced by its converted value, so "12 or 13
          cubits" converts both numbers;
        * an article directly before the unit is replaced by the converted
          value of one unit.

        Working with positions rather than ``list.index`` keeps repeated
        words ("a cubit and a cubit") from being confused with their first
        occurrence. Does nothing unless the tokens have been lemmatized.
        """
        if self.lemmatized:
            tokens = self.tokenized_string
            roots = set(measurement_roots.values())
            for i, measure_word in enumerate(tokens):
                if measure_word not in roots:
                    continue
                for k in range(max(0, i - self._LOOKBACK_TOKENS), i):
                    candidate = tokens[k]
                    if self.Represents_Int(candidate):
                        tokens[k] = self.number_converter(candidate, measure_word)
                    elif candidate in self._ARTICLES and i - k == 1:
                        tokens[k] = self.number_converter(1, measure_word)
            self.nums_converted = True

    def convert_measure_words(self):
        """Swap canonical unit names for modern unit names, in place.

        Does nothing unless ``find_convert_numbers`` has run. The token list
        is printed afterwards either way.
        """
        if self.nums_converted:
            roots = set(measurement_roots.values())
            for i, token in enumerate(self.tokenized_string):
                if token in roots:
                    self.tokenized_string[i] = self.measure_word_converter(token)
        print(self.tokenized_string)

    def join_elements(self):
        """Return the tokens joined by single spaces, stripped at both ends."""
        return " ".join(self.tokenized_string).strip()

    def convert(self):
        """Run every pipeline step in order and return the converted text."""
        self.concat_multiword()
        self.has_measure_words()
        self.lemmatize_measure_words()
        self.find_convert_numbers()
        self.convert_measure_words()
        return self.join_elements()
            
            
                            
                        
            
    
            

In [153]:
# Demo 1: one integer quantity plus a multi-word time expression.
# The steps must run in this order: lemmatize_measure_words,
# find_convert_numbers and convert_measure_words each do nothing unless
# the flag set by the preceding step is True.
x = Biblical_Measurement("It was 12 cubits tall and we saw it during the ninth hour")
x.concat_multiword()         # fuse "the ninth hour" into a single token
x.has_measure_words()        # sets measurement_found
x.lemmatize_measure_words()  # "cubits" -> "cubit"
x.find_convert_numbers()     # scale integers found just before a unit
x.convert_measure_words()    # swap unit names; also prints the token list
x.join_elements()            # last expression: the converted sentence

this worked
['It', 'was', '18.0', 'feet', 'tall', 'and', 'we', 'saw', 'it', 'during', '3:00 PM']


'It was 18.0 feet tall and we saw it during 3:00 PM'

In [154]:
# Demo 2: two alternative quantities ("12 or 13") in front of one unit.
# Both fall inside the four-token look-back window of find_convert_numbers.
x = Biblical_Measurement("It was 12 or 13 cubits tall and we saw it during the ninth hour")
x.concat_multiword()
x.has_measure_words()
x.lemmatize_measure_words()
x.find_convert_numbers()
x.convert_measure_words()
x.join_elements()

this worked
this worked
['It', 'was', '18.0', 'or', '19.5', 'feet', 'tall', 'and', 'we', 'saw', 'it', 'during', '3:00 PM']


'It was 18.0 or 19.5 feet tall and we saw it during 3:00 PM'

In [155]:
# Demo 3: several units in one sentence, including articles ("The bath",
# "a denarius") that find_convert_numbers treats as a quantity of one.
x = Biblical_Measurement("The bath of wine cost 12 or 13 talents of gold and a denarius and we saw it during the ninth hour")
x.concat_multiword()
x.has_measure_words()
x.lemmatize_measure_words()
x.find_convert_numbers()
x.convert_measure_words()
x.join_elements()

we found The
this worked
this worked
we found a
['5.9', 'gallons', 'of', 'wine', 'cost', '900.0', 'or', '975.0', 'pounds', 'of', 'gold', 'and', '3.62', 'USD', 'and', 'we', 'saw', 'it', 'during', '3:00 PM']


'5.9 gallons of wine cost 900.0 or 975.0 pounds of gold and 3.62 USD and we saw it during 3:00 PM'