# Basic Discombibleator

In [1]:
import numpy as np
import pandas as pd

# Conversion tables keyed first by column name (e.g. 'imperial') and then by
# biblical unit, as shown in the output below.
# The former `squeeze=True` keyword was dropped: pandas >= 2.0 no longer accepts
# it in read_csv, and it had no effect here because the file yields more than
# one data column (the result is a dict of columns, not a flat dict).
measures = pd.read_csv("data/measures.csv", header=0, index_col=0).to_dict()
measures

{'imperial': {'finger': 'inches',
  'handbreadth': 'inches',
  'span': 'inches',
  'cubit': 'feet',
  'long cubit': 'inches',
  'fathom': 'feet',
  'reed': 'feet',
  'furlong': 'feet',
  'stadion': 'feet',
  "sabbath day's journey": 'miles',
  "day's journey": 'miles',
  'gerah': 'ounces',
  'bekah': 'ounces',
  'pim': 'ounces',
  'shekel': 'ounces',
  'mina': 'pounds',
  'talent': 'USD',
  'log': 'pints',
  'kab': 'pints',
  'hin': 'gallons',
  'bath': 'gallons',
  'homer': 'gallons',
  'kor': 'gallons',
  'metretes': 'gallons',
  'omer': 'quarts',
  'seah': 'quarts',
  'ephah': 'gallons',
  'lethech': 'gallons',
  'denarius': 'USD',
  'daric': 'USD',
  'drachma': 'USD',
  'didrachma': 'USD',
  'the third hour': '9:00 AM',
  'the sixth hour': '12:00 noon',
  'the seventh hour': '1:00 PM',
  'the ninth hour': '3:00 PM',
  'the tenth hour': '4:00 PM',
  'the eleventh hour': '5:00 PM',
  'the second watch': 'between 9 PM and midnight',
  'the third watch': 'between midnight and 3 AM',
  

In [2]:
# Flat mapping from each surface form that may appear in text (plural or
# variant spelling) to the root unit name used as a key in `measures`.
# DataFrame.squeeze("columns") replaces the `squeeze=True` keyword that was
# removed from read_csv in pandas 2.0; it collapses the single remaining column
# to a Series so that to_dict() yields {surface form: root}.
measurement_roots = (
    pd.read_csv("data/measurement_roots.csv", header=0, index_col=1)
    .squeeze("columns")
    .to_dict()
)
measurement_roots

{'fingers': 'finger',
 'handbreadths': 'handbreadth',
 'spans': 'span',
 'cubits': 'cubit',
 'long cubits': 'long cubit',
 'fathoms': 'fathom',
 'reeds': 'reed',
 'furlongs': 'furlong',
 'stadia': 'stadion',
 "sabbath day's walk": "sabbath day's journey",
 "days' journey": "day's journey",
 'gerahs': 'gerah',
 'bekahs': 'bekah',
 'pims': 'pim',
 'shekels': 'shekel',
 'minas': 'mina',
 'talents': 'talent',
 'logs': 'log',
 'kabs': 'kab',
 'hins': 'hin',
 'baths': 'bath',
 'homers': 'homer',
 'kors': 'kor',
 'firkins': 'metretes',
 'firkin': 'metretes',
 'metretas': 'metretes',
 'metrete': 'metretes',
 'omers': 'omer',
 'seahs': 'seah',
 'ephahs': 'ephah',
 'lethechs': 'lethech',
 'denarii': 'denarius',
 'drachmae': 'drachma',
 'didrachmae': 'didrachma',
 'the third hour': 'the third hour',
 'the sixth hour': 'the sixth hour',
 'the seventh hour': 'the seventh hour',
 'the ninth hour': 'the ninth hour',
 'the tenth hour': 'the tenth hour',
 'the eleventh hour': 'the eleventh hour',
 'the

In [3]:
class Biblical_measurement:
    """Rewrite biblical units of measure in a passage as modern units.

    The conversion is a pipeline of methods meant to be called in order::

        m = Biblical_measurement("the wall was 4 cubits high")
        m.concat_multiword()
        m.has_measure_words()
        m.lemmatize_measure_words()
        m.find_convert_numbers()
        m.convert_measure_words()
        m.join_elements()

    Each stage sets a flag (``measurement_found``, ``lemmatized``,
    ``nums_converted``) that the next stage checks; a stage whose
    precondition is not met silently does nothing.

    The class reads two module-level lookup tables defined earlier in the
    notebook: ``measurement_roots`` (surface form -> root unit name) and
    ``measures`` (column name -> {root unit name -> value}).
    """

    # Words that may be the last token of a multi-word unit name.
    multiword_signifiers = ["hour", "watch", "journey",
                            "walk", "cubit", "cubits"]

    # Ordinals that can precede "hour"/"watch" in a time expression.
    ordinal_times = ["second", "third", "fourth", "sixth",
                     "seventh", "ninth", "tenth", "eleventh"]

    # Possessive forms accepted directly before "journey"/"walk".
    _day_words = ("day's", "days'")

    def __init__(self, string, units="imperial"):
        """Store the text and split it into space-separated tokens.

        Parameters
        ----------
        string : str
            The passage to convert.
        units : str, default "imperial"
            Target unit system; used as a key into ``measures``. The value
            "metric" selects the metric multiplier, anything else selects
            the imperial multiplier.
        """
        self.string = string
        self.tokenized_string = self.string.split(" ")
        self.units = units
        self.measurement_found = False
        self.lemmatized = False
        self.nums_converted = False

    def concat_multiword(self):
        """Merge the tokens of multi-word unit names into single tokens.

        Handles three shapes:

        * ``<word> <ordinal> hour|watch`` -> one token, e.g. "the third hour"
        * ``[sabbath] day's|days' journey|walk`` -> "day's journey" or
          "sabbath day's journey"
        * ``long cubit|cubits`` -> "long cubit" / "long cubits"

        The result is built into a new list, so merging never shifts the
        positions of tokens that are still to be visited, and look-behind
        never wraps around to the end of the list.
        """
        tokens = self.tokenized_string
        if not set(tokens).intersection(self.multiword_signifiers):
            return

        merged = []
        for token in tokens:
            previous = merged[-1] if merged else None
            if token not in self.multiword_signifiers or not isinstance(previous, str):
                merged.append(token)
            elif token in ("hour", "watch") and previous in self.ordinal_times:
                if len(merged) >= 2:
                    # Fold the two preceding tokens and this one together.
                    merged[-2:] = [" ".join(merged[-2:] + [token])]
                else:
                    merged.append(token)
            elif token in ("journey", "walk") and previous.lower() in self._day_words:
                # Only rewrite when the preceding word really is "day's"/"days'",
                # so unrelated phrases such as "a long journey" are left alone.
                if len(merged) >= 2 and merged[-2].lower() == "sabbath":
                    merged[-2:] = ["sabbath day's journey"]
                else:
                    merged[-1:] = ["day's journey"]
            elif token in ("cubit", "cubits") and previous == "long":
                merged[-1:] = [" ".join((previous, token))]
            else:
                merged.append(token)
        self.tokenized_string = merged

    def has_measure_words(self):
        """Set ``measurement_found`` if any token is a known unit name.

        A token counts if it is either a surface form (key) or a root
        (value) in ``measurement_roots``. Prints a notice when nothing is
        found.
        """
        known_words = set(measurement_roots.keys()) | set(measurement_roots.values())
        self.measurement_found = any(
            token in known_words for token in self.tokenized_string
        )
        if not self.measurement_found:
            print("Measurement to be converted not found in input text:\n{}".format(self.string))

    def lemmatize_measure_words(self):
        """Replace every surface form with its root from ``measurement_roots``.

        Does nothing unless ``has_measure_words`` found a unit.
        """
        if self.measurement_found:
            self.tokenized_string[:] = [
                measurement_roots.get(token, token) for token in self.tokenized_string
            ]
            self.lemmatized = True

    @staticmethod
    def Represents_Int(s):
        """Return True if ``int(s)`` succeeds, False otherwise."""
        try:
            int(s)
            return True
        except (TypeError, ValueError):
            return False

    def number_converter(self, num, measure_word):
        """Multiply ``num`` by the multiplier for ``measure_word``.

        Uses ``measures['metric_multiplier']`` when ``self.units`` is
        "metric" and ``measures['imperial_multiplier']`` otherwise.
        Returns a float.
        """
        if self.units == "metric":
            multiplier = measures['metric_multiplier'][measure_word]
        else:
            multiplier = measures['imperial_multiplier'][measure_word]
        return float(num) * float(multiplier)

    def measure_word_converter(self, word):
        """Return the modern unit name for root ``word`` in the target system."""
        return measures[self.units][word]

    def find_convert_numbers(self):
        """Convert integer tokens that precede a unit name.

        For each root unit token, the up-to-three tokens immediately before
        it are inspected and every one that parses as an integer is replaced
        by its converted float value. Does nothing unless the tokens have
        been lemmatized.
        """
        if self.lemmatized:
            roots = set(measurement_roots.values())
            tokens = self.tokenized_string
            for position, token in enumerate(tokens):
                if token not in roots:
                    continue
                for k in range(max(0, position - 3), position):
                    candidate = tokens[k]
                    # Only raw text is eligible: a float left by an earlier
                    # unit's conversion must not be converted a second time.
                    if isinstance(candidate, str) and self.Represents_Int(candidate):
                        tokens[k] = self.number_converter(candidate, token)
            self.nums_converted = True

    def convert_measure_words(self):
        """Replace each root unit token with its modern unit name.

        Does nothing unless ``find_convert_numbers`` has run.
        """
        if self.nums_converted:
            roots = set(measurement_roots.values())
            for position, token in enumerate(self.tokenized_string):
                if token in roots:
                    self.tokenized_string[position] = self.measure_word_converter(token)

    def join_elements(self):
        """Return the tokens joined by single spaces, outer whitespace stripped.

        Tokens are passed through ``str`` because converted numbers are
        stored as floats.
        """
        output = " ".join(str(token) for token in self.tokenized_string).strip()
        return output
            
            
                            
                        
            
    
            