# Basic Discombibleator

In [3]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

measures = pd.read_csv("data/measures.csv", header = 0, index_col = 0, squeeze=True).to_dict()

In [4]:
measurement_roots = pd.read_csv('data/measurement_roots.csv', header=0, index_col=1, squeeze=True).to_dict()

In [7]:
class Biblical_Measurement:
    
    """This class contains a series of functions that can transform inputs 
    in Ancient Hebrew units into modern Imperial or Metric units.
    
    """
    
    def __init__(self, string, units = "imperial"):
        """Method initializing Biblical_Measurement object.
        
        Args:
            string (str)
            units (str)
        
        Attributes:
            string (str): String of text to be converted into modern measurements.
            arr (arr): Class' string attribute word-tokenized through NLTK.
            units (string): String indicating whether output units will be in 
                Imperial or Metric units. Default is Imperial units.
            measurement_found (bool): Boolean indicating a valid Ancient Hebrew
                measurement has been found in self.arr and can be processed.
            lemmatized (bool): Boolean indicating the lemitization step has been 
                executed.
            nums_converted (bool): Boolean indicating numbers have been converted.
            mw_converted (bool): Boolean indicating measure words have been converted.
            multiword signifiers (arr): Array of words signaling the possible 
                presence of multiword measure words tokenized into multiple items.
            ordinal_times (arr): Array of words indicating a specific time.
            punctuation (arr): List if punctuation to ease detokenizing sentences.
        """
        self.string = string
        self.arr = word_tokenize(self.string)
        self.units = units
        self.measurement_found = False
        self.lemmatized = False
        self.nums_converted = False
        self.mw_converted = False
        self.multiword_signifiers = ["hour", "watch", "journey",
                                    "walk", "cubit", "cubits"]
        self.ordinal_times = ["second", "third", "fourth", "sixth",
                            "seventh", "ninth", "tenth", "eleventh"]
        self.punctuation = [".", ";", ",", "!", "?", "(", ")", ":"]
    
    def Concat_Multiword(self):
        """Method finding and concatenating tokenized multi-word measure words.
        
        Args:
            None
        
        Returns:
            None
        """
        if any(np.intersect1d(self.arr, self.multiword_signifiers)):
            for i, j in enumerate(self.arr):
                if j in self.multiword_signifiers:
                    if self.arr[i-1] in self.ordinal_times:
                        self.arr[i-2:i+1] = [" ".join(self.arr[i-2:i+1])]
                    elif j in ("journey", "walk"):
                        if self.arr[i-2] in ("Sabbath", "sabbath"):
                            self.arr[i-2:i+1] = ["sabbath day's journey"]
                        else:
                            self.arr[i-1:i+1] = ["day's journey"]
                    elif j in ("cubit", "cubits"):
                        if self.arr[i-1] == 'long':
                            self.arr[i-1:i+1] = [" ".join(arr[i-1:i+1])]
        self.tokenized_string = self.arr  
           
    def Has_Measure_Words(self):
        """Method checking whether a valid measurement can be found in the input.
        
        Args:
            None
        
        Returns:
            None    
        """
        arr = self.tokenized_string
        if any(s in arr for s in measurement_roots.keys()): # concatenate keys and values into one list
            self.measurement_found = True 
        elif any(s in arr for s in measurement_roots.values()): # change to assertion
            self.measurement_found = True 
        else:
            print("Measurement to be converted not found in input text:\n{}".format(self.string))
        self.tokenized_string = arr
            
    def Lemmatize_Measure_Words(self):
        """Method lemmatizing measure words in input.
        
        Args:
            None
        
        Returns:
            None    
        """
        arr = self.tokenized_string
        if self.measurement_found: #change to assertion
            for word in set(arr).intersection(measurement_roots.keys()):
                arr[:] = [measurement_roots[word] if x == word else x for x in arr]
            self.lemmatized = True
        self.tokenized_string = arr
    
    def Represents_Int(self, s):
        """Method checking whether an input is a string representation of an integer.
        
        Args:
            s (string): string item to be checked
        
        Returns:
            (bool): whether the input is a string representation of an integer    
        """
        try: 
            int(s)
            return True
        except ValueError:
            return False
        
    def Number_Converter(self, n, measure_word):
        """Method converting number from units in Ancient Hebrew measurements to
        units in modern measurements.
        
        Args:
            n (int): number to be converted
            measure_word (str): Ancient Hebrew units in which n was measured
        
        Returns:
            (str): string of the float of n converted into Imperial or Metric units
        """
        if self.units == "metric":
            n = float(n) * float(measures['metric_multiplier'][measure_word])
        else:
            n = float(n) * float(measures['imperial_multiplier'][measure_word])
        return str(n)
    
    def Measure_Word_Converter(self, word):
        """Method converting measure words into Imperial or Metric measures.
        
        Args:
            word(str): Ancient Hebrew measure word to be converted
        
        Returns:
            (str): string of corresponding metric or imperial measure    
        """
        return (measures[self.units][word])
    
    def Find_Convert_Numbers(self):
        """Method locating and converting relevant numbers.
        
        Args:
            None
        
        Returns:
            None    
        """
        arr = self.tokenized_string
        if self.lemmatized: # turn into assertion
            for i, j in enumerate(arr):
                if j in measurement_roots.values():
                    for unit in arr[:i]: # I don't like that this might look so extensively back.
                        if (i - arr.index(unit)) <= 4:
                            unit_locator = arr.index(unit)
                            if self.Represents_Int(unit):
                                arr[unit_locator] = self.Number_Converter(unit, j)
                            elif unit in ("a", "an", "the", "A", "An", "The"):
                                if(arr.index(j) - unit_locator) in range(2):
                                    arr[unit_locator] = self.Number_Converter(1, j)
            self.nums_converted = True
        self.tokenized_string = arr
            
    def Convert_Measure_Words(self):
        """Method locating and converting relevant numbers.
        
        Args:
            None
        
        Returns:
            None    
        """
        arr = self.tokenized_string
        if self.nums_converted: # turn into assertion
            for i, j in enumerate(arr):
                if j in measurement_roots.values():
                    arr[i] = self.Measure_Word_Converter(j)
        self.mw_converted = True
        self.tokenized_string = arr
                    
    def Join_Elements(self):
        """Method detokenizing array of words into final sentence.
        
        Args:
            None
        
        Returns:
            None    
        """
        arr = self.tokenized_string
        for element in arr:
            output = "".join([" "+i if not i.startswith("'") and i not in self.punctuation else i for i in arr]).strip()
        self.tokenized_string = arr
        return output
            
class Verse:
    
    def __init__(self, string):
        self.string = string
        
    def Return_String(self):
        return self.string    

In [10]:
def input_validator(inputs):
    assert True == True
    assert False ==False
    
def Test_Concat_Multiword(arr):
        assert Concat_Multiword(word_tokenize("At the seventh hour, we ate a long cubit of bread.")) == ["At", "the", "seventh",
                                                 "hour", ",", "we", "ate",
                                                 "a", "long", "cubit", "of",
                                                 "bread", "."]
        

In [11]:
y = Verse("2 drachmae")
y.Return_String()

x = Biblical_Measurement(y.Return_String())
x.Concat_Multiword()
x.Has_Measure_Words()
x.Lemmatize_Measure_Words()
x.Find_Convert_Numbers()
x.Convert_Measure_Words()
x.Join_Elements()

'1.3 USD'

In [223]:
x = Biblical_Measurement("It was 12 cubits tall and we saw it during the ninth hour.")
x.concat_multiword()
x.has_measure_words()
x.lemmatize_measure_words()
x.find_convert_numbers()
x.convert_measure_words()
x.join_elements()

AttributeError: 'Biblical_Measurement' object has no attribute 'concat_multiword'

In [219]:
x = Biblical_Measurement("It was 12 or 13 cubits tall and we saw it during the ninth hour")
x.concat_multiword()
x.has_measure_words()
x.lemmatize_measure_words()
x.find_convert_numbers()
x.convert_measure_words()
x.join_elements()

AttributeError: 'Biblical_Measurement' object has no attribute 'concat_multiword'

In [174]:
x = Biblical_Measurement("The bath of wine cost 12 or 13 talents of gold and a denarius; and we saw it during the ninth hour.")
x.concat_multiword()
x.has_measure_words()
x.lemmatize_measure_words()
x.find_convert_numbers()
x.convert_measure_words()
x.join_elements()

'5.9 gallons of wine cost 900.0 or 975.0 pounds of gold and 3.62 USD; and we saw it during 3:00 PM.'