Add Lemmatizer.isTagBaseForm and use it in spaCy extensions

bjascob · May 16, 2019 · ca6196e · ca6196e
1 parent f929363
commit ca6196e
Show file tree

Hide file tree

Showing 8 changed files with 51 additions and 434 deletions.
diff --git a/docs/lemmatizer.md b/docs/lemmatizer.md
@@ -7,7 +7,7 @@ The `Lemmatizer` class converts words from their inflected form to their base fo
 ## Examples
 Usage as a library
 ```
-> from lemminflect import getLemma, getAllLemmas, getAllLemmasOOV
+> from lemminflect import getLemma, getAllLemmas, getAllLemmasOOV, isTagBaseForm
 > getLemma('watches', upos='VERB')
 ('watch',)
 
@@ -16,6 +16,9 @@ Usage as a library
 
 > getAllLemmasOOV('xatches', 'NOUN')
 {'NOUN': ('xatch',)}
+
+> isTagBaseForm('JJ')
+True
 ```
 Usage as an extension to spaCy
 ```
@@ -32,7 +35,7 @@ test
 ```
 getLemma(word, upos, lemmatize_oov=True)
 ```
-This methods aggregates `getAllLemmas` and `getAllLemmasOOV`.  It first tries to find the lemma using the dictionary based lookup.  If no forms are available, it then tries to find the lemma using the rules system.
+This method aggregates `getAllLemmas` and `getAllLemmasOOV`.  It first tries to find the lemma using the dictionary-based lookup.  If no forms are available, it then tries to find the lemma using the rules system.  If a Penn tag is available, it is best practice to first call `isTagBaseForm` (below), and only call this function if that returns `False`.  Doing this will eliminate potential errors from lemmatizing a word already in lemma form.
 
 Arguments
 
@@ -62,11 +65,21 @@ Arguments
 * **word:** word to lemmatize
 * **upos:** Universal Dependencies part of speech tag the returned values are limited to
 
+**isTagBaseForm**
+```
+isTagBaseForm(tag)
+```
+Returns `True` if the Penn tag indicates a word that is already in its lemma (base) form, and `False` otherwise.  This is useful since lemmatizing a lemma can lead to errors.  The upos tags used in the above methods don't have enough information to determine this, but the Penn tags do.
+
+Arguments
+
+* **tag:** Penn Treebank tag
+
 **Spacy Extension**
 ```
 Token._.lemma(form_num=0, lemmatize_oov=True, on_empty_ret_word=True)
 ```
-The extension is setup in spaCy automatically when LemmInflect is imported.  The above function defines the method added to `Token`.  Internally spaCy passes the `Token` to a method in `Lemmatizer` which in-turn calls `getLemma` and then returns the specified form number (ie.. the first spelling).
+The extension is set up in spaCy automatically when LemmInflect is imported.  The above function defines the method added to `Token`.  Internally spaCy passes the `Token` to a method in `Lemmatizer` which in turn calls `getLemma` and then returns the specified form number (i.e. the first spelling).  For words whose Penn tag indicates they are already in lemma form, the original word is returned directly.
 
 * **form_num:** When multiple spellings exist, this determines which is returned.  The spellings are ordered from most common to least, as determined by a corpus unigram at the time the dictionary was created.
 * **lemmatize_oov:** Allows the method to use the rules based system for words not in the dictionary

diff --git a/lemminflect/__init__.py b/lemminflect/__init__.py
@@ -17,6 +17,8 @@ def getAllLemmasOOV(word, upos):
 def getLemma(word, upos, lemmatize_oov=True):
     return Lemmatizer().getLemma(word, upos, lemmatize_oov)
 
+def isTagBaseForm(tag):
+    return Lemmatizer.isTagBaseForm(tag)
 
 # Inflections is a singleton so this will only instantiate and load the data
 # (via the default fn) the first time it's called.

diff --git a/lemminflect/core/Inflections.py b/lemminflect/core/Inflections.py
@@ -144,17 +144,17 @@ def _extractForm(forms, tag):
     def spacyGetInfl(self, token, tag, form_num=0, inflect_oov=True, on_empty_ret_word=True):
         # Use LemmInflect lemmatizer
         if self.int_lemma is not None:
-            lemmas = ()
-            upos = tagToUPos(tag)
-            if upos in self.int_lemma.DICT_UPOS_TYPES:
-                lemmas = self.int_lemma.getLemma(token.text, upos, lemmatize_oov=True)
-            if not lemmas:
-                lemma = token.text
-            else:
-                lemma = lemmas[0]   # use the first spelling as the default
+            lemma = self.int_lemma.spacyGetLemma(token)
         # Use Spacy lemmatizer
         else:
             lemma = token.lemma_
+            # handle pronouns.  The isTagBaseForm will force the immediate return
+            # of these types anyway.
+            if lemma == '-PRON-':
+                lemma = token.text
+        # If the requested tag to inflect to is a base form already then the lemma is the inflection
+        if Lemmatizer.isTagBaseForm(tag):
+            return lemma
         # Put the caps style from the word on to the lemma to solve spaCy casing issues with lemmas.
         caps_style = getCapsStyle(token.text)
         lemma = applyCapsStyle(lemma, caps_style)

diff --git a/lemminflect/core/Lemmatizer.py b/lemminflect/core/Lemmatizer.py
@@ -70,17 +70,32 @@ def getLemma(self, word, upos, lemmatize_oov=True):
             return list(lemma_dict.values())[0]  # dict has only 1 value, but the value is a tuple
         assert False, 'More than 1 category value in lemmas: %s' % str(lemma_dict)
 
+    # Look at the Penn tag to see if a word is already in its base form
+    @staticmethod
+    def isTagBaseForm(tag):
+        if tag in ['NNS', 'NNPS']:
+            return False
+        elif len(tag)>2 and tag[:2] in ['JJ', 'RB', 'VB']:
+            return False
+        return True
+
     # Method for extending the spaCy tokens
     # Return the lemma or, if nothing was found, the original word
     def spacyGetLemma(self, token, form_num=0, lemmatize_oov=True, on_empty_ret_word=True):
+        # Don't try to lemmatize words that are already in their base forms
+        if self.isTagBaseForm(token.tag_):
+            return token.text
+        # Get the list of possible lemmas
         lemmas = ()
         if token.pos_ in self.DICT_UPOS_TYPES:
             lemmas = self.getLemma(token.text, token.pos_, lemmatize_oov)
+        # Handle no lemmas returned
         if not lemmas:
             if on_empty_ret_word:
                 return token.text
             else:
                 return None
+        # Or select the correct lemma form number
         elif len(lemmas) > form_num:
             return lemmas[form_num]
         else: