Merge branch 'develop'

amaiya · Oct 9, 2020 · 7c945ae · 7c945ae
2 parents e729fa0 + d6862c3
commit 7c945ae
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,21 @@ Most recent releases are shown at the top. Each release shows:
 - **Changed**: Additional parameters, changes to inputs or outputs, etc
 - **Fixed**: Bug fixes that don't change documented behaviour
 
+## 0.22.2 (2020-10-09)
+
+### New:
+- added `extract_noun_phrases` to `textutils`
+
+### Changed
+- `SimpleQA.ask` now includes an `include_np` parameter.  When True, noun phrases extracted from the question are added
+   to the query used to retrieve documents containing candidate answers.
+
+
+### Fixed:
+- N/A
+
+
+
 ## 0.22.1 (2020-10-08)
 
 ### New:

diff --git a/ktrain/text/qa/core.py b/ktrain/text/qa/core.py
@@ -38,6 +38,24 @@ def display_answers(answers):
     return display(HTML(df.to_html(render_links=True, escape=False)))
 
 
+def _process_question(question, include_np=False):
+    """Build the query passed to search: the question's tokens, plus quoted multi-word noun phrases if include_np."""
+    if not include_np:
+        return TU.tokenize(question, join_tokens=True)
+    try:
+        # extract_noun_phrases raises a plain Exception when textblob is missing
+        phrases = TU.extract_noun_phrases(question)
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        import warnings
+        warnings.warn('TextBlob is not currently installed, so falling back to include_np=False with no extra question processing. '+
+                      'To install: pip install textblob')
+        return TU.tokenize(question, join_tokens=True)
+    q_tokens = TU.tokenize(question, join_tokens=False)
+    q_tokens.extend('"%s"' % (p) for p in phrases if len(p.split()) > 1)
+    return " ".join(q_tokens)
+
+
+
 class QA(ABC):
     """
     Base class for QA
@@ -167,7 +185,8 @@ def _expand_answer(self, answer):
 
 
 
-    def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50, rerank_threshold=0.015):
+    def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50, 
+            rerank_threshold=0.015, include_np=False):
         """
         submit question to obtain candidate answers
 
@@ -187,14 +206,20 @@ def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50, rerank
                                  This can help bump the correct answer closer to the top.
                                  default:0.015.
                                  If None, no re-ranking is performed.
+          include_np(bool):  If True, noun phrases will be extracted from question and included
+                             in query that retrieves documents likely to contain candidate answers.
+                             This may be useful if you ask a question about artificial intelligence
+                             and the answers returned pertain just to intelligence, for example.
+                             Note: include_np=True requires textblob be installed.
+                             Default:False
         Returns:
           list
         """
         # locate candidate document contexts
         paragraphs = []
         refs = []
         #doc_results = self.search(question, limit=n_docs_considered)
-        doc_results = self.search(TU.tokenize(question, join_tokens=True), limit=n_docs_considered)
+        doc_results = self.search(_process_question(question, include_np=include_np), limit=n_docs_considered)
         if not doc_results: 
             warnings.warn('No documents matched words in question')
             return []

diff --git a/ktrain/text/textutils.py b/ktrain/text/textutils.py
@@ -344,3 +344,29 @@ def paragraph_tokenize(text, join_sentences=False, lang=None):
         if join_sentences: sents = ' '.join(sents)
         paragraphs.append(sents)
     return paragraphs
+
+
+def extract_noun_phrases(text):
+    """
+    Extract noun phrases from text: each run of consecutive tokens whose
+    TextBlob POS tag starts with 'J' or 'N' becomes one phrase.
+    The words 'which' and 'what' are left out of phrases.
+    Returns a list of str. Raises Exception if textblob is not installed.
+    """
+    try:
+        from textblob import TextBlob
+    except ImportError:
+        raise Exception('extract_noun_phrases require TextBlob: pip install textblob')
+    stop_words = ('which', 'what')
+    np_list, curr_phrase = [], []
+    for word, tag in TextBlob(text).tags:
+        if tag.startswith(('J', 'N')):
+            if word.lower() not in stop_words: curr_phrase.append(word)
+            continue
+        # any other tag closes the current run; runs holding only stop words are skipped
+        if curr_phrase: np_list.append(" ".join(curr_phrase))
+        curr_phrase = []
+    # flush a phrase that runs to the end of the text (previously dropped)
+    if curr_phrase: np_list.append(" ".join(curr_phrase))
+    return np_list
+
diff --git a/ktrain/version.py b/ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.22.1'
+__version__ = '0.22.2'
diff --git a/setup.py b/setup.py
@@ -50,6 +50,7 @@
           #'eli5 >= 0.10.0', # forked version used by TextPredictor.explain and ImagePredictor.explain
           #'stellargraph>=0.8.2', # forked version used by graph module
           #'allennlp', # required for Elmo embeddings since TF2 TF_HUB does not work
+          #'textblob', # used by textutils.extract_noun_phrases
       ],
   classifiers=[  # Optional
     # How mature is this project? Common values are