Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amaiya committed Oct 9, 2020
2 parents e729fa0 + d6862c3 commit 7c945ae
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 3 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.22.2 (2020-10-09)

### New:
- added `extract_noun_phrases` to `textutils`

### Changed:
- `SimpleQA.ask` now accepts an `include_np` parameter. When True, noun phrases extracted from the question are added to the
query used to retrieve documents containing candidate answers.


### Fixed:
- N/A



## 0.22.1 (2020-10-08)

### New:
Expand Down
29 changes: 27 additions & 2 deletions ktrain/text/qa/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def display_answers(answers):
return display(HTML(df.to_html(render_links=True, escape=False)))


def _process_question(question, include_np=False):
    """
    Builds the query string used to search for documents relevant to a question.

    Args:
      question(str): the question text
      include_np(bool): If True, multi-word noun phrases extracted from the question
                        (via TU.extract_noun_phrases) are wrapped in double quotes
                        and appended to the question tokens. If anything goes wrong
                        while doing so (e.g., TextBlob is not installed), a warning
                        is issued and the question is processed as if
                        include_np=False.
                        Default:False
    Returns:
      The result of TU.tokenize(question, join_tokens=True) (presumably a str), or,
      when noun phrases were successfully included, a space-joined str of the
      question tokens followed by the quoted noun phrases.
    """
    if include_np:
        try:
            # only multi-word phrases are quoted and added; single-word phrases are skipped
            quoted_phrases = ['"%s"' % (phrase) for phrase in TU.extract_noun_phrases(question)
                              if len(phrase.split()) > 1]
            q_tokens = TU.tokenize(question, join_tokens=False)
            q_tokens.extend(quoted_phrases)
            return " ".join(q_tokens)
        except Exception as e:
            # Best-effort: catch Exception (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate, and report the actual reason instead of
            # always blaming a missing TextBlob installation.
            import warnings
            warnings.warn('Noun phrase extraction failed (%s), so falling back to include_np=False ' % (e) +
                          'with no extra question processing. '
                          'If TextBlob is not installed: pip install textblob')
    return TU.tokenize(question, join_tokens=True)



class QA(ABC):
"""
Base class for QA
Expand Down Expand Up @@ -167,7 +185,8 @@ def _expand_answer(self, answer):



def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50, rerank_threshold=0.015):
def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50,
rerank_threshold=0.015, include_np=False):
"""
submit question to obtain candidate answers
Expand All @@ -187,14 +206,20 @@ def ask(self, question, batch_size=8, n_docs_considered=10, n_answers=50, rerank
This can help bump the correct answer closer to the top.
default:0.015.
If None, no re-ranking is performed.
include_np(bool): If True, noun phrases will be extracted from question and included
in query that retrieves documents likely to contain candidate answers.
This may be useful if you ask a question about artificial intelligence
and the answers returned pertain just to intelligence, for example.
Note: include_np=True requires textblob be installed.
Default:False
Returns:
list
"""
# locate candidate document contexts
paragraphs = []
refs = []
#doc_results = self.search(question, limit=n_docs_considered)
doc_results = self.search(TU.tokenize(question, join_tokens=True), limit=n_docs_considered)
doc_results = self.search(_process_question(question, include_np=include_np), limit=n_docs_considered)
if not doc_results:
warnings.warn('No documents matched words in question')
return []
Expand Down
26 changes: 26 additions & 0 deletions ktrain/text/textutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,29 @@ def paragraph_tokenize(text, join_sentences=False, lang=None):
if join_sentences: sents = ' '.join(sents)
paragraphs.append(sents)
return paragraphs


def extract_noun_phrases(text):
    """
    extracts noun phrases

    A noun phrase here is a maximal run of consecutive tokens whose part-of-speech
    tag (as reported by TextBlob) starts with 'J' or 'N'. The words 'which' and
    'what' are left out of phrases, and runs that end up empty are not returned.

    Args:
      text(str): text from which to extract noun phrases
    Returns:
      list: noun phrases (space-joined strings) in order of appearance
    Raises:
      ImportError: if textblob is not installed
    """
    try:
        from textblob import TextBlob
    except ImportError as e:
        raise ImportError('extract_noun_phrases require TextBlob: pip install textblob') from e
    stop_words = ('which', 'what')
    np_list = []
    curr_phrase = []
    for word, tag in TextBlob(text).tags:
        if tag.startswith(('J', 'N')):
            if word.lower() not in stop_words:
                curr_phrase.append(word)
            continue
        # a token with any other tag ends the current run
        if curr_phrase:
            np_list.append(" ".join(curr_phrase))
            curr_phrase = []
    # flush a phrase that runs to the end of the text; without this, the final
    # noun phrase is lost whenever no other token follows it
    if curr_phrase:
        np_list.append(" ".join(curr_phrase))
    return np_list

2 changes: 1 addition & 1 deletion ktrain/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.22.1'
__version__ = '0.22.2'
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#'eli5 >= 0.10.0', # forked version used by TextPredictor.explain and ImagePredictor.explain
#'stellargraph>=0.8.2', # forked version used by graph module
#'allennlp', # required for Elmo embeddings since TF2 TF_HUB does not work
#'textblob', # used by textutils.extract_noun_phrases
],
classifiers=[ # Optional
# How mature is this project? Common values are
Expand Down

0 comments on commit 7c945ae

Please sign in to comment.