Commit

Merge branch 'develop'
amaiya committed Oct 26, 2020
2 parents 030ea3c + ff2928e · commit 9da03c4
Showing 6 changed files with 34 additions and 4 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,19 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.23.1 (2020-10-26)

### New:
- N/A

### Changed:
- N/A


### Fixed:
- Resolved issue in `qa.ask` method occurring with embedding computations when full answer sentences exceed 512 tokens.


## 0.23.0 (2020-10-16)

### New:
6 changes: 5 additions & 1 deletion ktrain/text/data.py
@@ -44,6 +44,7 @@ def texts_from_folder(datadir, classes=None,
This is simply supplied as the categories argument
to sklearn's load_files function.
max_features (int): maximum number of unigrams to consider
Note: This is only used for preprocess_mode='standard'.
maxlen (int): maximum length of tokens in document
ngram_range (int): If > 1, n-grams up to this size are also included (2 = bigrams, 3 = bigrams and trigrams)
train_test_names (list): list of strings representing the subfolder
@@ -160,6 +161,7 @@ def texts_from_csv(train_filepath,
10% of documents in training CSV will be
used for testing/validation.
max_features(int): max num of words to consider in vocabulary
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
@@ -237,7 +239,8 @@ def texts_from_df(train_df,
val_df(dataframe): file path to test dataframe. If not supplied,
10% of documents in training df will be
used for testing/validation.
max_features(int): max num of words to consider in vocabulary
max_features(int): max num of words to consider in vocabulary.
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
@@ -315,6 +318,7 @@ def texts_from_array(x_train, y_train, x_test=None, y_test=None,
class_names (list): list of strings representing class labels
shape should be (num_examples,1) or (num_examples,)
max_features(int): max num of words to consider in vocabulary
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
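
The notes added to the `texts_from_*` loaders above all make the same point: `max_features` only caps the vocabulary when `preprocess_mode='standard'`; transformer modes rely on a pretrained tokenizer's fixed vocabulary. A minimal sketch of the distinction is below; the column/keyword names `text_column` and `label_columns`, the mode names, and the toy data are assumptions drawn from typical ktrain usage, not from this diff.

```python
# Hedged sketch of the max_features note above (assumed parameter names).
import pandas as pd
from ktrain import text

train_df = pd.DataFrame({
    'text':  ['great movie', 'terrible plot', 'loved it', 'not for me'],
    'label': [1, 0, 1, 0],
})

# preprocess_mode='standard': max_features limits the vocabulary size.
trn, val, preproc = text.texts_from_df(
    train_df, text_column='text', label_columns=['label'],
    preprocess_mode='standard', max_features=20000, maxlen=50)

# preprocess_mode='bert': the pretrained tokenizer supplies the vocabulary,
# so max_features has no effect -- which is what the added note documents.
trn, val, preproc = text.texts_from_df(
    train_df, text_column='text', label_columns=['label'],
    preprocess_mode='bert', max_features=20000, maxlen=50)
```
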
7 changes: 6 additions & 1 deletion ktrain/text/eda.py
@@ -181,6 +181,7 @@ def get_document_topic_distribution(self):
"""
Gets the document-topic distribution.
Each row is a document and each column is a topic.
The output of this method is equivalent to invoking get_doctopics with no arguments.
"""
self._check_build()
return self.doc_topics
@@ -334,7 +335,11 @@ def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
def get_doctopics(self, topic_ids=[], doc_ids=[]):
"""
Returns a topic probability distribution for documents
with primary topic that is one of <topic_ids>
with primary topic that is one of <topic_ids> and with doc_id in <doc_ids>.
If no topic_ids or doc_ids are provided, then topic distributions for all documents
are returned (which is equivalent to the output of get_document_topic_distribution).
Args:
topic_ids(list of ints): list of topic IDs, where each ID is in range(self.n_topics).
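
The two docstring additions above cross-reference each other: with no filters, `get_doctopics` describes the same document-by-topic matrix as `get_document_topic_distribution`. A rough usage sketch follows; `get_topic_model`, its keyword arguments, and `build(..., threshold=...)` are assumptions from typical ktrain usage and do not appear in this diff.

```python
# Hedged sketch of the relationship documented above.
from sklearn.datasets import fetch_20newsgroups
from ktrain import text

# A small real corpus so vocabulary thresholds behave sensibly.
docs = fetch_20newsgroups(subset='train',
                          remove=('headers', 'footers', 'quotes')).data[:2000]

tm = text.get_topic_model(docs, n_topics=10, n_features=5000)  # assumed API
tm.build(docs, threshold=0.25)                                 # assumed API

# One row per document, one column per topic.
full = tm.get_document_topic_distribution()

# Per the added docstrings, calling get_doctopics with no arguments
# yields the same matrix as get_document_topic_distribution.
same = tm.get_doctopics()

# Passing topic_ids restricts rows to documents whose primary topic matches.
subset = tm.get_doctopics(topic_ids=[0])
print(full.shape, same.shape, subset.shape)
```
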
7 changes: 6 additions & 1 deletion ktrain/text/preprocessor.py
@@ -1217,13 +1217,14 @@ def _load_pretrained(self, model_name):



def embed(self, texts, word_level=True):
def embed(self, texts, word_level=True, max_length=512):
"""
get embedding for word, phrase, or sentence
Args:
texts(str|list): word, phrase, or sentence, or a list of them representing a batch
word_level(bool): If True, returns embedding for each token in supplied texts.
If False, returns embedding for each text in texts
max_length(int): maximum number of tokens per text; longer inputs are truncated
Returns:
np.ndarray : embeddings
"""
@@ -1234,12 +1235,16 @@ def embed(self, texts, word_level=True):
for text in texts:
sentences.append(self.tokenizer.tokenize(text))
maxlen = len(max([tokens for tokens in sentences], key=len,)) + 2
if maxlen > max_length: maxlen = max_length # added due to issue #270
sentences = []

all_input_ids = []
all_input_masks = []
for text in texts:
tokens = self.tokenizer.tokenize(text)
if len(tokens) > maxlen - 2:
tokens = tokens[0 : (maxlen - 2)]
sentences.append(tokens)
tokens = [self.tokenizer.cls_token] + tokens + [self.tokenizer.sep_token]
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
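
The change to `embed` above caps the dynamically computed `maxlen` at `max_length=512` and truncates each token list before the special tokens are added, which is what resolves the 512-token `qa.ask` issue noted in the changelog. Below is a standalone sketch of that cap-and-truncate logic using a Hugging Face tokenizer directly; the model name and the padding step at the end are illustrative assumptions, not part of the diff.

```python
# Standalone sketch of the truncation logic added to embed() above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # illustrative model
texts = ['a short sentence', 'a much longer sentence ' * 200]   # 2nd exceeds 512 tokens
max_length = 512

tokenized = [tokenizer.tokenize(t) for t in texts]
maxlen = len(max(tokenized, key=len)) + 2          # +2 for [CLS] and [SEP]
if maxlen > max_length:                            # the cap added in this commit
    maxlen = max_length

all_input_ids, all_input_masks = [], []
for tokens in tokenized:
    tokens = tokens[: maxlen - 2]                  # truncate before adding specials
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Pad to maxlen so the batch is rectangular (assumed to mirror the elided code).
    input_ids += [tokenizer.pad_token_id] * (maxlen - len(input_ids))
    input_mask += [0] * (maxlen - len(input_mask))
    all_input_ids.append(input_ids)
    all_input_masks.append(input_mask)
```
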
3 changes: 3 additions & 0 deletions ktrain/text/qa/core.py
@@ -78,6 +78,9 @@ def search(self, query):
pass

def predict_squad(self, documents, question):
"""
Generates candidate answers to the supplied <question>, using <documents> as contexts.
"""
if isinstance(documents, str): documents = [documents]
sequences = [[question, d] for d in documents]
batch = self.tokenizer.batch_encode_plus(sequences, return_tensors='tf', max_length=512, truncation='only_second', padding=True)
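
`predict_squad` is the extraction step that `qa.ask` ultimately relies on: each document is paired with the question and encoded with `truncation='only_second'` so only the document side is cut at 512 tokens. The sketch below shows that encoding pattern with a SQuAD-tuned Hugging Face model; the model name and the argmax span decoding are illustrative assumptions and assume a recent transformers release.

```python
# Hedged sketch of the (question, document) encoding used by predict_squad above.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

name = 'distilbert-base-cased-distilled-squad'   # illustrative model choice
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFAutoModelForQuestionAnswering.from_pretrained(name)

question = 'What capped the token length?'
documents = ['The commit caps maxlen at 512 tokens before computing embeddings.',
             'Unrelated text about something else entirely.']

# Same pattern as the diff: pair the question with each document and truncate
# only the document side so the question itself is never cut off.
sequences = [[question, d] for d in documents]
batch = tokenizer.batch_encode_plus(
    sequences, return_tensors='tf', max_length=512,
    truncation='only_second', padding=True)

outputs = model(batch)
start = tf.argmax(outputs.start_logits, axis=1).numpy()
end = tf.argmax(outputs.end_logits, axis=1).numpy()
# start/end index the most likely answer span within each question+document pair.
```
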
2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.23.0'
__version__ = '0.23.1'
