Commit

Merge branch 'develop'
amaiya committed Oct 26, 2020
2 parents 030ea3c + ff2928e · commit 9da03c4
Showing 6 changed files with 34 additions and 4 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,19 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.23.1 (2020-10-26)

### New:
- N/A

### Changed:
- N/A


### Fixed:
- Resolved issue in `qa.ask` method occurring with embedding computations when full answer sentences exceed 512 tokens.


## 0.23.0 (2020-10-16)

### New:
6 changes: 5 additions & 1 deletion ktrain/text/data.py
@@ -44,6 +44,7 @@ def texts_from_folder(datadir, classes=None,
This is simply supplied as the categories argument
to sklearn's load_files function.
max_features (int): maximum number of unigrams to consider
Note: This is only used for preprocess_mode='standard'.
maxlen (int): maximum length of tokens in document
ngram_range (int): If > 1, n-grams up to this size are also included (2 = bigrams, 3 = bigrams and trigrams)
train_test_names (list): list of strings representing the subfolder
@@ -160,6 +161,7 @@ def texts_from_csv(train_filepath,
10% of documents in training CSV will be
used for testing/validation.
max_features(int): max num of words to consider in vocabulary
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
@@ -237,7 +239,8 @@ def texts_from_df(train_df,
val_df(dataframe): file path to test dataframe. If not supplied,
10% of documents in training df will be
used for testing/validation.
max_features(int): max num of words to consider in vocabulary
max_features(int): max num of words to consider in vocabulary.
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
@@ -315,6 +318,7 @@ def texts_from_array(x_train, y_train, x_test=None, y_test=None,
class_names (list): list of strings representing class labels
shape should be (num_examples,1) or (num_examples,)
max_features(int): max num of words to consider in vocabulary
Note: This is only used for preprocess_mode='standard'.
maxlen(int): each document can be at most <maxlen> words. 0 is used as padding ID.
ngram_range(int): size of multi-word phrases to consider
e.g., 2 will consider both 1-word phrases and 2-word phrases
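
The notes added to the `texts_from_*` loaders above all make the same point: `max_features` only caps the vocabulary when `preprocess_mode='standard'`; transformer modes rely on a pretrained tokenizer's fixed vocabulary. A minimal sketch of the distinction is below; the column/keyword names `text_column` and `label_columns`, the mode names, and the toy data are assumptions drawn from typical ktrain usage, not from this diff.

```python
# Hedged sketch of the max_features note above (assumed parameter names).
import pandas as pd
from ktrain import text

train_df = pd.DataFrame({
    'text':  ['great movie', 'terrible plot', 'loved it', 'not for me'],
    'label': [1, 0, 1, 0],
})

# preprocess_mode='standard': max_features limits the vocabulary size.
trn, val, preproc = text.texts_from_df(
    train_df, text_column='text', label_columns=['label'],
    preprocess_mode='standard', max_features=20000, maxlen=50)

# preprocess_mode='bert': the pretrained tokenizer supplies the vocabulary,
# so max_features has no effect -- which is what the added note documents.
trn, val, preproc = text.texts_from_df(
    train_df, text_column='text', label_columns=['label'],
    preprocess_mode='bert', max_features=20000, maxlen=50)
```
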
7 changes: 6 additions & 1 deletion ktrain/text/eda.py
@@ -181,6 +181,7 @@ def get_document_topic_distribution(self):
"""
Gets the document-topic distribution.
Each row is a document and each column is a topic.
The output of this method is equivalent to invoking get_doctopics with no arguments.
"""
self._check_build()
return self.doc_topics
@@ -334,7 +335,11 @@ def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
def get_doctopics(self, topic_ids=[], doc_ids=[]):
"""
Returns a topic probability distribution for documents
with primary topic that is one of <topic_ids>
with primary topic that is one of <topic_ids> and with doc_id in <doc_ids>.
If no topic_ids or doc_ids are provided, then topic distributions for all documents
are returned (which is equivalent to the output of get_document_topic_distribution).
Args:
topic_ids(list of ints): list of topic IDs, where each ID is in range(self.n_topics).
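
The two docstring additions above cross-reference each other: with no filters, `get_doctopics` describes the same document-by-topic matrix as `get_document_topic_distribution`. A rough usage sketch follows; `get_topic_model`, its keyword arguments, and `build(..., threshold=...)` are assumptions from typical ktrain usage and do not appear in this diff.

```python
# Hedged sketch of the relationship documented above.
from sklearn.datasets import fetch_20newsgroups
from ktrain import text

# A small real corpus so vocabulary thresholds behave sensibly.
docs = fetch_20newsgroups(subset='train',
                          remove=('headers', 'footers', 'quotes')).data[:2000]

tm = text.get_topic_model(docs, n_topics=10, n_features=5000)  # assumed API
tm.build(docs, threshold=0.25)                                 # assumed API

# One row per document, one column per topic.
full = tm.get_document_topic_distribution()

# Per the added docstrings, calling get_doctopics with no arguments
# yields the same matrix as get_document_topic_distribution.
same = tm.get_doctopics()

# Passing topic_ids restricts rows to documents whose primary topic matches.
subset = tm.get_doctopics(topic_ids=[0])
print(full.shape, same.shape, subset.shape)
```
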
7 changes: 6 additions & 1 deletion ktrain/text/preprocessor.py
@@ -1217,13 +1217,14 @@ def _load_pretrained(self, model_name):



def embed(self, texts, word_level=True):
def embed(self, texts, word_level=True, max_length=512):
"""
get embedding for word, phrase, or sentence
Args:
texts(str|list): word, phrase, or sentence, or a list of them representing a batch
word_level(bool): If True, returns embedding for each token in supplied texts.
If False, returns embedding for each text in texts
max_length(int): maximum number of tokens per text; longer inputs are truncated
Returns:
np.ndarray : embeddings
"""
@@ -1234,12 +1235,16 @@ def embed(self, texts, word_level=True):
for text in texts:
sentences.append(self.tokenizer.tokenize(text))
maxlen = len(max([tokens for tokens in sentences], key=len,)) + 2
if maxlen > max_length: maxlen = max_length # added due to issue #270
sentences = []

all_input_ids = []
all_input_masks = []
for text in texts:
tokens = self.tokenizer.tokenize(text)
if len(tokens) > maxlen - 2:
tokens = tokens[0 : (maxlen - 2)]
sentences.append(tokens)
tokens = [self.tokenizer.cls_token] + tokens + [self.tokenizer.sep_token]
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
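
The change to `embed` above caps the dynamically computed `maxlen` at `max_length=512` and truncates each token list before the special tokens are added, which is what resolves the 512-token `qa.ask` issue noted in the changelog. Below is a standalone sketch of that cap-and-truncate logic using a Hugging Face tokenizer directly; the model name and the padding step at the end are illustrative assumptions, not part of the diff.

```python
# Standalone sketch of the truncation logic added to embed() above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # illustrative model
texts = ['a short sentence', 'a much longer sentence ' * 200]   # 2nd exceeds 512 tokens
max_length = 512

tokenized = [tokenizer.tokenize(t) for t in texts]
maxlen = len(max(tokenized, key=len)) + 2          # +2 for [CLS] and [SEP]
if maxlen > max_length:                            # the cap added in this commit
    maxlen = max_length

all_input_ids, all_input_masks = [], []
for tokens in tokenized:
    tokens = tokens[: maxlen - 2]                  # truncate before adding specials
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Pad to maxlen so the batch is rectangular (assumed to mirror the elided code).
    input_ids += [tokenizer.pad_token_id] * (maxlen - len(input_ids))
    input_mask += [0] * (maxlen - len(input_mask))
    all_input_ids.append(input_ids)
    all_input_masks.append(input_mask)
```
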
3 changes: 3 additions & 0 deletions ktrain/text/qa/core.py
@@ -78,6 +78,9 @@ def search(self, query):
pass

def predict_squad(self, documents, question):
"""
Generates candidate answers to the supplied <question>, using <documents> as contexts.
"""
if isinstance(documents, str): documents = [documents]
sequences = [[question, d] for d in documents]
batch = self.tokenizer.batch_encode_plus(sequences, return_tensors='tf', max_length=512, truncation='only_second', padding=True)
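
`predict_squad` is the extraction step that `qa.ask` ultimately relies on: each document is paired with the question and encoded with `truncation='only_second'` so only the document side is cut at 512 tokens. The sketch below shows that encoding pattern with a SQuAD-tuned Hugging Face model; the model name and the argmax span decoding are illustrative assumptions and assume a recent transformers release.

```python
# Hedged sketch of the (question, document) encoding used by predict_squad above.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

name = 'distilbert-base-cased-distilled-squad'   # illustrative model choice
tokenizer = AutoTokenizer.from_pretrained(name)
model = TFAutoModelForQuestionAnswering.from_pretrained(name)

question = 'What capped the token length?'
documents = ['The commit caps maxlen at 512 tokens before computing embeddings.',
             'Unrelated text about something else entirely.']

# Same pattern as the diff: pair the question with each document and truncate
# only the document side so the question itself is never cut off.
sequences = [[question, d] for d in documents]
batch = tokenizer.batch_encode_plus(
    sequences, return_tensors='tf', max_length=512,
    truncation='only_second', padding=True)

outputs = model(batch)
start = tf.argmax(outputs.start_logits, axis=1).numpy()
end = tf.argmax(outputs.end_logits, axis=1).numpy()
# start/end index the most likely answer span within each question+document pair.
```
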
2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.23.0'
__version__ = '0.23.1'
