Merge branch 'develop'
amaiya committed Jul 20, 2020
2 parents 9cdbc22 + f1e7f49 commit d2499a5
Showing 4 changed files with 48 additions and 9 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,20 @@ Most recent releases are shown at the top. Each release shows:
- **Changed**: Additional parameters, changes to inputs or outputs, etc
- **Fixed**: Bug fixes that don't change documented behaviour

## 0.18.5 (2020-07-20)

### New:
- N/A

### Changed:
- N/A

### Fixed:
- Changed `qa` module to use 'Auto' classes when loading `QuestionAnswering` models and tokenizers
- Try `from_pt=True` in the `qa` module if the initial model load fails
- Use `get_hf_model_name` in the `qa` module


## 0.18.4 (2020-07-17)

### New:
28 changes: 20 additions & 8 deletions ktrain/text/qa/core.py
@@ -10,8 +10,10 @@
 from whoosh.qparser import QueryParser


-from transformers import TFBertForQuestionAnswering
-from transformers import BertTokenizer
+#from transformers import TFBertForQuestionAnswering
+#from transformers import BertTokenizer
+from transformers import TFAutoModelForQuestionAnswering
+from transformers import AutoTokenizer
 LOWCONF = -10000


@@ -23,8 +25,11 @@ class QA(ABC):
     def __init__(self, bert_squad_model='bert-large-uncased-whole-word-masking-finetuned-squad',
                  bert_emb_model='bert-base-uncased'):
         self.model_name = bert_squad_model
-        self.model = TFBertForQuestionAnswering.from_pretrained(self.model_name)
-        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
+        try:
+            self.model = TFAutoModelForQuestionAnswering.from_pretrained(self.model_name)
+        except:
+            self.model = TFAutoModelForQuestionAnswering.from_pretrained(self.model_name, from_pt=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.maxlen = 512
         self.te = tpp.TransformerEmbedding(bert_emb_model, layers=[-2])

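For context (not part of the diff): the new constructor logic amounts to the standalone sketch below. The checkpoint name is only an example, and the broad `except` mirrors the commit rather than recommended practice; `from_pt=True` tells transformers to convert PyTorch weights when a checkpoint ships no native TensorFlow weights.

```python
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # example checkpoint

# Prefer native TensorFlow weights; fall back to converting PyTorch weights.
try:
    model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
except Exception:
    model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```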
@@ -43,12 +48,19 @@ def predict_squad(self, document, question):
         assert len(segment_ids) == len(input_ids)
         n_ids = len(segment_ids)
         if n_ids < self.maxlen:
-            start_scores, end_scores = self.model(np.array([input_ids]),
-                                                  token_type_ids=np.array([segment_ids]))
+            input_ids = np.array([input_ids])
+            token_type_ids = np.array([segment_ids])
         else:
             #TODO: use different truncation strategies or run multiple inferences
-            start_scores, end_scores = self.model(np.array([input_ids[:self.maxlen]]),
-                                                  token_type_ids=np.array([segment_ids[:self.maxlen]]))
+            input_ids = np.array([input_ids[:self.maxlen]])
+            token_type_ids = np.array([segment_ids[:self.maxlen]])
+
+        # Added from: https://github.com/huggingface/transformers/commit/16ce15ed4bd0865d24a94aa839a44cf0f400ef50
+        if U.get_hf_model_name(self.model_name) in ['xlm', 'roberta', 'distilbert']:
+            start_scores, end_scores = self.model(input_ids)
+        else:
+            start_scores, end_scores = self.model(input_ids, token_type_ids=token_type_ids)
+
         start_scores = start_scores[:,1:-1]
         end_scores = end_scores[:,1:-1]
         answer_start = np.argmax(start_scores)
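Putting the `predict_squad` changes together — truncation to `self.maxlen`, then branching on the model family because xlm, roberta, and distilbert models do not accept `token_type_ids` — a minimal end-to-end sketch against the transformers 3.x API of the time might look like the following. The checkpoint, question, and context are illustrative only, and unlike the real code this omits the `[:,1:-1]` trimming of the `[CLS]`/`[SEP]` positions.

```python
import numpy as np
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

model_name = 'distilbert-base-cased-distilled-squad'  # example checkpoint
model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

question = 'What does ktrain wrap?'
context = 'ktrain is a lightweight wrapper for TensorFlow Keras.'
enc = tokenizer.encode_plus(question, context)

maxlen = 512
input_ids = np.array([enc['input_ids'][:maxlen]])

# distilbert (like xlm and roberta) takes no token_type_ids, hence the branch
if model_name.split('-')[0] in ['xlm', 'roberta', 'distilbert']:
    start_scores, end_scores = model(input_ids)
else:
    token_type_ids = np.array([enc['token_type_ids'][:maxlen]])
    start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)

start, end = int(np.argmax(start_scores)), int(np.argmax(end_scores))
print(tokenizer.decode(enc['input_ids'][start:end + 1]))
```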
13 changes: 13 additions & 0 deletions ktrain/utils.py
@@ -492,3 +492,16 @@ def list2chunks(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


def get_hf_model_name(model_id):
    parts = model_id.split('/')
    if len(parts) == 1:
        model_id = parts[0]
    else:
        model_id = '/'.join(parts[1:])
    if model_id.startswith('xlm-roberta'):
        model_name = 'xlm-roberta'
    else:
        model_name = model_id.split('-')[0]
    return model_name

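Assuming the helper is importable as elsewhere in the codebase (the `qa` module refers to it as `U.get_hf_model_name`), it strips any `namespace/` prefix and keeps the leading family token, special-casing `xlm-roberta`. Traced by hand on a few model ids:

```python
from ktrain import utils as U

U.get_hf_model_name('bert-large-uncased-whole-word-masking-finetuned-squad')  # -> 'bert'
U.get_hf_model_name('distilbert-base-cased-distilled-squad')                  # -> 'distilbert'
U.get_hf_model_name('deepset/roberta-base-squad2')                            # -> 'roberta'
U.get_hf_model_name('xlm-roberta-large')                                      # -> 'xlm-roberta'
```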
2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.18.4'
+__version__ = '0.18.5'
