Commit

[no ci] docs
amaiya committed May 11, 2023
1 parent 5ac4c9a commit f1a7613
Showing 6 changed files with 182 additions and 3,240 deletions.
107 changes: 68 additions & 39 deletions docs/text/index.html
@@ -2319,7 +2319,7 @@ <h3>Methods</h3>
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class SimpleQA(QA):
<pre><code class="python">class SimpleQA(ExtractiveQABase):
&#34;&#34;&#34;
SimpleQA: Question-Answering on a list of texts
&#34;&#34;&#34;
@@ -2604,7 +2604,7 @@ <h3>Methods</h3>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.text.qa.core.QA" href="qa/core.html#ktrain.text.qa.core.QA">QA</a></li>
<li><a title="ktrain.text.qa.extractive_qa.ExtractiveQABase" href="qa/extractive_qa.html#ktrain.text.qa.extractive_qa.ExtractiveQABase">ExtractiveQABase</a></li>
<li>abc.ABC</li>
<li><a title="ktrain.torch_base.TorchBase" href="../torch_base.html#ktrain.torch_base.TorchBase">TorchBase</a></li>
</ul>
@@ -2942,11 +2942,11 @@ <h3>Methods</h3>
</dl>
<h3>Inherited members</h3>
<ul class="hlist">
<li><code><b><a title="ktrain.text.qa.core.QA" href="qa/core.html#ktrain.text.qa.core.QA">QA</a></b></code>:
<li><code><b><a title="ktrain.text.qa.extractive_qa.ExtractiveQABase" href="qa/extractive_qa.html#ktrain.text.qa.extractive_qa.ExtractiveQABase">ExtractiveQABase</a></b></code>:
<ul class="hlist">
<li><code><a title="ktrain.text.qa.core.QA.ask" href="qa/core.html#ktrain.text.qa.core.QA.ask">ask</a></code></li>
<li><code><a title="ktrain.text.qa.core.QA.predict_squad" href="qa/core.html#ktrain.text.qa.core.QA.predict_squad">predict_squad</a></code></li>
<li><code><a title="ktrain.text.qa.core.QA.quantize_model" href="../torch_base.html#ktrain.torch_base.TorchBase.quantize_model">quantize_model</a></code></li>
<li><code><a title="ktrain.text.qa.extractive_qa.ExtractiveQABase.ask" href="qa/extractive_qa.html#ktrain.text.qa.extractive_qa.ExtractiveQABase.ask">ask</a></code></li>
<li><code><a title="ktrain.text.qa.extractive_qa.ExtractiveQABase.predict_squad" href="qa/extractive_qa.html#ktrain.text.qa.extractive_qa.ExtractiveQABase.predict_squad">predict_squad</a></code></li>
<li><code><a title="ktrain.text.qa.extractive_qa.ExtractiveQABase.quantize_model" href="../torch_base.html#ktrain.torch_base.TorchBase.quantize_model">quantize_model</a></code></li>
</ul>
</li>
</ul>
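
For orientation, a minimal usage sketch of the inherited ask method follows. It assumes SimpleQA is importable from ktrain.text and that the index-building helpers behave as described in ktrain's question-answering tutorial; the index directory, documents, and question are illustrative placeholders, not anything defined in this commit.

# Hedged sketch of SimpleQA usage (paths, documents, and question are placeholders).
from ktrain.text import SimpleQA  # assumed import location, per ktrain's QA tutorial

INDEXDIR = "/tmp/myindex"                              # hypothetical index location
docs = ["The Cassini spacecraft launched in 1997."]    # illustrative corpus

SimpleQA.initialize_index(INDEXDIR)        # create an empty search index (per tutorial)
SimpleQA.index_from_list(docs, INDEXDIR)   # index the documents
qa = SimpleQA(INDEXDIR)                    # SimpleQA now inherits from ExtractiveQABase
answers = qa.ask("When did Cassini launch?")   # 'ask' is inherited, as listed above
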
@@ -5589,6 +5589,28 @@ <h3>Inherited members</h3>
)
return model

def _reconstruct_word_ids(self, offsets):
&#34;&#34;&#34;
```
Reverse engineer the word_ids.
```
&#34;&#34;&#34;
word_ids = []
last_word_id = -1
last_offset = (-1, -1)
for o in offsets:
if o == (0, 0):
word_ids.append(None)
continue
            # must test whether the start matches the previous offset's start, due to an xlm-roberta quirk with tokens like 070
if o[0] == last_offset[0] or o[0] == last_offset[1]:
word_ids.append(last_word_id)
elif o[0] &gt; last_offset[1]:
last_word_id += 1
word_ids.append(last_word_id)
last_offset = o
return word_ids
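
To make the behavior of _reconstruct_word_ids concrete, here is a self-contained sketch that copies the same logic into a free function and runs it on hand-written offsets; the offsets are illustrative and not the output of any particular tokenizer.

# Standalone sketch of the reconstruction above (illustrative offsets, not real tokenizer output).
def reconstruct_word_ids(offsets):
    word_ids, last_word_id, last_offset = [], -1, (-1, -1)
    for o in offsets:
        if o == (0, 0):                      # special tokens such as [CLS]/[SEP]
            word_ids.append(None)
            continue
        if o[0] == last_offset[0] or o[0] == last_offset[1]:
            word_ids.append(last_word_id)    # continues the previous word
        elif o[0] > last_offset[1]:
            last_word_id += 1                # a character gap starts a new word
            word_ids.append(last_word_id)
        last_offset = o
    return word_ids

# Suppose "score 99.9%" were split into subwords with these character spans:
offsets = [(0, 0), (0, 5), (6, 8), (8, 9), (9, 10), (10, 11), (0, 0)]
print(reconstruct_word_ids(offsets))  # [None, 0, 1, 1, 1, 1, None] -> two whitespace-level words
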

def embed(
self,
texts,
@@ -5599,7 +5621,8 @@
):
&#34;&#34;&#34;
```
get embedding for word, phrase, or sentence
Get embedding for word, phrase, or sentence.

Args:
text(str|list): word, phrase, or sentence or list of them representing a batch
word_level(bool): If True, returns embedding for each token in supplied texts.
@@ -5637,7 +5660,7 @@ <h3>Inherited members</h3>
all_input_ids = []
all_input_masks = []
all_word_ids = []
all_offsets = []
all_offsets = [] # retained but not currently used as of v0.36.1 (#492)
for text in texts:
encoded = self.tokenizer.encode_plus(
text, max_length=maxlen, truncation=True, return_offsets_mapping=True
@@ -5658,7 +5681,16 @@
input_mask.append(0)
all_input_ids.append(input_ids)
all_input_masks.append(input_mask)
all_word_ids.append(encoded.word_ids())
            # Note about Issue #492:
            # deberta includes the preceding space in offset_mapping (https://www.kaggle.com/code/junkoda/be-aware-of-white-space-deberta-roberta)
            # models like bert-base-cased produce word_ids that do not correspond to whitespace tokenization (e.g., &#34;score 99.9%&#34;, &#34;BRUSSELS 1996-08-22&#34;)
            # Therefore, for now, we reconstruct word ids from offset_mapping unless the model is deberta.
word_ids = (
encoded.word_ids()
if &#34;deberta&#34; in self.model_name
else self._reconstruct_word_ids(offsets)
)
all_word_ids.append(word_ids)
all_offsets.append(offsets)

all_input_ids = np.array(all_input_ids)
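
The Issue #492 note above can be checked directly, since a fast tokenizer exposes both word_ids() and offset_mapping. The sketch below only prints the two views side by side for inspection; the model name and text are illustrative, and the transformers package is assumed to be installed.

# Hedged inspection sketch: compare the tokenizer's own word_ids() with the offset mapping.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")   # illustrative fast tokenizer
enc = tok.encode_plus("score 99.9%", return_offsets_mapping=True)

print(tok.convert_ids_to_tokens(enc["input_ids"]))  # subword tokens
print(enc.word_ids())        # word ids assigned by the tokenizer's pre-tokenization
print(enc["offset_mapping"]) # character spans used above to rebuild whitespace-level word ids
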
@@ -5695,29 +5727,22 @@
filtered_embedding = []
raw_embedding = raw_embeddings[i]
subvectors = []
last_offset = (-1, -1)
# subwords = [] # debugging
last_word_id = -1
for j in range(len(all_offsets[i])):
if all_word_ids[i][j] is None:
word_id = all_word_ids[i][j]
if word_id is None:
continue
                # must test whether the start matches the previous offset's start, due to an xlm-roberta quirk with tokens like 070
if (
all_offsets[i][j][0] == last_offset[1]
or all_offsets[i][j][0] == last_offset[0]
):
if word_id == last_word_id:
subvectors.append(raw_embedding[j])
# subwords[-1] += texts[i][all_offsets[i][j][0]:all_offsets[i][j][1]] # debugging
last_offset = all_offsets[i][j]
if all_offsets[i][j][0] &gt; last_offset[1]:
# subwords.append(texts[i][all_offsets[i][j][0]:all_offsets[i][j][1]]) # debugging
if word_id &gt; last_word_id:
if len(subvectors) &gt; 0:
if aggregation_strategy == &#34;average&#34;:
filtered_embedding.append(np.mean(subvectors, axis=0))
else:
filtered_embedding.append(subvectors[0])
subvectors = []
subvectors.append(raw_embedding[j])
last_offset = all_offsets[i][j]
last_word_id = word_id
if len(subvectors) &gt; 0:
if aggregation_strategy == &#34;average&#34;:
filtered_embedding.append(np.mean(subvectors, axis=0))
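
Read on its own, the loop above is a word-level pooling step: per-subword vectors that share a word id are grouped and then reduced according to aggregation_strategy. A compact standalone sketch with toy vectors (not real model output) follows.

import numpy as np

# Toy sketch of the pooling performed above: group subword vectors by word id,
# then keep the first vector of each group or average the group.
def pool_by_word_id(vectors, word_ids, aggregation_strategy="first"):
    groups, current_id = [], None
    for vec, wid in zip(vectors, word_ids):
        if wid is None:                # skip special tokens ([CLS], [SEP], padding)
            continue
        if wid != current_id:
            groups.append([])          # a new word id opens a new group
            current_id = wid
        groups[-1].append(vec)
    if aggregation_strategy == "average":
        return [np.mean(g, axis=0) for g in groups]
    return [g[0] for g in groups]      # "first": keep each word's first subword vector

vectors = np.arange(14.0).reshape(7, 2)        # 7 subword vectors of dimension 2
word_ids = [None, 0, 1, 1, 1, 1, None]         # e.g. two words: "score" and "99.9%"
print(pool_by_word_id(vectors, word_ids, "average"))  # [array([2., 3.]), array([7., 8.])]
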
@@ -5741,7 +5766,8 @@ <h3>Methods</h3>
<span>def <span class="ident">embed</span></span>(<span>self, texts, word_level=True, max_length=512, aggregation_strategy='first', layers=[-2])</span>
</code></dt>
<dd>
<div class="desc"><pre><code>get embedding for word, phrase, or sentence
<div class="desc"><pre><code>Get embedding for word, phrase, or sentence.

Args:
text(str|list): word, phrase, or sentence or list of them representing a batch
word_level(bool): If True, returns embedding for each token in supplied texts.
Expand All @@ -5768,7 +5794,8 @@ <h3>Methods</h3>
):
&#34;&#34;&#34;
```
get embedding for word, phrase, or sentence
Get embedding for word, phrase, or sentence.

Args:
text(str|list): word, phrase, or sentence or list of them representing a batch
word_level(bool): If True, returns embedding for each token in supplied texts.
@@ -5806,7 +5833,7 @@ <h3>Methods</h3>
all_input_ids = []
all_input_masks = []
all_word_ids = []
all_offsets = []
all_offsets = [] # retained but not currently used as of v0.36.1 (#492)
for text in texts:
encoded = self.tokenizer.encode_plus(
text, max_length=maxlen, truncation=True, return_offsets_mapping=True
@@ -5827,7 +5854,16 @@
input_mask.append(0)
all_input_ids.append(input_ids)
all_input_masks.append(input_mask)
all_word_ids.append(encoded.word_ids())
            # Note about Issue #492:
            # deberta includes the preceding space in offset_mapping (https://www.kaggle.com/code/junkoda/be-aware-of-white-space-deberta-roberta)
            # models like bert-base-cased produce word_ids that do not correspond to whitespace tokenization (e.g., &#34;score 99.9%&#34;, &#34;BRUSSELS 1996-08-22&#34;)
            # Therefore, for now, we reconstruct word ids from offset_mapping unless the model is deberta.
word_ids = (
encoded.word_ids()
if &#34;deberta&#34; in self.model_name
else self._reconstruct_word_ids(offsets)
)
all_word_ids.append(word_ids)
all_offsets.append(offsets)

all_input_ids = np.array(all_input_ids)
Expand Down Expand Up @@ -5864,29 +5900,22 @@ <h3>Methods</h3>
filtered_embedding = []
raw_embedding = raw_embeddings[i]
subvectors = []
last_offset = (-1, -1)
# subwords = [] # debugging
last_word_id = -1
for j in range(len(all_offsets[i])):
if all_word_ids[i][j] is None:
word_id = all_word_ids[i][j]
if word_id is None:
continue
                # must test whether the start matches the previous offset's start, due to an xlm-roberta quirk with tokens like 070
if (
all_offsets[i][j][0] == last_offset[1]
or all_offsets[i][j][0] == last_offset[0]
):
if word_id == last_word_id:
subvectors.append(raw_embedding[j])
# subwords[-1] += texts[i][all_offsets[i][j][0]:all_offsets[i][j][1]] # debugging
last_offset = all_offsets[i][j]
if all_offsets[i][j][0] &gt; last_offset[1]:
# subwords.append(texts[i][all_offsets[i][j][0]:all_offsets[i][j][1]]) # debugging
if word_id &gt; last_word_id:
if len(subvectors) &gt; 0:
if aggregation_strategy == &#34;average&#34;:
filtered_embedding.append(np.mean(subvectors, axis=0))
else:
filtered_embedding.append(subvectors[0])
subvectors = []
subvectors.append(raw_embedding[j])
last_offset = all_offsets[i][j]
last_word_id = word_id
if len(subvectors) &gt; 0:
if aggregation_strategy == &#34;average&#34;:
filtered_embedding.append(np.mean(subvectors, axis=0))
