Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Embed quality-scores as HTML tag attributes #358

Merged
merged 14 commits into from
Feb 25, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/tests/common-impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,13 @@ void TestSuite<Service>::qualityEstimatorWords(Ptr<TranslationModel> model) {
std::string source = readFromStdin();
const Response response = bridge_.translate(service_, model, std::move(source), responseOptions);

for (const auto &sentenceQualityEstimate : response.qualityScores) {
for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
const auto &sentenceQualityEstimate = response.qualityScores[sentenceIdx];
std::cout << "[SentenceBegin]\n";

for (const auto &wordByteRange : sentenceQualityEstimate.wordByteRanges) {
for (const auto &wordRange : sentenceQualityEstimate.wordRanges) {
const ByteRange wordByteRange{response.target.wordAsByteRange(sentenceIdx, wordRange.begin).begin,
response.target.wordAsByteRange(sentenceIdx, wordRange.end).begin};
const string_view word(response.target.text.data() + wordByteRange.begin, wordByteRange.size());
std::cout << word << "\n";
}
Expand Down
10 changes: 10 additions & 0 deletions src/translator/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ struct ByteRange {
bool operator==(ByteRange other) const { return begin == other.begin && end == other.end; }
};

/// A Subword range is mechanically the same as a `ByteRange`, but instead of
/// describing a span of bytes, it describes a span of Subword tokens. Using
/// `Annotation.word()` you can switch between the two.
///
/// The range is half-open: `begin` is the index of the first token and `end`
/// is one past the last token, so `size()` is `end - begin`.
struct SubwordRange {
  size_t begin;
  size_t end;
  /// Number of subword tokens covered by this range.
  size_t size() const { return end - begin; }
  /// Two ranges are equal when both their `begin` and `end` match.
  bool operator==(SubwordRange other) const { return begin == other.begin && end == other.end; }
};

class Response;
using CallbackType = std::function<void(Response &&)>;

Expand Down
99 changes: 78 additions & 21 deletions src/translator/html.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#include "html.h"

#include <vector>

#include "response.h"
#include "translator/definitions.h"
#include "xh_scanner.h"

namespace {
Expand Down Expand Up @@ -28,12 +31,12 @@ void encodeEntities(string_view const &input, std::string &output) {
// case ???:
// output.append("&nbsp;");
// break;
// case '"':
// output.append("&quot;");
// break;
// case '\'':
// output.append("&apos;");
// break;
case '"':
output.append("&quot;");
break;
case '\'':
output.append("&apos;");
break;
default:
output.push_back(*it);
break;
Expand Down Expand Up @@ -454,7 +457,12 @@ void HTML::restore(Response &response) {
copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
assert(targetTokenSpans.size() == debugCountTokens(response.target));

AnnotatedText target = restoreTarget(response.target, targetTokenSpans);
// Take the spans, and use them to make a taint for every word in the
// translation. Optionally add extra tags, like quality score metadata.
std::vector<HTML::Taint> targetTokenTags;
annotateTaint(response, targetTokenSpans, targetTokenTags);

AnnotatedText target = restoreTarget(response.target, targetTokenSpans, targetTokenTags);

response.source = source;
response.target = target;
Expand Down Expand Up @@ -500,37 +508,37 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
});
}

AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
auto prevSpan = spans_.cbegin();
AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
std::vector<Taint> const &targetTokenTags) {
auto prevTags = spans_.cbegin()->tags;
auto stragglerSpanIt = spans_.cbegin();
auto targetSpanIt = targetTokenSpans.begin();
auto targetTagIt = targetTokenTags.begin();

AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);

// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
for (; stragglerSpanIt < *targetSpanIt; stragglerSpanIt++) {
// We're only interested in empty spans or spans that would otherwise get
// lost because they didn't align with anything between the spans in
// targetSpanIt
// TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
if (span_it->size() != 0 &&
std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
if (stragglerSpanIt->size() != 0 &&
std::find(targetTokenSpans.begin(), targetTokenSpans.end(), stragglerSpanIt) != targetTokenSpans.end())
continue;

formatter.append(prevSpan->tags, span_it->tags);

// Note: here, not in 3rd part of for-statement because we don't want to
// set prevSpan if the continue clause at the beginning of this for-loop
// was hit.
prevSpan = span_it;
formatter.append(prevTags, stragglerSpanIt->tags);
prevTags = stragglerSpanIt->tags;
}

// Now do the same thing but for our target set of tags. Note that we cannot
// combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
// because there is no guarantee that the order in `targetTokenSpans` is
// the same as that of `spans`.
formatter.append(prevSpan->tags, (*targetSpanIt)->tags);

formatter.append(prevTags, *targetTagIt);

// If this is the last token of the response, close all open tags.
if (last) {
Expand All @@ -539,11 +547,12 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
// the last token of the output. But let's assume someone someday changes
// HardAlignments(), and then this for-loop will be necessary.
// assert((*targetSpanIt)->tags.empty());
formatter.append((*targetSpanIt)->tags, HTML::Taint());
formatter.append(*targetTagIt, HTML::Taint());
}

prevSpan = *targetSpanIt;
prevTags = *targetTagIt;
++targetSpanIt;
++targetTagIt;

return std::move(formatter.html());
});
Expand Down Expand Up @@ -580,6 +589,54 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
targetTokenSpans.push_back(sourceTokenSpans[offset]); // token_tag for ending whitespace
}

/// Builds one tag stack (taint) per target token and appends it to
/// `targetTokenTags`, in the same order as `targetTokenSpans`.
///
/// The spans are consumed as: for every sentence one "prefix" entry followed
/// by one entry per word token of that sentence, and finally a single
/// trailing "suffix" entry after the last sentence. Each emitted taint starts
/// as a copy of the corresponding span's tags.
///
/// If `response.qualityScores` has an entry for a sentence, two kinds of
/// `<font>` tags are appended on top of the copied tags:
///  - one tag per sentence carrying `x-bergamot-sentence-score`, added to
///    every word token of that sentence (not to the prefix entry);
///  - one tag per scored word carrying `x-bergamot-word-score`, added to the
///    tokens in that word's half-open range `[begin, end)`.
///
/// @param response          translation result providing the target text
///                          layout and (optionally) quality scores.
/// @param targetTokenSpans  span assigned to each target token.
/// @param targetTokenTags   output; one taint per entry of `targetTokenSpans`.
void HTML::annotateTaint(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
                         std::vector<HTML::Taint> &targetTokenTags) {
  // Exactly one taint is emitted per span (see the assert at the end), so the
  // final size is known up front.
  targetTokenTags.reserve(targetTokenTags.size() + targetTokenSpans.size());

  auto spanIt = targetTokenSpans.begin();
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    const size_t numWords = response.target.numWords(sentenceIdx);

    // Sentence prefix
    targetTokenTags.push_back((*spanIt)->tags);
    ++spanIt;

    // Offset in targetTokenTags at which this sentence's tags start.
    const size_t tagOffset = targetTokenTags.size();

    // Initially, just copy the span's tags to this token
    for (size_t t = 0; t < numWords; ++t) {
      targetTokenTags.emplace_back((*spanIt)->tags);
      ++spanIt;
    }

    // If we have quality score information for this sentence, add that as
    // metadata as well. Checking the index (rather than only non-emptiness)
    // avoids reading past the end when fewer scores than sentences exist.
    if (sentenceIdx < response.qualityScores.size()) {
      auto const &sentenceQuality = response.qualityScores[sentenceIdx];

      // Create a single <font> tag for this sentence with sentence level info
      Tag *sentenceTag = makeTag({Tag::ELEMENT, "font"});
      sentenceTag->attributes += format(" x-bergamot-sentence-score=\"{}\"", sentenceQuality.sentenceScore);

      // Add that tag to all tokens in this sentence.
      for (size_t tokenIdx = 0; tokenIdx < numWords; ++tokenIdx) {
        targetTokenTags[tagOffset + tokenIdx].push_back(sentenceTag);
      }

      // Add word level <font> tags as well to all tokens that make up a word.
      // Only words that have both a range and a score are annotated.
      for (size_t wordIdx = 0;
           wordIdx < sentenceQuality.wordRanges.size() && wordIdx < sentenceQuality.wordScores.size(); ++wordIdx) {
        Tag *wordTag = makeTag({Tag::ELEMENT, "font"});
        wordTag->attributes += format(" x-bergamot-word-score=\"{}\"", sentenceQuality.wordScores[wordIdx]);
        auto const &range = sentenceQuality.wordRanges[wordIdx];
        // Clamp to this sentence's tokens: only `numWords` taints have been
        // emitted past `tagOffset` at this point.
        for (size_t tokenIdx = range.begin; tokenIdx < range.end && tokenIdx < numWords; ++tokenIdx) {
          targetTokenTags[tagOffset + tokenIdx].push_back(wordTag);
        }
      }
    }
  }

  // Suffix
  targetTokenTags.push_back((*spanIt)->tags);
  ++spanIt;

  assert(spanIt == targetTokenSpans.end());
}

// Reports if token `str` is likely to be a continuation of a word. This is used
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
Expand Down
7 changes: 5 additions & 2 deletions src/translator/html.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,17 @@ class HTML {
void restore(Response &response);

private:
using SpanIterator = std::vector<HTML::Span>::const_iterator;
using SpanIterator = std::vector<HTML::Span>::iterator;
using AnnotatedText = marian::bergamot::AnnotatedText;

AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans,
std::vector<HTML::Taint> const &targetTokenTags);
void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<HTML::SpanIterator> const &sourceTokenSpans,
std::vector<HTML::SpanIterator> &targetTokenSpans);
void annotateTaint(Response const &response, std::vector<SpanIterator> const &targetTokenSpans,
std::vector<HTML::Taint> &targetTokenTags);
void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
bool isContinuation(string_view prev, string_view str);
// Allocates tag in pool_ (which then owns it) and gives a pointer to be used
Expand Down
22 changes: 2 additions & 20 deletions src/translator/quality_estimator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Response::SentenceQualityScore UnsupervisedQualityEstimator::computeSentenceScor
const float sentenceScore =
std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

return {wordScores, subwordToWords(wordIndices, target, sentenceIdx), sentenceScore};
return {wordScores, wordIndices, sentenceScore};
}

LogisticRegressorQualityEstimator::Matrix::Matrix(const size_t rowsParam, const size_t colsParam)
Expand Down Expand Up @@ -160,7 +160,7 @@ Response::SentenceQualityScore LogisticRegressorQualityEstimator::computeSentenc
const float sentenceScore =
std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

return {wordScores, subwordToWords(wordIndices, target, sentenceIdx), sentenceScore};
return {wordScores, wordIndices, sentenceScore};
jelmervdl marked this conversation as resolved.
Show resolved Hide resolved
}

std::vector<float> LogisticRegressorQualityEstimator::predict(const Matrix& features) const {
Expand Down Expand Up @@ -267,22 +267,4 @@ std::vector<SubwordRange> mapWords(const std::vector<float>& logProbs, const Ann
return wordIndices;
}

std::vector<ByteRange> subwordToWords(const std::vector<SubwordRange>& wordIndices, const AnnotatedText& target,
const size_t sentenceIdx) {
std::vector<ByteRange> words;

for (const SubwordRange& wordIndice : wordIndices) {
size_t wordBegin = target.wordAsByteRange(sentenceIdx, wordIndice.begin).begin;
size_t wordEnd = target.wordAsByteRange(sentenceIdx, wordIndice.end).begin;

if (isspace(target.text.at(wordBegin))) {
++wordBegin;
}

words.emplace_back(ByteRange{wordBegin, wordEnd});
}

return words;
}

} // namespace marian::bergamot
12 changes: 0 additions & 12 deletions src/translator/quality_estimator.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ class QualityEstimator {
virtual void computeQualityScores(const Histories &histories, Response &response) const = 0;
};

using SubwordRange = ByteRange;

/// Unsupervised Quality Estimator model. It uses the translator model's log probabilities (log probs) as a proxy for
/// quality scores. Then, for a given word, its quality score is computed by taking the mean of the log probs of the
/// tokens that make it up. The sentence score is the mean of all word's log probs.
Expand Down Expand Up @@ -209,14 +207,4 @@ inline std::shared_ptr<QualityEstimator> createQualityEstimator(const AlignedMem
std::vector<SubwordRange> mapWords(const std::vector<float> &logProbs, const AnnotatedText &target,
const size_t sentenceIdx);

/// Given a vector of subwordRanges, it maps the elements to be real words rather than sublevel tokens. The words are
/// represented through ByteRanges.

/// @param [in] wordIndices: A vector where each element correspond to the index of a real word and its values are
/// represented by the SubwordRanges (which are aliases of ByteRanges) which represents sublevel token positions
/// @param [in] target: AnnotatedText target value
/// @param [in] sentenceIdx: the id of a candidate sentence
std::vector<ByteRange> subwordToWords(const std::vector<SubwordRange> &wordIndices, const AnnotatedText &target,
const size_t sentenceIdx);

} // namespace marian::bergamot
4 changes: 2 additions & 2 deletions src/translator/response.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ struct Response {
struct SentenceQualityScore {
/// Quality score of each translated word
std::vector<float> wordScores;
/// Each word position in the translated text
std::vector<ByteRange> wordByteRanges;
/// Position of start and end token of each word in the translated text
std::vector<SubwordRange> wordRanges;
/// Whole sentence quality score (it is composed by the mean of its words)
float sentenceScore = 0.0;
};
Expand Down
33 changes: 31 additions & 2 deletions wasm/bindings/response_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,46 @@
#include "response.h"

using Response = marian::bergamot::Response;
using SentenceQualityScore = marian::bergamot::Response::SentenceQualityScore;
using ByteRange = marian::bergamot::ByteRange;

/// Binding-side counterpart of Response::SentenceQualityScore: identical
/// except that word positions are exposed as `wordByteRanges` (byte ranges)
/// rather than `wordRanges`.
struct SentenceQualityScore {
  /// One quality score per translated word.
  std::vector<float> wordScores;
  /// Byte range of each word in the translated text.
  std::vector<ByteRange> wordByteRanges;
  /// Quality score for the sentence as a whole (the mean of its word scores).
  float sentenceScore{0.0F};
};

using namespace emscripten;

// Binding code
EMSCRIPTEN_BINDINGS(byte_range) {
value_object<ByteRange>("ByteRange").field("begin", &ByteRange::begin).field("end", &ByteRange::end);
}

std::vector<SentenceQualityScore> getQualityScores(const Response& response) { return response.qualityScores; }
std::vector<SentenceQualityScore> getQualityScores(const Response& response) {
std::vector<SentenceQualityScore> scores;
scores.reserve(response.qualityScores.size());

for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
std::vector<ByteRange> wordByteRanges;
wordByteRanges.reserve(response.qualityScores[sentenceIdx].wordRanges.size());

for (auto&& word : response.qualityScores[sentenceIdx].wordRanges) {
wordByteRanges.emplace_back();
wordByteRanges.back().begin = response.target.wordAsByteRange(sentenceIdx, word.begin).begin;
wordByteRanges.back().end = response.target.wordAsByteRange(sentenceIdx, word.end).begin;
}

scores.emplace_back(SentenceQualityScore{response.qualityScores[sentenceIdx].wordScores, std::move(wordByteRanges),
response.qualityScores[sentenceIdx].sentenceScore});
}

return scores;
}
jelmervdl marked this conversation as resolved.
Show resolved Hide resolved

EMSCRIPTEN_BINDINGS(response) {
class_<Response>("Response")
Expand Down
1 change: 1 addition & 0 deletions wasm/test_page/js/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ const _parseTranslatedTextSentenceQualityScores = (vectorResponse) => {
sentenceQualityScores.push(sentenceQualityScore);
}
result.push(sentenceQualityScores);
vectorSentenceQualityScore.delete();
jelmervdl marked this conversation as resolved.
Show resolved Hide resolved
}
return result;
}
Expand Down