Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(evaluate): add signwriting evaluation metrics #1104

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [3.1.38]

### Added

- Added support for [signwriting-evaluation](https://github.com/sign-language-processing/signwriting-evaluation) to
allow evaluating SignWriting text translation outputs.

## [3.1.37]

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion sockeye/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '3.1.37'
__version__ = '3.1.38'
37 changes: 22 additions & 15 deletions sockeye/checkpoint_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,25 +181,32 @@ def decode_and_evaluate(self, output_name: Optional[str] = None) -> Dict[str, fl
self.model.train(original_mode)

# 2. Evaluate

metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations[0],
references=self.targets_sentences[0],
hypotheses = translations[0]
references = self.targets_sentences[0]
metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=hypotheses, references=references,
offset=evaluate.DEFAULT_OFFSET),
C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.TER: evaluate.raw_corpus_ter(hypotheses=translations[0],
references=self.targets_sentences[0]),
C.CHRF: evaluate.raw_corpus_chrf(hypotheses=hypotheses, references=references),
C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=hypotheses, references=references),
C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=hypotheses, references=references),
C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=hypotheses, references=references),
C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=hypotheses, references=references),
C.TER: evaluate.raw_corpus_ter(hypotheses=hypotheses, references=references),
C.AVG_TIME: avg_time,
C.DECODING_TIME: trans_wall_time}

# Add SignWriting Evaluation Metrics if the module is available
try:
import signwriting_evaluation
metrics.update({
C.SIGNWRITING_CLIP: evaluate.raw_corpus_signwriting_clip(
hypotheses_factors=translations,
references_factors=self.targets_sentences),
C.SIGNWRITING_SIMILARITY: evaluate.raw_corpus_signwriting_similarity(
hypotheses_factors=translations,
references_factors=self.targets_sentences)})
except ModuleNotFoundError:
pass

if len(translations) > 1: # metrics for other target factors
for i, _ in enumerate(translations[1:], 1):
# only BLEU
Expand Down
14 changes: 9 additions & 5 deletions sockeye/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,15 +261,19 @@
ROUGEL = 'rougel'
BOW_PERPLEXITY = 'bow-perplexity'
TER = 'ter'
SIGNWRITING_CLIP = 'signwriting-clip'
SIGNWRITING_SIMILARITY = 'signwriting-similarity'
LENRATIO = 'length-ratio-mse'
AVG_TIME = "avg-sec-per-sent"
DECODING_TIME = "decode-walltime"
METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER]
METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER,
SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False, LENRATIO_MSE: False,
TER: False, BOW_PERPLEXITY: False}
METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf, TER: np.inf}
METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
TER: False, BOW_PERPLEXITY: False, SIGNWRITING_CLIP: True, SIGNWRITING_SIMILARITY: True}
METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf,
TER: np.inf, SIGNWRITING_CLIP: -1.0, SIGNWRITING_SIMILARITY: 0.0}
METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]

# loss
CROSS_ENTROPY = 'cross-entropy'
Expand Down
71 changes: 70 additions & 1 deletion sockeye/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import logging
import sys
from collections import defaultdict
from functools import partial
from functools import partial, lru_cache
from typing import Callable, Iterable, Dict, List, Tuple, Optional

import numpy as np
Expand Down Expand Up @@ -118,6 +118,71 @@ def raw_corpus_length_ratio(hypotheses: Iterable[str], references: Iterable[str]
return sum(ratios)/len(ratios) if len(ratios) else 0.0


def serialize_factors(factors: List[Iterable[str]]) -> Iterable[str]:
    """
    Interleave parallel factor streams into one token string per sentence.

    For every sentence, the first token of each factor stream is emitted in stream order, then the second token
    of each stream, and so on. Tokens are obtained by stripping the sentence and splitting it on single spaces.
    This is a generator: sentences are produced lazily and the result can be iterated only once.

    :param factors: One iterable of sentences per factor stream. Streams are consumed in lockstep.
    :return: Iterable over the serialized sentences, tokens joined by single spaces.
    """
    # zip() stops at its shortest input, both across streams and across the tokens of a single sentence.
    for sentence_factors in zip(*factors):
        token_columns = zip(*(factor.strip().split(" ") for factor in sentence_factors))
        yield " ".join(" ".join(column) for column in token_columns)


def detokenize_signwriting(strings: List[str]) -> List[str]:
    """
    Turn space-separated SignWriting token sequences into text via SignWritingTokenizer.tokens_to_text.

    :param strings: Sentences whose tokens are separated by single spaces. Iterated exactly once.
    :return: One detokenized string per input sentence, in input order.
    """
    # Function-scope imports: the `signwriting` package is only needed when this function is actually called.
    from signwriting.tokenizer import SignWritingTokenizer
    import re

    sw_tokenizer = SignWritingTokenizer()
    token_lists = (sentence.split(" ") for sentence in strings)
    detokenized = [sw_tokenizer.tokens_to_text(tokens) for tokens in token_lists]

    # Remove the "00" that directly follows any of the letters R, B, L or M, keeping the letter itself.
    letter_with_zeros = re.compile(r"([RBLM])00")
    return [letter_with_zeros.sub(r"\1", text) for text in detokenized]


def raw_corpus_signwriting_similarity(hypotheses_factors: List[Iterable[str]],
                                      references_factors: List[Iterable[str]]) -> float:
    """
    Simple wrapper around the signwriting-evaluation similarity score.

    :param hypotheses_factors: Hypothesis factors streams.
    :param references_factors: Reference factors streams.
    :return: Similarity score as float between 0 and 1.
    :raises ImportError: If signwriting-evaluation cannot be imported.
    """
    try:
        from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric
    except ImportError as err:
        # Chain explicitly so the traceback reports the failed import as the direct cause.
        raise ImportError(
            "Please install signwriting-evaluation to use the SignWriting Similarity metric.") from err

    hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors))
    references = detokenize_signwriting(serialize_factors(references_factors))

    # The references are handed to corpus_score wrapped in a one-element list.
    metric = SignWritingSimilarityMetric()
    return metric.corpus_score(hypotheses, [references])


@lru_cache(maxsize=1)
def load_signwriting_clip():
    """
    Create the SignWriting CLIP score metric and memoize it for later calls.

    The result is cached with ``lru_cache``, so the metric object is constructed only on the first successful
    call and the same instance is returned afterwards.

    :return: A ``SignWritingCLIPScore`` instance created with ``cache_directory=None``.
    :raises ImportError: If signwriting-evaluation cannot be imported.
    """
    try:
        from signwriting_evaluation.metrics.clip import SignWritingCLIPScore
    except ImportError as err:
        # Chain explicitly so the traceback reports the failed import as the direct cause.
        raise ImportError("Please install signwriting-evaluation to use the SignWriting CLIP metric.") from err

    # Not using cache_directory to avoid multiple processes accessing at the same time
    return SignWritingCLIPScore(cache_directory=None)


def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]],
                                references_factors: List[Iterable[str]]) -> float:
    """
    Score hypotheses against references with the signwriting-evaluation clip score.

    :param hypotheses_factors: Hypothesis factors streams.
    :param references_factors: Reference factors streams.
    :return: CLIPScore score as float between -1 and 1.
    """
    # Obtain the metric first so that a missing dependency is reported before any input is processed.
    clip_metric = load_signwriting_clip()

    # Serialize and detokenize both sides the same way, hypotheses first.
    hypotheses, references = (detokenize_signwriting(serialize_factors(streams))
                              for streams in (hypotheses_factors, references_factors))

    # The references are handed to corpus_score wrapped in a one-element list.
    return clip_metric.corpus_score(hypotheses, [references])


def main():
params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
'respect to a reference set. If multiple hypotheses files are given '
Expand Down Expand Up @@ -163,6 +228,10 @@ def main():
func = raw_corpus_rougel
elif name == C.TER:
func = raw_corpus_ter
elif name == C.SIGNWRITING_CLIP:
func = raw_corpus_signwriting_clip
elif name == C.SIGNWRITING_SIMILARITY:
func = raw_corpus_signwriting_similarity
else:
raise ValueError("Unknown metric %s." % name)
metrics.append((name, func))
Expand Down