awslabs · AmitMY · Feb 10, 2024 · Feb 10, 2024 · Mar 2, 2024
@@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [3.1.38]
+
+### Added
+
+- Added support for [signwriting-evaluation](https://github.com/sign-language-processing/signwriting-evaluation) to
+  allow evaluating SignWriting text translation outputs.
+
 ## [3.1.37]
 
 ### Fixed

@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '3.1.37'
+__version__ = '3.1.38'
@@ -181,25 +181,32 @@ def decode_and_evaluate(self, output_name: Optional[str] = None) -> Dict[str, fl
         self.model.train(original_mode)
 
         # 2. Evaluate
-
-        metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations[0],
-                                                    references=self.targets_sentences[0],
+        hypotheses = translations[0]
+        references = self.targets_sentences[0]
+        metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=hypotheses, references=references,
                                                     offset=evaluate.DEFAULT_OFFSET),
-                   C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations[0],
-                                                    references=self.targets_sentences[0]),
-                   C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations[0],
-                                                                references=self.targets_sentences[0]),
-                   C.TER: evaluate.raw_corpus_ter(hypotheses=translations[0],
-                                                  references=self.targets_sentences[0]),
+                   C.CHRF: evaluate.raw_corpus_chrf(hypotheses=hypotheses, references=references),
+                   C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=hypotheses, references=references),
+                   C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=hypotheses, references=references),
+                   C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=hypotheses, references=references),
+                   C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=hypotheses, references=references),
+                   C.TER: evaluate.raw_corpus_ter(hypotheses=hypotheses, references=references),
                    C.AVG_TIME: avg_time,
                    C.DECODING_TIME: trans_wall_time}
 
+        # Add SignWriting Evaluation Metrics if the module is available
+        try:
+            import signwriting_evaluation
+            metrics.update({
+                C.SIGNWRITING_CLIP: evaluate.raw_corpus_signwriting_clip(
+                    hypotheses_factors=translations,
+                    references_factors=self.targets_sentences),
+                C.SIGNWRITING_SIMILARITY: evaluate.raw_corpus_signwriting_similarity(
+                    hypotheses_factors=translations,
+                    references_factors=self.targets_sentences)})
+        except ModuleNotFoundError:
+            pass
+
         if len(translations) > 1:  # metrics for other target factors
             for i, _ in enumerate(translations[1:], 1):
                 # only BLEU

@@ -261,15 +261,19 @@
 ROUGEL = 'rougel'
 BOW_PERPLEXITY = 'bow-perplexity'
 TER = 'ter'
+SIGNWRITING_CLIP = 'signwriting-clip'
+SIGNWRITING_SIMILARITY = 'signwriting-similarity'
 LENRATIO = 'length-ratio-mse'
 AVG_TIME = "avg-sec-per-sent"
 DECODING_TIME = "decode-walltime"
-METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER]
+METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER,
+           SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
 METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False, LENRATIO_MSE: False,
-                   TER: False, BOW_PERPLEXITY: False}
-METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf, TER: np.inf}
-METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
-EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
+                   TER: False, BOW_PERPLEXITY: False, SIGNWRITING_CLIP: True, SIGNWRITING_SIMILARITY: True}
+# Worst value per metric. Both SignWriting metrics are maximized, so their worst value is the lower bound
+# of the documented score range: -1.0 for the CLIP score (range -1 to 1), 0.0 for similarity (range 0 to 1).
+METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf,
+                TER: np.inf, SIGNWRITING_CLIP: -1.0, SIGNWRITING_SIMILARITY: 0.0}
+METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
+EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
 
 # loss
 CROSS_ENTROPY = 'cross-entropy'

@@ -18,7 +18,7 @@
 import logging
 import sys
 from collections import defaultdict
-from functools import partial
+from functools import partial, lru_cache
 from typing import Callable, Iterable, Dict, List, Tuple, Optional
 
 import numpy as np
@@ -118,6 +118,71 @@ def raw_corpus_length_ratio(hypotheses: Iterable[str], references: Iterable[str]
     return sum(ratios)/len(ratios) if len(ratios) else 0.0
 
 
+def serialize_factors(factors: List[Iterable[str]]) -> Iterable[str]:
+    """
+    Interleave parallel factor streams into a single token sequence per sentence.
+
+    For every sentence, the tokens of all factors are combined position by position, i.e. the output reads
+    ``f0_tok0 f1_tok0 ... f0_tok1 f1_tok1 ...``. Positions beyond the shortest factor are dropped.
+
+    :param factors: One iterable of space-tokenized sentences per factor, aligned by sentence.
+    :return: Generator yielding one serialized string per sentence.
+    """
+    for sentence_factors in zip(*factors):
+        factor_tokens = [factor.strip().split(" ") for factor in sentence_factors]
+        # zip(*...) transposes [factor][position] into [position][factor] and stops at the shortest factor.
+        yield " ".join(" ".join(position) for position in zip(*factor_tokens))
+
+
+def detokenize_signwriting(strings: Iterable[str]) -> List[str]:
+    """
+    Convert space-separated SignWriting token strings back to SignWriting text.
+
+    :param strings: Tokenized SignWriting strings, tokens separated by single spaces. Any iterable is
+                    accepted (callers pass a generator); it is consumed exactly once.
+    :return: List of detokenized SignWriting strings, one per input string.
+    """
+    # Local imports: the third-party `signwriting` package is only needed when this function is called.
+    import re
+    from signwriting.tokenizer import SignWritingTokenizer
+
+    tokenizer = SignWritingTokenizer()
+    # Drop the "00" that directly follows an R, B, L or M character (keeps only the capture group).
+    padding_pattern = re.compile(r"([RBLM])00")
+    return [padding_pattern.sub(r"\1", tokenizer.tokens_to_text(s.split(" "))) for s in strings]
+
+
+def raw_corpus_signwriting_similarity(hypotheses_factors: List[Iterable[str]],
+                                      references_factors: List[Iterable[str]]) -> float:
+    """
+    Simple wrapper around the signwriting-evaluation similarity score.
+
+    :param hypotheses_factors: Hypothesis factors streams.
+    :param references_factors: Reference factors streams.
+    :raises ImportError: If signwriting-evaluation cannot be imported; the original error is chained as cause.
+    :return: Similarity score as float between 0 and 1.
+    """
+    try:
+        from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric
+    except ImportError as err:
+        # Chain the original error so a failing transitive import is not hidden behind the install hint.
+        raise ImportError(
+            "Please install signwriting-evaluation to use the SignWriting Similarity metric.") from err
+
+    hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors))
+    references = detokenize_signwriting(serialize_factors(references_factors))
+
+    metric = SignWritingSimilarityMetric()
+    # The single reference set is wrapped in a list when handed to corpus_score.
+    return metric.corpus_score(hypotheses, [references])
+
+
+@lru_cache(maxsize=1)
+def load_signwriting_clip():
+    """
+    Create the signwriting-evaluation CLIP score metric.
+
+    The result is memoized via lru_cache, so repeated calls reuse the same metric instance.
+
+    :raises ImportError: If signwriting-evaluation cannot be imported; the original error is chained as cause.
+    :return: A SignWritingCLIPScore instance.
+    """
+    try:
+        from signwriting_evaluation.metrics.clip import SignWritingCLIPScore
+    except ImportError as err:
+        # Chain the original error so a failing transitive import is not hidden behind the install hint.
+        raise ImportError(
+            "Please install signwriting-evaluation to use the SignWriting CLIP metric.") from err
+
+    # Not using cache_directory to avoid multiple processes accessing at the same time
+    return SignWritingCLIPScore(cache_directory=None)
+
+
+def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]],
+                                references_factors: List[Iterable[str]]) -> float:
+    """
+    Compute the signwriting-evaluation CLIP score for a corpus.
+
+    :param hypotheses_factors: Hypothesis factors streams.
+    :param references_factors: Reference factors streams.
+    :return: CLIPScore score as float between -1 and 1.
+    """
+    # Obtain the metric first so an import problem surfaces before any input is consumed.
+    clip_scorer = load_signwriting_clip()
+
+    # Hypotheses are prepared before references, each serialized across factors and then detokenized.
+    hyp_texts, ref_texts = (detokenize_signwriting(serialize_factors(streams))
+                            for streams in (hypotheses_factors, references_factors))
+    return clip_scorer.corpus_score(hyp_texts, [ref_texts])
+
+
 def main():
     params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                  'respect to a reference set. If multiple hypotheses files are given '
@@ -163,6 +228,10 @@ def main():
             func = raw_corpus_rougel
         elif name == C.TER:
             func = raw_corpus_ter
+        elif name == C.SIGNWRITING_CLIP:
+            func = raw_corpus_signwriting_clip
+        elif name == C.SIGNWRITING_SIMILARITY:
+            func = raw_corpus_signwriting_similarity
         else:
             raise ValueError("Unknown metric %s." % name)
         metrics.append((name, func))