In [12]:
from pathlib import Path
import csv

In [11]:
# Locate the translation comparison report: it lives in the "reports" folder
# that sits beside the current working directory.
translations_fn = Path.cwd().parent.joinpath("reports", "translations_comparison.csv")
# Stop right away (showing the resolved path) if the report is not there.
assert translations_fn.exists(), translations_fn

In [13]:
# Load every row of the comparison CSV as a dict keyed by column name.
# The file contains Tibetan source text, so decode it explicitly as UTF-8
# rather than relying on the platform's default encoding (which is not UTF-8
# everywhere). newline='' leaves line-ending handling to the csv module, which
# matters here because some fields contain embedded newlines.
with open(translations_fn, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    translations = list(reader)

In [15]:
# Quick look at what was loaded: the number of rows and the first record.
(len(translations), translations[0])

(839,
 {'Source': 'བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །',
  'target_gt': "I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.",
  '01_zero_shot': "I respectfully bow to the Dharmakaya of the Sugatas and their spiritual heirs,\nAnd to all who are worthy of veneration.\nI will now briefly explain, in accordance with scripture,\nHow to enter the vows of the Sugatas' heirs.",
  '03_few_shot_advance': "I respectfully prostrate to the Sugatas who possess the dharmakāya along with their offspring,\nAnd also to all who are worthy of veneration.\nI shall explain in brief, according to the scriptures,\nThe way to enter the vows of the Sugatas' offspring.",
  '02_few_shot': "I respectfully prostrate to the Sugatas who po

In [19]:
import sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER
from typing import List, Tuple, Dict

class TextSimilarityAnalyzer:
    """Compare candidate texts with a reference using BLEU, chrF++ and TER.

    The three sacrebleu metric objects are created once and reused for every
    comparison. Individual metric values are exposed by ``calculate_metrics``;
    ``find_n_examples_by_similarity`` blends them into one similarity score and
    picks representative candidates from the top, middle and bottom of the
    resulting ranking.
    """

    def __init__(self):
        self.bleu = BLEU()
        self.chrf = CHRF(word_order=2)  # chrF++ uses word_order=2
        self.ter = TER()

    def calculate_metrics(self, reference: str, hypothesis: str) -> Dict[str, float]:
        """
        Calculate BLEU, chrF++, and TER scores for a single pair of texts.

        Args:
            reference: Reference text
            hypothesis: Hypothesis text to compare against reference

        Returns:
            Dictionary with keys 'bleu', 'chrf' and 'ter'. Each value is the
            sacrebleu score divided by 100. For BLEU and chrF++ higher means
            more similar; for TER lower means more similar. TER is an edit
            rate, so unlike the other two it is NOT capped at 1: a hypothesis
            that needs more edits than the reference has words scores above 1.
        """
        # sacrebleu expects a list of hypotheses and a list of reference
        # streams, so wrap the single strings accordingly.
        refs = [reference]
        hyps = [hypothesis]

        # Scale the percentage-style scores down by 100.
        bleu_score = self.bleu.corpus_score(hyps, [refs]).score / 100
        chrf_score = self.chrf.corpus_score(hyps, [refs]).score / 100
        ter_score = self.ter.corpus_score(hyps, [refs]).score / 100  # may exceed 1

        return {
            'bleu': bleu_score,
            'chrf': chrf_score,
            'ter': ter_score
        }

    @staticmethod
    def _weighted_similarity(metrics: Dict[str, float], weights: Dict[str, float]) -> float:
        """Blend the three metrics into one "higher is more similar" score."""
        # TER is an error rate (lower is better), so invert it. Because TER can
        # exceed 1, floor the inverted value at 0; otherwise a single very long
        # hypothesis would contribute a negative similarity and drag the
        # combined score outside the range of the other two metrics.
        ter_similarity = max(0.0, 1 - metrics['ter'])

        # A metric that is absent from `weights` simply does not contribute.
        return (
            weights.get('bleu', 0.0) * metrics['bleu'] +
            weights.get('chrf', 0.0) * metrics['chrf'] +
            weights.get('ter', 0.0) * ter_similarity
        )

    def find_n_examples_by_similarity(self,
                                    reference: str,
                                    candidates: List[str],
                                    n: int = 3,
                                    weights: Dict[str, float] = None) -> Dict:
        """
        Find n most similar, n most dissimilar, and n moderate similarity candidates.

        Args:
            reference: Reference text
            candidates: List of candidate texts to compare
            n: Number of examples to return for each category. It is capped at
                one third of the number of candidates so the three categories
                never overlap; values below zero are treated as zero.
            weights: Optional dictionary with a weight for each of 'bleu',
                'chrf' and 'ter' (default: equal weights). A metric missing
                from the dictionary gets weight 0. Weights are used as given,
                i.e. they are not rescaled to sum to 1.

        Returns:
            Dictionary with the keys 'most_similar' (best first), 'moderate'
            and 'most_dissimilar' (worst first). Each value is a list of
            entries of the form
            ``{'text': ..., 'weighted_score': ..., 'metrics': {...}}`` where
            'metrics' holds the raw values from ``calculate_metrics``.
        """
        if weights is None:
            weights = {'bleu': 1/3, 'chrf': 1/3, 'ter': 1/3}

        # Keep n within [0, len(candidates) // 3] so the three slices taken
        # below are disjoint.
        n = max(0, min(n, len(candidates) // 3))

        scored = []
        for candidate in candidates:
            metrics = self.calculate_metrics(reference, candidate)
            scored.append({
                'text': candidate,
                'weighted_score': self._weighted_similarity(metrics, weights),
                'metrics': metrics
            })

        # Ascending by combined score; sorted() is stable, so candidates with
        # equal scores keep their input order.
        ranked = sorted(scored, key=lambda entry: entry['weighted_score'])

        # Window of n entries centred in the ranking.
        middle_start = (len(ranked) - n) // 2

        return {
            # Last n entries, reversed so the best match comes first.
            'most_similar': ranked[len(ranked) - n:][::-1],
            'moderate': ranked[middle_start:middle_start + n],
            'most_dissimilar': ranked[:n]
        }

# Example usage
if __name__ == "__main__":
    # Small self-contained demo: rank hand-written sentences against one
    # reference and print three examples from each similarity band.
    analyzer = TextSimilarityAnalyzer()

    reference = "The quick brown fox jumps over the lazy dog."
    candidates = [
        "The quick brown fox jumps over the lazy dog.",      # exact copy
        "The fast brown fox leaps over the tired dog.",      # close paraphrase
        "A dog sleeps on the ground.",                       # unrelated
        "The quick brown fox jumps over a lazy dog.",        # one word changed
        "The rapid brown fox hops across the sleepy dog.",   # looser paraphrase
        "A cat chases a mouse.",                             # unrelated
        "The brown fox quickly jumped over lazy dogs.",      # reordered / inflected
        "Dogs are lazy animals.",                            # unrelated
        "A fox and a dog play in the garden.",              # shares some words
        "The agile brown fox jumps past the lazy dog.",      # close paraphrase
        "Birds fly in the blue sky.",                        # unrelated
        "The quick brown fox jumped over lazy dogs.",        # close paraphrase
        "Animals run in the field.",                         # unrelated
        "A brown fox and a lazy dog in the park.",          # shares some words
        "The swift fox leaped above the drowsy hound."      # synonyms throughout
    ]

    # Custom metric weights; leave `weights` out of the call below to weight
    # the three metrics equally instead.
    weights = dict(bleu=0.4, chrf=0.4, ter=0.2)

    # Ask for 3 examples per category
    results = analyzer.find_n_examples_by_similarity(
        reference, candidates, n=3, weights=weights
    )

    print(f"Reference: {reference}\n")

    # (heading, key into `results`) for each category, in print order.
    sections = [
        ("Top 3 Most Similar Examples:", 'most_similar'),
        ("\nTop 3 Moderate Similarity Examples:", 'moderate'),
        ("\nTop 3 Most Dissimilar Examples:", 'most_dissimilar'),
    ]
    for heading, category in sections:
        print(heading)
        for i, result in enumerate(results[category], 1):
            print(f"\n{i}. Text: {result['text']}")
            print(f"   Scores: BLEU={result['metrics']['bleu']:.3f}, "
                  f"chrF++={result['metrics']['chrf']:.3f}, "
                  f"TER={result['metrics']['ter']:.3f}")

Reference: The quick brown fox jumps over the lazy dog.

Top 3 Most Similar Examples:

1. Text: The quick brown fox jumps over the lazy dog.
   Scores: BLEU=1.000, chrF++=1.000, TER=0.000

2. Text: The quick brown fox jumps over a lazy dog.
   Scores: BLEU=0.658, chrF++=0.842, TER=0.111

3. Text: The agile brown fox jumps past the lazy dog.
   Scores: BLEU=0.393, chrF++=0.620, TER=0.222

Top 3 Moderate Similarity Examples:

1. Text: The swift fox leaped above the drowsy hound.
   Scores: BLEU=0.060, chrF++=0.198, TER=0.667

2. Text: A brown fox and a lazy dog in the park.
   Scores: BLEU=0.117, chrF++=0.370, TER=0.889

3. Text: The rapid brown fox hops across the sleepy dog.
   Scores: BLEU=0.131, chrF++=0.355, TER=0.444

Top 3 Most Dissimilar Examples:

1. Text: A cat chases a mouse.
   Scores: BLEU=0.042, chrF++=0.056, TER=1.000

2. Text: Animals run in the field.
   Scores: BLEU=0.050, chrF++=0.100, TER=0.889

3. Text: Birds fly in the blue sky.
   Scores: BLEU=0.051, chrF++=0.099, 