In [None]:
!pip install -q transformers torch pandas sentence-transformers bert-score tensorflow tensorflow-text tensorflow-hub sacrebleu nltk

# Second Cell - Import libraries and download NLTK data
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from bert_score import BERTScorer
import tensorflow_hub as hub
import tensorflow_text
from tqdm.notebook import tqdm
from sacrebleu.metrics import BLEU
import nltk
from nltk.translate.meteor_score import meteor_score
from transformers import MarianMTModel, MarianTokenizer,AutoTokenizer, AutoModel
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25h

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
#When the generated response is different than the gold answer language, we need to translate the response to the target language first

source_lang = 'vi'
target_lang = 'en'

class HelsinkiTranslator:
    def __init__(self, source_lang, target_lang):
        """
        Initialize Helsinki NLP translator
        source_lang and target_lang should be in ISO 639-1 format (e.g., 'en', 'de', 'fr')
        """
        model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
        print(f"Loading translation model: {model_name}")

        try:
            self.tokenizer =MarianTokenizer.from_pretrained(model_name)
            self.model = MarianMTModel.from_pretrained(model_name)
            self.translator = pipeline("translation", model=self.model, tokenizer=self.tokenizer)
        except Exception as e:
            print(f"Error loading model {model_name}: {str(e)}")
            print("Attempting to load alternative model path...")
            # Try alternative model path format
            alt_model_name = f'Helsinki-NLP/opus-mt-{source_lang}+{source_lang}-{target_lang}+{target_lang}'
            self.tokenizer = AutoTokenizer.from_pretrained(alt_model_name)
            self.model = AutoModelForSeq2SeqGeneration.from_pretrained(alt_model_name)
            self.translator = pipeline("translation", model=self.model, tokenizer=self.tokenizer)

    def translate(self, text):
        """Translate text using Helsinki NLP model"""
        try:
            result = self.translator(text, max_length=1024)
            return result[0]['translation_text']
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text

class CrossLingualEvaluator:
    def __init__(self):
        """Initialize evaluation models"""
        print("Loading evaluation models...")

        # Load Sentence-BERT
        self.sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

        # Load BERTScore
        self.bert_scorer = BERTScorer(
            model_type="xlm-roberta-large",
            num_layers=17,
            rescale_with_baseline=False
        )

        # Initialize BLEU with smoothing
        self.bleu = BLEU(smooth_method='exp')

        print("Models loaded successfully!")

    def calculate_metrics(self, text1, text2):
        """Calculate all metrics for a pair of texts"""
        results = {}

        # SBERT similarity
        emb1 = self.sbert.encode(text1, convert_to_numpy=True)
        emb2 = self.sbert.encode(text2, convert_to_numpy=True)
        results['sbert_similarity'] = float(np.dot(emb1, emb2) /
                                         (np.linalg.norm(emb1) * np.linalg.norm(emb2)))

        # BERTScore
        P, R, F1 = self.bert_scorer.score([text1], [text2])
        results['bertscore_precision'] = float(P[0])
        results['bertscore_recall'] = float(R[0])
        results['bertscore_f1'] = float(F1[0])

        # BLEU
        results['bleu_score'] = self.bleu.corpus_score([text1], [[text2]]).score / 100.0

        # METEOR
        try:
            results['meteor_score'] = meteor_score([text2.split()], text1.split())
        except:
            results['meteor_score'] = 0.0

        return results

# Fourth Cell - Evaluation function
def evaluate_dataset(df, source_lang, target_lang, gen_col='generated_answer',
                    gold_col='gold_answer', sample_size=None):
    """
    Evaluate the dataset using Helsinki translation and multiple metrics
    """
    # Initialize translator and evaluator
    translator = HelsinkiTranslator(source_lang, target_lang)
    evaluator = CrossLingualEvaluator()

    # Sample dataset if specified
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=42)

    all_results = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        try:
            # Translate generated text
            translated_gen = translator.translate(row[gen_col])

            # Calculate metrics
            metrics = evaluator.calculate_metrics(translated_gen, row[gold_col])
            #metrics = evaluator.calculate_metrics(row[gen_col], row[gold_col])
            # Store results
            result = {
                'original_generated': row[gen_col],
                'translated_generated': translated_gen,
                'gold_answer': row[gold_col],
                **metrics
            }
            all_results.append(result)

            # Print progress every 20 samples
            if (idx + 1) % 20 == 0:
                print(f"\nProcessed {idx + 1}/{len(df)} samples")
                print("Last sample results:")
                for metric, value in metrics.items():
                    print(f"{metric:20s}: {value:.4f}")
                print("\nExample translation:")
                print(f"Original: {row[gen_col][:100]}...")
                print(f"Translated: {translated_gen[:100]}...")
                print(f"Gold: {row[gold_col][:100]}...")

        except Exception as e:
            print(f"Error processing row {idx}: {str(e)}")
            continue

    return pd.DataFrame(all_results)

In [None]:
csv_path = 'path to your csv file'
df = pd.read_csv(csv_path, sep=',', encoding='utf-8', quotechar='"', engine='python')

In [None]:
results_df = evaluate_dataset(
    df,
    source_lang=source_lang,
    target_lang=target_lang,
    gen_col='generated_answer',
    gold_col='gold_answer'
)

Loading translation model: Helsinki-NLP/opus-mt-vi-en


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Loading evaluation models...
Models loaded successfully!


Processing:   0%|          | 0/160 [00:00<?, ?it/s]

Error processing row 8: 'float' object is not subscriptable
Translation error: index out of range in self

Processed 20/160 samples
Last sample results:
sbert_similarity    : 0.7407
bertscore_precision : 0.9112
bertscore_recall    : 0.9177
bertscore_f1        : 0.9145
bleu_score          : 0.1284
meteor_score        : 0.2783

Example translation:
Original: ibm content navigator v2.0.3 đang thêm phần mở rộng.dat vào tệp csv trong quá trình tải xuống tài li...
Translated: The ibm content navigator v2.0.3 is adding the extension.dat to the Csv file in the download process...
Gold: If the mime type of the document as shown in system properties in ICN is not text/csv, then Navigato...
Translation error: index out of range in self

Processed 40/160 samples
Last sample results:
sbert_similarity    : 0.5673
bertscore_precision : 0.8749
bertscore_recall    : 0.8508
bertscore_f1        : 0.8626
bleu_score          : 0.0157
meteor_score        : 0.1388

Example translation:
Original: lỗi stackove

Token indices sequence length is longer than the specified maximum sequence length for this model (1524 > 512). Running this sequence through the model will result in indexing errors
Your input_length: 1524 is bigger than 0.9 * max_length: 1024. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Translation error: index out of range in self
Error processing row 50: 'float' object is not subscriptable

Processed 60/160 samples
Last sample results:
sbert_similarity    : 0.7743
bertscore_precision : 0.8373
bertscore_recall    : 0.8674
bertscore_f1        : 0.8521
bleu_score          : 0.0173
meteor_score        : 0.1838

Example translation:
Original: sự khác biệt giữa db2 z/os v9.1 và db2 luw v10.5 fp7 là do sự thay đổi trong cách tính toán số lượng...
Translated: The difference between db2 z/os v9.1 and db2 luw v10.5 fp7 is due to change in calculating the numbe...
Gold: DB2 LUW precompile, prep write different code to DB2 z/OS?...

Processed 80/160 samples
Last sample results:
sbert_similarity    : 0.7919
bertscore_precision : 0.9200
bertscore_recall    : 0.9177
bertscore_f1        : 0.9189
bleu_score          : 0.0924
meteor_score        : 0.3243

Example translation:
Original: cách đơn giản nhất là dừng eventreader, mở và chỉnh sửa nó, nhấn nút "clear state", và khởi động lạ

Your input_length: 1418 is bigger than 0.9 * max_length: 1024. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Translation error: index out of range in self


Your input_length: 1498 is bigger than 0.9 * max_length: 1024. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Translation error: index out of range in self

Processed 120/160 samples
Last sample results:
sbert_similarity    : 0.6497
bertscore_precision : 0.8322
bertscore_recall    : 0.9217
bertscore_f1        : 0.8746
bleu_score          : 0.0939
meteor_score        : 0.4205

Example translation:
Original: ##begin_quote##ibm security bulletin: a vulnerability in ibm java runtime affects ibm websphere mq (...
Translated: ##begin_quote##ibm security bulletin: a vulnerability in ibm java runtime affects ibm websphere mq (...
Gold: CVEID: CVE-2016-3485 [http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-3485]
DESCRIPTION: An u...


Your input_length: 1410 is bigger than 0.9 * max_length: 1024. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Translation error: index out of range in self

Processed 140/160 samples
Last sample results:
sbert_similarity    : 0.5763
bertscore_precision : 0.8534
bertscore_recall    : 0.8226
bertscore_f1        : 0.8377
bleu_score          : 0.0196
meteor_score        : 0.1192

Example translation:
Original: để bao gồm mô tả dài jobtask trong bảng jp, bạn cần thực hiện các bước sau: trong bảng "integration"...
Translated: To cover the long description jobtask in the jp table, you need to take the following steps: in the ...
Gold: Maximo 6.x MEA 

1. Go To Integration -> Integration Object 

On the Persistent Fields tab, exclude ...

Processed 160/160 samples
Last sample results:
sbert_similarity    : 0.6603
bertscore_precision : 0.8674
bertscore_recall    : 0.8839
bertscore_f1        : 0.8756
bleu_score          : 0.0336
meteor_score        : 0.7205

Example translation:
Original: các lỗ hổng trong openssl ảnh hưởng đến websphere mq 5.3 cho máy chủ hp nonstop (cve-2017-3735). các...
Translated: 

In [None]:
output_path = 'your output path'
results_df.to_csv(output_path, index=False)
print(f"\nResults saved to: {output_path}")

# Calculate and display correlations
metrics = ['sbert_similarity', 'bertscore_precision', 'bertscore_recall',
          'bertscore_f1', 'bleu_score', 'meteor_score']

print("\nFinal Average Scores:")
print("=" * 50)
for metric in metrics:
    mean_score = results_df[metric].mean()
    std_score = results_df[metric].std()
    print(f"{metric:20s}: {mean_score:.4f} (±{std_score:.4f})")


results_df.to_csv(output_path, index=False)
print(f"\nResults saved to: {output_path}")

# Display correlation matrix
correlation_matrix = results_df[metrics].corr().round(4)
print("\nCorrelation between metrics:")
print("=" * 50)
print(correlation_matrix)

# Optional: Sample of translations
print("\nSample Translations:")
print("=" * 50)
samples = results_df[['original_generated', 'gold_answer']].head(3)
for idx, row in samples.iterrows():
    print(f"\nExample {idx + 1}:")
    print(f"Original: {row['original_generated'][:100]}...")
    #print(f"Translated: {row['translated_generated'][:100]}...")
    print(f"Gold: {row['gold_answer'][:100]}...")


Results saved to: /content/drive/MyDrive/Thesis_LCT/Datasets/TechQA_results/Viet/similarity_results_vimix.csv

Final Average Scores:
sbert_similarity    : 0.5227 (±0.1660)
bertscore_precision : 0.8628 (±0.0301)
bertscore_recall    : 0.8589 (±0.0413)
bertscore_f1        : 0.8603 (±0.0289)
bleu_score          : 0.0300 (±0.0475)
meteor_score        : 0.1782 (±0.1457)

Results saved to: /content/drive/MyDrive/Thesis_LCT/Datasets/TechQA_results/Viet/similarity_results_vimix.csv

Correlation between metrics:
                     sbert_similarity  bertscore_precision  bertscore_recall  \
sbert_similarity               1.0000               0.4816            0.3526   
bertscore_precision            0.4816               1.0000            0.2708   
bertscore_recall               0.3526               0.2708            1.0000   
bertscore_f1                   0.5031               0.7233            0.8599   
bleu_score                     0.3550               0.4356            0.5686   
meteor_scor