In [None]:
import os, json, re
import pandas as pd
from tqdm import tqdm
from math import ceil
from openai import AzureOpenAI

# Root folder for this experiment's artifacts (or your Drive path if mounted).
BASE_DIR = "/content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/newResult"
# Evaluated CSVs are written here; created on import so later writes cannot fail on a missing folder.
OUTPUT_FOLDER = f"{BASE_DIR}/eval_outputs"
# Raw per-batch model replies are dumped here when save_debug=True.
DEBUG_FILE = f"{OUTPUT_FOLDER}/AutomationMetric_Logs.json"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Azure OpenAI connection settings.
# The key and endpoint are taken from the environment when available so the real
# secret never has to be saved inside the notebook; the original placeholders
# remain as the fallback for manual editing.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "<YOUR_KEY>")
ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://<your-resource>.openai.azure.com/")
MODEL = "gpt-5-chat"  # passed as `model=` to chat.completions.create
API_VER = "2024-12-01-preview"

# ====== PROMPT (scores only; English -> Indonesian evaluation) ======
# Instruction text that build_batch_prompt() places in front of every batch's
# JSON payload of examples.
#
# Output contract relied on by eval_csv_batched():
#   - the reply is one JSON object with an "evaluations" array;
#   - each item carries "row_id" and a "scores" object;
#   - the score keys (adequacy, fluency, morphosyntactic, semantic, pragmatic)
#     must stay identical to `metric_cols` in eval_csv_batched().
#
# NOTE(review): "speakers intent" in rubric 5 looks like a typo for
# "speaker's intent". It is left untouched because any change to this text may
# shift the scores relative to runs already recorded with this prompt.
# NOTE(review): in the outputs recorded in this notebook the fluency and
# morphosyntactic averages are identical in every run, and adequacy/semantic
# are nearly so -- the rubric pairs may overlap; confirm this is intended.
PROMPT_HEADER = """
Evaluate the Indonesian translation of the English text below based on five criteria: Adequacy, Fluency, Morphosyntactic, Semantic, and Pragmatic. Use the scoring rubric from 1 to 5 for each criterion, as described in detail below. Assign only a numeric score for each aspect (no explanations).

Scoring Rubrics:
1. Adequacy: Does the Indonesian translation convey the same meaning as the English source sentence?
- Scoring Guide:
5: Complete transfer of meaning (accurate and all important information conveyed)
4: Most meaning conveyed with minor omissions or errors
3: Some parts are missing or incorrect, but some information still conveyed
2: Major omissions or incorrect meaning, only small part of the information is conveyed
1: Meaning largely incorrect or missing

2. Fluency: How natural and grammatically correct is the Indonesian translation?
- Scoring Guide:
5: Fluent, natural, and grammatically correct
4: Mostly fluent with minor grammatical issues or awkwardness
3: Understandable, but noticeable grammatical errors
2: Difficult to understand due to poor grammar or awkward phrasing
1: Unintelligible due to major grammatical issues

3. Morphosyntactic (Grammar, agreement, sentence structure, morphology):
- Scoring Guide:
5: No grammatical or syntactic errors. The sentence structure is fluent, and consistent with the source data. Morphological forms (e.g., tense, agreement, inflections) are fully accurate.
4: Minor grammatical or syntactic issues that do not hinder comprehension. Word forms and sentence structure are mostly correct but slightly unnatural in places.
3: Noticeable morphosyntactic errors that occasionally affect comprehension or fluency. May include agreement mistakes, awkward word order, different sentence structure, or wrong tense.
2: Frequent errors in grammar and structure that hinder comprehension. The sentence structure differs significantly from the source data. Unnatural phrasing, tense confusion, or broken sentence patterns are common.
1: Sentence is ungrammatical, fragmented, or unparseable. Morphosyntactic errors make it incomprehensible or entirely ungrammatical. The sentence structure is completely different from the source data.

4. Semantic (Meaning preservation, omissions, mistranslations):
- Scoring Guide:
5: Full preservation of meaning. No omissions, distortions, or mistranslations. The translation conveys the same message as the source without deleting or adding additional words.
4: Minor meaning shifts or vague expressions that slightly alter nuance but do not mislead. No omissions for verbs, subject, and object of the sentence.
3: Partial meaning loss. Some ideas are omitted or inaccurately rendered, but core meaning is still recoverable with effort.
2: Major meaning loss or distortion. Critical elements are missing, incorrect, or misleading. Intended message is mostly lost.
1: Little semantic correspondence to the source. Mostly incorrect, irrelevant, or hallucinated content.

5. Pragmatic (Tone, register, politeness, cultural/situational appropriateness):
- Scoring Guide:
5: Tone, register, and cultural fit are fully appropriate for the context. The translation sounds natural and is aligned with the speakers intent.
4: Mostly appropriate tone and register. Slight mismatches (e.g., slightly too formal/informal), but do not cause misunderstanding or awkwardness.
3: Inconsistent or ambiguous tone or formality. Some sections may sound awkward or misaligned with the source intent.
2: Inappropriate tone or register for the context. Translation may sound offensive, robotic, or culturally inappropriate.
1: Completely wrong pragmatic use. For example, overly rude, sarcastic instead of sincere, or completely mismatched social function.


You will receive a list of examples. For each example, return a JSON object with:
- "row_id": the given ID
- "scores": { "adequacy": int, "fluency": int, "morphosyntactic": int, "semantic": int, "pragmatic": int }

IMPORTANT OUTPUT FORMAT:
Return a single JSON object with the key "evaluations" whose value is an array of the example results.
Do not include any extra text, prose, or code fences—only valid JSON.
"""

def build_batch_prompt(batch_rows, source_col, translation_col):
    """
    Assemble the complete prompt text for one batch of rows.

    batch_rows: list of row dicts; each must contain '_row_id' and the two text columns.
    source_col: key holding the English source text in each row
    translation_col: key holding the Indonesian translation in each row

    Returns PROMPT_HEADER followed by "\\n\\n" and a JSON object of the form
    {"examples": [{"row_id", "english_source", "indonesian_translation"}, ...]}.
    Every value is coerced to str and non-ASCII characters are left unescaped.
    """
    payload = {
        "examples": [
            {
                "row_id": str(row["_row_id"]),
                "english_source": str(row[source_col]),
                "indonesian_translation": str(row[translation_col]),
            }
            for row in batch_rows
        ]
    }
    serialized = json.dumps(payload, ensure_ascii=False)
    return PROMPT_HEADER + "\n\n" + serialized

def _parse_evaluations(content):
    """Return the per-row result dicts contained in a raw model reply.

    Strict JSON is tried first; if that fails, the outermost ``{...}`` span is
    parsed instead (covers replies wrapped in prose or code fences). Entries of
    "evaluations" that are not dicts are dropped. Empty, missing or unparseable
    input yields an empty list instead of raising.
    """
    if not content:
        return []
    try:
        parsed = json.loads(content)
    except ValueError:
        m = re.search(r'\{.*\}', content, re.DOTALL)
        if not m:
            return []
        try:
            parsed = json.loads(m.group(0))
        except ValueError:
            return []
    evals = parsed.get("evaluations") if isinstance(parsed, dict) else None
    if not isinstance(evals, list):
        return []
    return [it for it in evals if isinstance(it, dict)]


def eval_csv_batched(
    input_csv,
    amount=None,
    batch_size=20,
    source_col="source_en",        # <-- English source column
    translation_col="translation", # <-- Indonesian translation column
    id_cols=("source_en", "translation"),  # used to detect already-processed rows
    save_debug=False,
    print_debug=False
):
    """
    Score the translations in `input_csv` with the Azure OpenAI evaluator and
    append the results to a CSV inside OUTPUT_FOLDER.

    input_csv: path of the CSV to evaluate; a missing file is reported and skipped.
    amount: evaluate only the first `amount` rows (None or inf = every row).
    batch_size: number of rows sent to the model in a single request.
    source_col: column containing the English source text.
    translation_col: column containing the Indonesian translation.
    id_cols: columns whose stringified values identify a row. Rows whose key is
        already present in the existing output file are skipped, so re-running
        the same configuration resumes instead of re-scoring.
    save_debug: write the raw reply of every batch to DEBUG_FILE.
    print_debug: print the raw reply of every batch as it arrives.

    The output file name is derived from the input name plus `amount` and
    `batch_size`. A batch whose request fails or whose reply cannot be parsed
    is reported and skipped; its rows are not written, so the next run retries
    them. Returns None.
    """
    if not os.path.exists(input_csv):
        print(f"❌ File not found: {input_csv}")
        return

    # splitext only strips the real extension; str.replace would also rewrite
    # any ".csv" that happens to occur earlier in the file name.
    stem = os.path.splitext(os.path.basename(input_csv))[0]
    output_csv = os.path.join(
        OUTPUT_FOLDER,
        f"{stem}_amount({amount})_batch({batch_size})_(GPT5)_NewRubric.csv"
    )

    df = pd.read_csv(input_csv)
    df.columns = df.columns.str.strip()
    if amount not in (None, float("inf")):
        df = df[:amount]

    # New metric columns to add (must match the score keys requested in the prompt)
    metric_cols = ['adequacy','fluency','morphosyntactic','semantic','pragmatic']
    final_cols = [*df.columns, *metric_cols]

    # Prepare/align output frame (keep all input columns + metrics)
    if os.path.exists(output_csv):
        out_df = pd.read_csv(output_csv)
        # Drop legacy metrics if any slipped in
        for legacy in ['bleu_score','meteor_score']:
            if legacy in out_df.columns:
                out_df = out_df.drop(columns=[legacy])
        for c in final_cols:
            if c not in out_df.columns:
                out_df[c] = None
        out_df = out_df[final_cols]
    else:
        out_df = pd.DataFrame(columns=final_cols)

    # Keys of rows that already have results. Built once so the membership test
    # below is a set lookup instead of a scan over the whole output per row.
    done_keys = set()
    if not out_df.empty:
        done_keys = set(zip(*(out_df[col].astype(str).tolist() for col in id_cols)))

    # Build list of rows to evaluate; attach internal _row_id based on original index
    rows = []
    for idx, row in df.iterrows():
        if done_keys and tuple(str(row[col]) for col in id_cols) in done_keys:
            continue
        d = row.to_dict()
        d["_row_id"] = str(idx)
        rows.append(d)

    if not rows:
        print("✅ Nothing to do; all rows already evaluated.")
        return

    client = AzureOpenAI(api_key=API_KEY, azure_endpoint=ENDPOINT, api_version=API_VER)

    new_rows, debug_logs = [], []
    unscored = 0
    total_batches = ceil(len(rows)/batch_size)

    for b in tqdm(range(total_batches), desc=f"Evaluating {os.path.basename(input_csv)}"):
        batch = rows[b*batch_size:(b+1)*batch_size]
        prompt = build_batch_prompt(batch, source_col, translation_col)

        # Reset for every batch: a failed request must never fall back to (or
        # be logged with) the reply that belonged to the previous batch.
        content = None
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0, top_p=0.0,
                response_format={"type": "json_object"}
            )
            content = (resp.choices[0].message.content or "").strip()
        except Exception as e:
            # Best effort: report the failure and leave this batch for a later run.
            print(f"Batch {b+1}/{total_batches} error: {e}")

        evals = _parse_evaluations(content)
        if content is not None and not evals:
            print(f"Batch {b+1}/{total_batches} error: no usable evaluations in model response")

        # Map back by row_id (stringified: the model may echo ids as numbers)
        by_id = {str(it.get("row_id")): it for it in evals}

        for r in batch:
            it = by_id.get(str(r["_row_id"]))
            scores = it.get("scores") if it else None
            if not isinstance(scores, dict) or not scores:
                # Not persisted, so the resume check picks the row up again next run.
                unscored += 1
                continue

            # Keep only original input columns; do not persist _row_id
            base = {c: r[c] for c in df.columns}

            # Add metrics (a metric missing from the reply stays empty)
            for c in metric_cols:
                base[c] = scores.get(c)

            new_rows.append(base)

        if save_debug or print_debug:
            debug_logs.append({
                "batch_index": b,
                "row_ids": [str(r["_row_id"]) for r in batch],
                "raw_response": content
            })
        if print_debug:
            print(f"[debug] batch {b+1}/{total_batches} raw response: {content}")

    if new_rows:
        # Every entry carries all input + metric keys; `columns` fixes the order.
        add_df = pd.DataFrame(new_rows, columns=final_cols)

        # Concatenating with an empty frame triggers a pandas FutureWarning and
        # contributes nothing, so the existing output is merged only if it has rows.
        if out_df.empty:
            out_df = add_df
        else:
            out_df = pd.concat([out_df[final_cols], add_df], ignore_index=True)
        out_df.to_csv(output_csv, index=False)
        print(f"✅ Saved: {output_csv}")

        # quick averages for this run
        for col in metric_cols:
            s = pd.to_numeric(add_df[col], errors='coerce').dropna()
            print(f"Avg {col}: {s.mean():.2f}" if len(s) else f"Avg {col}: n/a")
    else:
        print("⚠️ No new evaluations were added.")

    if unscored:
        print(f"⚠️ {unscored} row(s) received no scores; re-run to retry them.")

    if save_debug:
        # NOTE: opened with "w", so each call replaces the logs of the previous
        # call; copy DEBUG_FILE between calls if the logs of several inputs are needed.
        with open(DEBUG_FILE, "w", encoding="utf-8") as f:
            json.dump(debug_logs, f, indent=2, ensure_ascii=False)

In [None]:
# Evaluate each of the three translated CSVs listed here, one after another
# (first 1000 rows, 10 rows per request, raw replies saved to DEBUG_FILE).
input_csv = [
    "/content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/data/translated_with_gemma3_1b_EN_ID_0_1000_bleu_meteor.csv",
    "/content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/data/translated_with_llama3.2_3b_EN_ID_0_1000_bleu_meteor.csv",
    "/content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/data/translated_with_qwen3_0.6b_EN_ID_0_1000_bleu_meteor.csv",
]

for csv_path in input_csv:
    eval_csv_batched(
        csv_path,
        amount=1000,
        batch_size=10,
        save_debug=True,
        print_debug=False,
    )

Evaluating translated_with_gemma3_1b_EN_ID_0_1000_bleu_meteor.csv: 100%|██████████| 100/100 [04:24<00:00,  2.65s/it]
  out_df = pd.concat([out_df, add_df], ignore_index=True)


✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/newResult/eval_outputs/translated_with_gemma3_1b_EN_ID_0_1000_bleu_meteor_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 4.08
Avg fluency: 4.26
Avg morphosyntactic: 4.26
Avg semantic: 4.08
Avg pragmatic: 4.40


Evaluating translated_with_llama3.2_3b_EN_ID_0_1000_bleu_meteor.csv: 100%|██████████| 100/100 [04:22<00:00,  2.62s/it]
  out_df = pd.concat([out_df, add_df], ignore_index=True)


✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/newResult/eval_outputs/translated_with_llama3.2_3b_EN_ID_0_1000_bleu_meteor_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 3.75
Avg fluency: 3.83
Avg morphosyntactic: 3.83
Avg semantic: 3.75
Avg pragmatic: 4.06


Evaluating translated_with_qwen3_0.6b_EN_ID_0_1000_bleu_meteor.csv: 100%|██████████| 100/100 [04:44<00:00,  2.85s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/newResult/eval_outputs/translated_with_qwen3_0.6b_EN_ID_0_1000_bleu_meteor_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 2.74
Avg fluency: 3.10
Avg morphosyntactic: 3.10
Avg semantic: 2.74
Avg pragmatic: 3.25



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Evaluate each of the three translated CSVs listed here, one after another
# (first 1000 rows, 10 rows per request, raw replies saved to DEBUG_FILE).
input_csv = [
    "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv",
    "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_llama3.2_1b_ted_end_id_2018_with_bleu.csv",
    "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_qwen3_0.6b_ted_end_id_2018_with_bleu_clean_recomputed.csv",
]

for csv_path in input_csv:
    eval_csv_batched(
        csv_path,
        amount=1000,
        batch_size=10,
        save_debug=True,
        print_debug=False,
    )

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 100/100 [03:50<00:00,  2.30s/it]
  out_df = pd.concat([out_df, add_df], ignore_index=True)


✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/result/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 4.10
Avg fluency: 4.31
Avg morphosyntactic: 4.31
Avg semantic: 4.09
Avg pragmatic: 4.44


Evaluating translated_with_llama3.2_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 100/100 [03:50<00:00,  2.31s/it]
  out_df = pd.concat([out_df, add_df], ignore_index=True)


✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/result/eval_outputs/translated_with_llama3.2_1b_ted_end_id_2018_with_bleu_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 2.53
Avg fluency: 2.66
Avg morphosyntactic: 2.66
Avg semantic: 2.53
Avg pragmatic: 2.81


Evaluating translated_with_qwen3_0.6b_ted_end_id_2018_with_bleu_clean_recomputed.csv: 100%|██████████| 100/100 [03:40<00:00,  2.20s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/result/eval_outputs/translated_with_qwen3_0.6b_ted_end_id_2018_with_bleu_clean_recomputed_amount(1000)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 2.75
Avg fluency: 3.07
Avg morphosyntactic: 3.07
Avg semantic: 2.75
Avg pragmatic: 3.24



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Small run: first 20 rows, 10 rows per request, raw replies saved to DEBUG_FILE.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=20,
    batch_size=10,
    save_debug=True,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 2/2 [00:04<00:00,  2.27s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount(20)_batch(10)_(GPT5)_NewRubric.csv
Avg adequacy: 4.00
Avg fluency: 4.55
Avg morphosyntactic: 4.55
Avg semantic: 4.00
Avg pragmatic: 4.65



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Small run: first 20 rows, 10 rows per request, no debug output.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=20,
    batch_size=10,
    save_debug=False,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 2/2 [00:06<00:00,  3.04s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount20_batch(10)_(GPT5).csv
Avg adequacy: 3.90
Avg fluency: 4.50
Avg morphosyntactic: 4.50
Avg semantic: 3.90
Avg pragmatic: 4.45



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Small run: first 20 rows, 5 rows per request, no debug output.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=20,
    batch_size=5,
    save_debug=False,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount20_batch(5)_(GPT5).csv
Avg adequacy: 3.95
Avg fluency: 4.30
Avg morphosyntactic: 4.30
Avg semantic: 3.95
Avg pragmatic: 4.30



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Small run: first 20 rows, 15 rows per request, no debug output.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=20,
    batch_size=15,
    save_debug=False,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 2/2 [00:04<00:00,  2.24s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount20_batch(15)_(GPT5).csv
Avg adequacy: 4.00
Avg fluency: 4.40
Avg morphosyntactic: 4.40
Avg semantic: 4.00
Avg pragmatic: 4.15



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Small run: first 20 rows sent as a single request of 20, no debug output.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=20,
    batch_size=20,
    save_debug=False,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 1/1 [00:04<00:00,  4.47s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount20_batch(20)_(GPT5).csv
Avg adequacy: 4.05
Avg fluency: 4.45
Avg morphosyntactic: 4.45
Avg semantic: 4.05
Avg pragmatic: 4.20



  out_df = pd.concat([out_df, add_df], ignore_index=True)


In [None]:
# Larger single-request run: first 100 rows in one batch of 100, no debug output.
input_csv = "/content/drive/MyDrive/MT/english to indonesia/bleu/translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv"
eval_csv_batched(
    input_csv,
    amount=100,
    batch_size=100,
    save_debug=False,
    print_debug=False,
)

Evaluating translated_with_gemma3_1b_ted_end_id_2018_with_bleu.csv: 100%|██████████| 1/1 [00:41<00:00, 41.10s/it]

✅ Saved: /content/drive/MyDrive/MT/english to indonesia/Evaluator GPT-5-Chat/Testing Batch/eval_outputs/translated_with_gemma3_1b_ted_end_id_2018_with_bleu_amount(100)_batch(100)_(GPT5).csv
Avg adequacy: 4.21
Avg fluency: 4.31
Avg morphosyntactic: 4.31
Avg semantic: 4.21
Avg pragmatic: 4.36



  out_df = pd.concat([out_df, add_df], ignore_index=True)
