In [3]:
# Install the third-party packages for the evaluation cell below.
# NOTE(review): versions are unpinned, so a later run may pull different
# releases; matplotlib is not used in the visible cells — confirm it is needed.
%pip install evaluate datasets pandas matplotlib rouge_score bert_score




In [4]:
# 🔁 Mount Google Drive at /content/drive (Colab-only; the paths used further
# down point below this mount point).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from datasets import load_dataset, Dataset
from evaluate import load
import pandas as pd
import json
import os

# --- Paths ---
# Input and output locations are both derived from a single Drive folder.
BASE_DIR = "/content/drive/MyDrive/ma-colab"
DATA_PATH = BASE_DIR + "/merged_predictions_with_no_context_final.json"
RESULTS_PATH = BASE_DIR + "/results/alttext_metrics_comparison.csv"

# --- Column names ---
# Column holding the reference text that generated outputs are scored against.
REF_COLUMN = "openai_alt_text_refined"
# Columns holding the generated outputs of the two evaluated variants.
BASELINE_COLUMN = "generated_baseline"
FT_COLUMN = "generated_finetuned"
# The "no context" columns share the names above plus a common suffix.
BASELINE_NOCTX_COLUMN = BASELINE_COLUMN + "_no_context"
FT_NOCTX_COLUMN = FT_COLUMN + "_no_context"

# --- Load dataset ---
# The file is parsed as JSON Lines: one JSON object per line.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline); json.loads would reject them.
    examples = [json.loads(line) for line in f if line.strip()]

if not examples:
    raise ValueError(f"No examples found in {DATA_PATH}")

ds = Dataset.from_list(examples)

# Fail fast with a clear message if a column needed for the evaluation is absent.
missing_columns = [
    col
    for col in (
        REF_COLUMN,
        BASELINE_COLUMN,
        FT_COLUMN,
        BASELINE_NOCTX_COLUMN,
        FT_NOCTX_COLUMN,
    )
    if col not in ds.column_names
]
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns: {missing_columns}")

print(f"✅ Dataset geladen mit {len(ds)} Beispielen")

# --- Load metrics ---
# One metric object per name, fetched through `load` in the listed order.
bleu, meteor, rouge, bertscore = (
    load(metric_name)
    for metric_name in ("bleu", "meteor", "rouge", "bertscore")
)

# --- Evaluation ---
def evaluate_all_metrics(preds, refs, lang="en", batch_size=8):
    """Score predictions against references with BLEU, METEOR, ROUGE-L and BERTScore.

    Relies on the module-level metric objects ``bleu``, ``meteor``, ``rouge``
    and ``bertscore``.

    Args:
        preds: Iterable of predicted texts, one per example.
        refs: Iterable of reference texts aligned with ``preds`` (exactly one
            reference per prediction).
        lang: Language code forwarded to the BERTScore computation.
        batch_size: Batch size forwarded to the BERTScore computation.

    Returns:
        dict mapping the metric names "BLEU", "METEOR", "ROUGE-L",
        "BERTScore (P)", "BERTScore (R)" and "BERTScore (F1)" to their scores.
        The three BERTScore entries are the mean over all examples.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Materialise once: the inputs are consumed by four separate metric calls,
    # so a generator (or any one-shot iterable) must not be passed through as-is.
    preds = list(preds)
    refs = list(refs)

    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length, got {len(preds)} and {len(refs)}"
        )
    if not preds:
        # Without this guard the BERTScore averages below would divide by zero.
        raise ValueError("preds and refs must not be empty")

    # BLEU receives a list of references per prediction; the other metrics
    # receive the flat list of single references.
    nested_refs = [[ref] for ref in refs]

    results = {}

    results["BLEU"] = bleu.compute(predictions=preds, references=nested_refs)["bleu"]
    results["METEOR"] = meteor.compute(predictions=preds, references=refs)["meteor"]
    results["ROUGE-L"] = rouge.compute(predictions=preds, references=refs)["rougeL"]

    bert = bertscore.compute(
        predictions=preds,
        references=refs,
        lang=lang,
        batch_size=batch_size,
    )
    # BERTScore yields one value per example; report the corpus mean.
    results["BERTScore (P)"] = sum(bert["precision"]) / len(bert["precision"])
    results["BERTScore (R)"] = sum(bert["recall"]) / len(bert["recall"])
    results["BERTScore (F1)"] = sum(bert["f1"]) / len(bert["f1"])

    return results

# --- Computation ---
# Every variant is scored against the same reference column, so it is read once.
references = ds[REF_COLUMN]

# (label used in the progress message, column with the generated texts)
evaluation_runs = (
    ("Baseline", BASELINE_COLUMN),
    ("Fine-Tuned", FT_COLUMN),
    ("Baseline no Context", BASELINE_NOCTX_COLUMN),
    ("Fine-Tuned no Context", FT_NOCTX_COLUMN),
)

all_scores = []
for run_label, prediction_column in evaluation_runs:
    print(f"🔍 Evaluierung {run_label}...")
    all_scores.append(evaluate_all_metrics(ds[prediction_column], references))

# Keep the individual result names available for the cells that follow.
(
    results_baseline,
    results_finetuned,
    results_baseline_no_context,
    results_finetuned_no_context,
) = all_scores

# --- Compare results ---
# One row per evaluated variant, one column per metric, rounded to four decimals.
row_labels = ["Baseline", "Fine-Tuned", "Baseline No Context", "Fine-Tuned No Context"]
metric_rows = [
    results_baseline,
    results_finetuned,
    results_baseline_no_context,
    results_finetuned_no_context,
]
df_results = pd.DataFrame(metric_rows, index=row_labels)
df_results = df_results.round(4)

print("\n📊 Vergleich der Alt-Text-Metriken:")
print(df_results)

# --- Save ---
# Create the target folder if it does not exist yet, then write the table as CSV.
results_dir = os.path.dirname(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)
df_results.to_csv(RESULTS_PATH)
print(f"💾 Ergebnisse gespeichert unter: {RESULTS_PATH}")


✅ Dataset geladen mit 960 Beispielen


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🔍 Evaluierung Baseline...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 Evaluierung Fine-Tuned...
🔍 Evaluierung Baseline no Context...
🔍 Evaluierung Fine-Tuned no Context...

📊 Vergleich der Alt-Text-Metriken:
                         BLEU  METEOR  ROUGE-L  BERTScore (P)  BERTScore (R)  \
Baseline               0.1370  0.4199   0.3548         0.8931         0.9052   
Fine-Tuned             0.2397  0.5041   0.4683         0.9283         0.9251   
Baseline No Context    0.0954  0.3348   0.3058         0.8934         0.8882   
Fine-Tuned No Context  0.1271  0.3643   0.3390         0.9041         0.8918   

                       BERTScore (F1)  
Baseline                       0.8990  
Fine-Tuned                     0.9266  
Baseline No Context            0.8907  
Fine-Tuned No Context          0.8978  
💾 Ergebnisse gespeichert unter: /content/drive/MyDrive/ma-colab/results/alttext_metrics_comparison.csv
