# In [None]:
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

# Tokenizer data required by nltk.word_tokenize, which produces the BLEU tokens.
# Older NLTK releases read the pickled 'punkt' models, while NLTK 3.9 and later
# look up 'punkt_tab' instead; fetch both so tokenization works on either.
# nltk.download reports a failed fetch by returning False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

# 1. Load the results
# The JSON file is expected to hold a list of records; each record is read
# further down through its "doctor_answer" and "model_answer" keys.
RESULTS_PATH = "/kaggle/working/mistral_zero_shot_results.json"
with open(RESULTS_PATH, mode="r", encoding="utf-8") as results_file:
    results = json.load(results_file)

# 2. Metric calculators
# NOTE(review): use_stemmer=True applies the Porter (English) stemmer and
# nltk.word_tokenize defaults to English -- confirm the answers are English.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smooth = SmoothingFunction().method1

# One list of per-sample scores for each metric, filled in the loop below.
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bleu_scores = []

for entry in results:
    reference = entry["doctor_answer"]
    candidate = entry["model_answer"]

    # ROUGE: score the model answer against the doctor answer and keep only
    # the F-measure of each variant.
    rouge_result = rouge.score(reference, candidate)
    for metric_name, bucket in (
        ('rouge1', rouge1_scores),
        ('rouge2', rouge2_scores),
        ('rougeL', rougeL_scores),
    ):
        bucket.append(rouge_result[metric_name].fmeasure)

    # BLEU (word-level tokenization): single reference, smoothed so that a
    # missing higher-order n-gram match does not zero out the whole score.
    bleu_scores.append(
        sentence_bleu(
            [nltk.word_tokenize(reference)],
            nltk.word_tokenize(candidate),
            smoothing_function=smooth,
        )
    )

# 3. Compute and print the average scores
def _mean_score(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty.

    An empty results file leaves every score list empty; returning NaN keeps
    the report printable instead of raising ZeroDivisionError, and makes it
    obvious that no samples were scored.
    """
    if not values:
        return float("nan")
    return sum(values) / len(values)


print(f"ROUGE-1 F1: {_mean_score(rouge1_scores):.4f}")
print(f"ROUGE-2 F1: {_mean_score(rouge2_scores):.4f}")
print(f"ROUGE-L F1: {_mean_score(rougeL_scores):.4f}")
print(f"BLEU:       {_mean_score(bleu_scores):.4f}")
