ACCURACY

In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 2.4 MB/s eta 0:00:06
     --- ------------------------------------ 1.2/12.8 MB 9.2 MB/s eta 0:00:02
     ----------- ---------------------------- 3.6/12.8 MB 20.6 MB/s eta 0:00:01
     ------------------- -------------------- 6.3/12.8 MB 28.9 MB/s eta 0:00:01
     --------------------------- ------------ 8.7/12.8 MB 32.7 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 50.4 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 59.5 MB/s eta 0:00:01
     -------------------------------------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import re
import statistics

from typing import List, Tuple, Dict

from sentence_transformers import SentenceTransformer, util
import spacy

from rapidfuzz import fuzz
 
from transformers import pipeline


MODEL_NAME = os.getenv("ST_EMBEDDING_MODEL", "sentence-transformers/paraphrase-mpnet-base-v2")
try:
    model = SentenceTransformer(MODEL_NAME)
except Exception:
    print("ERRORE")


SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except Exception:
    nlp = spacy.blank("en")
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")


def clean_quotes(text: str) -> str:
    """Normalise quotation marks and apostrophes"""
    text = text.replace("''", '"').replace("``", '"')
    text = text.replace("“", '"').replace("”", '"')
    text = text.replace("’", "'")
    return text


def merge_inside_quotes(sentences: List[str]) -> List[str]:
    """Joins broken sentences within quotation marks"""
    merged = []
    buffer = ""
    inside_quote = False

    for s in sentences:
        quote_count = s.count('"')
        if quote_count % 2 == 1:
            if not inside_quote:
                buffer = s
                inside_quote = True
            else:
                buffer += " " + s
                merged.append(buffer.strip())
                buffer = ""
                inside_quote = False
        else:
            if inside_quote:
                buffer += " " + s
            else:
                merged.append(s)

    if buffer:
        merged.append(buffer.strip())

    return merged


def sentence_split(text: str) -> List[str]:
    if not text:
        return []

    text = clean_quotes(text)

    doc = nlp(text)
    sents = [s.text.strip() for s in getattr(doc, "sents", []) if s.text.strip()]

    if not sents:
        sents = [x.strip() for x in re.split(r'(?<=[.!?])\s+(?=[A-Z"])', text) if x.strip()]

    sents = merge_inside_quotes(sents)

    return sents


def normalize_text(t: str) -> str:
    t = re.sub(r"[\"'`.,;:!?()\[\]{}<>]", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip().lower()


def appears_in_context_span(g_sent: str, context_sentences: List[str]) -> bool:
    """Check whether the phrase/entity appears literally (normalised) in the context."""
    g_norm = normalize_text(g_sent)
    return any(g_norm in normalize_text(c) for c in context_sentences)


def count_merge_errors(answer: str, merge_errors: List[str]) -> Tuple[int, List[str]]:
    """
    Count how many merge errors appear in the response.
    """
    hits = []
    answer_norm = answer.lower()
    for err in merge_errors or []:
        err_norm = err.lower()
        if re.search(rf"\b{re.escape(err_norm)}\b", answer_norm, flags=re.IGNORECASE):
            hits.append(err)
    return len(hits), hits



def evaluate_answer_accuracy(
    gemini_answer: str,
    context_sentences: List[str],
    merge_errors: List[str],
    golden_answer: str,
    penalty_error: float = 0.1,
    short_token_limit: int = 4,
) -> Dict[str, float]:

    # --- Step 1: Sentence split ---
    golden_sentences = sentence_split(golden_answer)
    gemini_sentences = sentence_split(gemini_answer)

    gemini_texts = gemini_sentences if gemini_sentences else [gemini_answer]
    golden_texts = golden_sentences if golden_sentences else []
    context_texts = context_sentences if context_sentences else []

    # --- Step 2: Batch encode ---
    to_encode = gemini_texts + golden_texts + context_texts
    all_embs = model.encode(to_encode, convert_to_tensor=True)

    g_len, gold_len = len(gemini_texts), len(golden_texts)

    gemini_embs = all_embs[:g_len]
    golden_embs = all_embs[g_len:g_len + gold_len] if gold_len > 0 else None
    context_embs = all_embs[g_len + gold_len:] if context_texts else None

    def score_against(target_texts: List[str], target_embs):
        if not target_texts or target_embs is None or len(target_embs) == 0:
            return {"median_cosine": 0.0, "median_fuzzy": 0.0,
                    "median_final": 0.0, "avg_median": 0.0}

        # cosine similarities in batch
        cos_matrix = util.cos_sim(gemini_embs, target_embs)

        all_embed, all_fuzzy = [], []
        norm_target_texts = [normalize_text(t) for t in target_texts]

        for i, g_sent in enumerate(gemini_texts):
            # best match embedding
            similarities = cos_matrix[i]
            best_idx = int(similarities.argmax())
            best_score_embed = float(similarities[best_idx])
            best_context = target_texts[best_idx]

            sim_embed_norm = (best_score_embed + 1) / 2
            sim_fuzzy = fuzz.token_sort_ratio(
                g_sent.lower(), best_context.lower()
            ) / 100.0

            # merge errors penality
            err_count, _ = count_merge_errors(g_sent, merge_errors)
            penalty_here = err_count * penalty_error * (1 + 0.5 * (sim_embed_norm + sim_fuzzy))

            # short literal match check
            tokens = [t for t in normalize_text(g_sent).split(" ") if t]
            is_short = len(tokens) <= short_token_limit
            literal_hit = any(normalize_text(g_sent) in t for t in norm_target_texts)

            if is_short and literal_hit:
                literal_score = max(0.0, 1.0 - penalty_here)
                all_embed.append(literal_score)
                all_fuzzy.append(literal_score)
            else:
                all_embed.append(max(0.0, sim_embed_norm - penalty_here))
                all_fuzzy.append(max(0.0, sim_fuzzy - penalty_here))

        if not all_embed and not all_fuzzy:
            return {"median_cosine": 0.0, "median_fuzzy": 0.0,
                    "median_final": 0.0, "avg_median": 0.0}

        # median
        median_embed = statistics.median(all_embed)
        median_fuzzy = statistics.median(all_fuzzy)
        avg_median = (median_embed + median_fuzzy) / 2.0
        median_final = statistics.median(all_embed + all_fuzzy)

        return {
            "median_cosine": median_embed,
            "median_fuzzy": median_fuzzy,
            "median_final": median_final,
            "avg_median": avg_median,
        }

    # --- Step 4: separate calculation and best choice ---
    scores_context = score_against(context_texts, context_embs)
    scores_golden = score_against(golden_texts, golden_embs)

    if scores_context["median_final"] >= scores_golden["median_final"]:
        best = scores_context
        best["source"] = "context"
    else:
        best = scores_golden
        best["source"] = "golden"

    best["final_score"] = best["median_final"]

    return {
        "median_cosine": best["median_cosine"],
        "median_fuzzy": best["median_fuzzy"],
        "median_final": best["median_final"],
        "avg_median": best["avg_median"],
        "final_score": best["final_score"],
        "source": best["source"],
    }

  from .autonotebook import tqdm as notebook_tqdm


KEY CONCEPT COVERAGE

In [None]:
from typing import Tuple, Dict, List
from rapidfuzz import fuzz
import spacy

# Carica modello spaCy (inglese o italiano)
nlp = spacy.load("en_core_web_sm")

def extract_key_tokens(key_concept: str) -> List[str]:
    """
    Extracts significant tokens from the generated key concept:
    Entity tokens (entities broken down into individual tokens) and other significant tokens (NOUN, PROPN, VERB, ADJ, PRON) not belonging to entities
    """
    doc = nlp(key_concept)
    
    entity_tokens = []
    entity_token_set = set()
    for ent in doc.ents:
        for tok in ent:
            if (tok.is_alpha or tok.like_num) and not tok.is_stop:
                tok_norm = normalize_text(tok.text)
                if tok_norm not in entity_token_set:
                    entity_tokens.append(tok.text)
                    entity_token_set.add(tok_norm)

    extra_tokens = []
    for tok in doc:
        tok_norm = normalize_text(tok.text)

        if tok.pos_ in {"NOUN", "PROPN", "ADJ"} and not tok.is_stop and tok.is_alpha:
            if tok_norm not in entity_token_set:
                extra_tokens.append(tok.text)
                entity_token_set.add(tok_norm)

        elif tok.pos_ == "VERB" and not tok.is_stop and tok.is_alpha:
            if tok.lemma_.lower() not in {"be", "have", "do"}:
                if tok_norm not in entity_token_set:
                    extra_tokens.append(tok.text)
                    entity_token_set.add(tok_norm)

    all_tokens = entity_tokens + extra_tokens

    return all_tokens


def evaluate_key_concept(answer: str, key_concept: str) -> float:
    """
    Evaluate the response by comparing significant tokens from the key_concept with the answer.
    """
    answer_norm = normalize_text(answer)
    key_tokens = extract_key_tokens(key_concept)
    
    scores: List[float] = []
    details: List[Dict] = []

    for token in key_tokens:
        token_norm = normalize_text(token)
        if token_norm in answer_norm:
            score = 1.0
            status = "exact match"
        else:
            score = 0.0
            status = "not found"
        scores.append(score)
        details.append({
            "token": token,
            "score": round(score, 3),
            "status": status
        })

    final_score = sum(scores) / len(scores) if scores else 0.0
    return round(final_score, 3)

COMPLETENESS

In [4]:
!pip install nltk




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from sentence_transformers import SentenceTransformer, util
import spacy
import numpy as np
from rapidfuzz import fuzz
# Inizializza il modello
model = SentenceTransformer("sentence-transformers/paraphrase-mpnet-base-v2")

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download("punkt_tab")  


def semantic_similarity(text1: str, text2: str) -> float:
    """
    Returns the similarity between two texts (0-1)    
    """
    emb1 = model.encode(text1, convert_to_tensor=True)
    emb2 = model.encode(text2, convert_to_tensor=True)
    score = util.cos_sim(emb1, emb2).item()
    return max(0.0, min(1.0, score))  

def fuzzy_entity_matching(golden_entities: List[str], response_entities: List[str], threshold: int = 80) -> int:
    """
    Count how many entities from the golden answer are present in the response approximately (similar words) using fuzzy matching.
    """
    matched = 0
    for g_ent in golden_entities:
        for r_ent in response_entities:
            if fuzz.ratio(g_ent.lower(), r_ent.lower()) >= threshold:
                matched += 1
                break  
    return matched


def extract_entities(text: str) -> List[str]:
    """Extracts entities from a text."""
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

def completeness_score(key_ideas: List[str], response: str, golden_answer: str) -> dict:
    """
    Completeness score optimised with batch embeddings:
    - Semantic score: median of the match between key ideas and the response
    - Entity score: fuzzy matching of entities
    - Final score: median between semantic_score and entity_score
    """
    response = "" if response is None else str(response)
    if not key_ideas:
        return {"semantic_score": 1.0, "entity_score": 1.0, "final_score": 1.0}

    response_sentences = sent_tokenize(response)

    # Step 1: semantic similarity (con batch)
    if response_sentences:
        resp_embs = model.encode(response_sentences, convert_to_tensor=True)
        key_embs = model.encode(key_ideas, convert_to_tensor=True)

        # Similarity metric: shape [len(key_ideas), len(response_sentences)]
        cos_matrix = util.cos_sim(key_embs, resp_embs)

        # Max for each key idea
        key_sims = cos_matrix.max(dim=1).values.cpu().numpy().tolist()
        semantic_score = float(np.median(key_sims))
    else:
        key_sims, semantic_score = [0.0] * len(key_ideas), 0.0

    # Step 2: fuzzy entity matching
    golden_entities = extract_entities(golden_answer)
    response_entities = extract_entities(response)
    if golden_entities:
        matched_count = fuzzy_entity_matching(golden_entities, response_entities, threshold=80)
        entity_score = matched_count / len(golden_entities)
    else:
        entity_score = 0.0

    # Step 3: final score = median between semantic_score e entity_score
    all_values = key_sims + [entity_score]
    final_median = float(np.median(all_values))
    final_mean = (semantic_score + entity_score) / 2.0

    return {
        "comp_semantic_score": round(semantic_score, 3),
        "comp_entity_score": round(entity_score, 3),
        "comp_final_median": round(final_median, 3),
        "comp_final_mean": round(final_mean, 3)
    }


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Elisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Elisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


CONCISENESS

In [None]:
def conciseness_score(response: str, answer: str) -> float:
    """
     Conciseness score:
    - 1.0 = same length as the golden answer
    - >1.0 = answer (gemini_answer) longer than the golden answer
    - <1.0 = answer shorter than the golden answer
    - 0.0 = empty answer
    """
    response = "" if response is None else str(response)
    answer = "" if answer is None else str(answer)

    L = len(response.split())  # gemini_answer length
    R = len(answer.split())    # golden_answer length

    if R == 0:
        return 1.0 if L == 0 else float("inf") 

    score = L / R
    return round(score, 3)


NUM_ADDITION & NUM_REPLICA_ERRORS

In [None]:
import re
import difflib
import unicodedata
from typing import List, Dict, Any
from difflib import SequenceMatcher

import numpy as np
import nltk
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)


def normalize_entity(ent: str) -> str:
    ent = ent.lower().strip()
    ent = re.sub(r"'s?$", "", ent)
    ent = re.sub(r"\s+", " ", ent)
    ent = ent.replace("’", "'")
    ent = re.sub(r"(?<=\w)\.(?=\w)", "", ent)
    ent = ent.replace(".", "")
    return ent

def normalize_text_mod(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    s = s.replace("\u00A0", " ")
    s = re.sub(r"[\u200B-\u200D\uFEFF]", "", s)
    s = (
        s.replace("’", "'")
        .replace("‘", "'")
        .replace("`", "'")
        .replace("“", '"')
        .replace("”", '"')
    )
    s = s.replace("–", "-").replace("—", "-")
    s = s.lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"\s+([\'\".,:;!?])", r"\1", s)
    return s.strip()


def _tokens_for_regex(phrase: str) -> List[str]:
    return re.findall(r"\w+", phrase.lower())

def _contains_phrase(text: str, phrase: str) -> bool:
    """
    Search for token sequence `phrase` in `text`
    """
    t = normalize_text_mod(text)
    p = normalize_text_mod(phrase)
    toks = _tokens_for_regex(p)
    if not toks:
        return False
    pattern = r"\b" + r"(?:\W+|_)*".join(map(re.escape, toks)) + r"\b"
    return re.search(pattern, t) is not None


# ----------------------------
# Entity coverage
# ----------------------------

def tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)


def extract_quoted_phrases(text: str) -> set:
    patterns = [r'"(.*?)"', r"“(.*?)”", r"``(.*?)''"]
    quotes = set()
    for pattern in patterns:
        for match in re.findall(pattern, text):
            cleaned = normalize_entity(match)
            if cleaned:
                quotes.add(cleaned)
    return quotes

def normalize_for_match(text: str) -> str:
    t = normalize_text_mod(text)  
    t = t.replace("-", " ")   
    t = re.sub(r"\s+", " ", t).strip()
    
    return t


def entity_coverage(gemini_answer: str, context: str, question: str) -> dict:
    matched: List[str] = []
    missing: List[str] = []
    answer_entities_tokens: set = set()
    answer_quotes: set = set()

    doc_answer = nlp(gemini_answer)
    relevant_labels = {
        "PERSON", "GPE", "ORG", "DATE", "QUANTITY",
        "CARDINAL", "TIME", "MONEY", "PERCENT", "WORK_OF_ART",
    }

    context_norm = normalize_for_match(context)
    context_words = {normalize_for_match(w) for w in tokenize(context_norm) if re.match(r"^\w+$", w)}
    context_phrases = {normalize_for_match(p) for p in extract_quoted_phrases(context_norm)}  
    context_all = context_words | context_phrases  

    question_all = {normalize_for_match(w) for w in tokenize(question) if re.match(r"^\w+$", w)}
    stop_words = set(stopwords.words("english"))

    answer_entities_tokens = set()
    for ent in doc_answer.ents:
        if ent.label_ in relevant_labels:
            tokens = [normalize_for_match(w) for w in tokenize(ent.text)]
            for t in tokens:
                if (
                    t not in stop_words
                    and re.match(r"^\w+$", t)
                ):
                    answer_entities_tokens.add(t)

    answer_quotes = {normalize_for_match(p) for p in extract_quoted_phrases(gemini_answer)}

    answer_entities = answer_entities_tokens | answer_quotes

    if not answer_entities:
        return {"score": -1.0, "matched": [], "missing": []}    
    
    for ent in answer_entities:
        if ent in context_all or ent in question_all or _contains_phrase(context, ent) or _contains_phrase(question, ent):
            matched.append(ent)
        else:
            missing.append(ent)

    frac = len(missing)
    return {
        "score": round(frac, 2),
        "matched": matched,
        "missing": missing,
    }


# ----------------------------
# Hallucination detection
# ----------------------------

nltk.download("stopwords")


def clean_differences(differences: list[str]) -> list[str]:
    cleaned = []
    for d in differences:
        if re.search(r"[A-Za-z0-9]", d):
            cleaned.append(d)
    return cleaned


def is_new_or_wrong(word: str, context_set: set) -> bool:
    """
    True if the word does NOT exist in the context (case-insensitive).
    if there is no exact match -> it is new/incorrect.
    """
    return word.lower() not in context_set


def get_differences_filtered(context, modified_text, question, language="english"):
    stop_words = set(stopwords.words(language))

    context_words = tokenize(context)
    print("context_words:", context_words)

    modified_words = tokenize(modified_text)
    question_words = {w.lower() for w in tokenize(question) if re.match(r"^\w+$", w)}

    matcher = SequenceMatcher(None, context_words, modified_words)
    result = []

    context_set = {w.lower() for w in context_words}

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag in ("replace", "insert"):
            diff_words = modified_words[j1:j2]

            for word in diff_words:
                w = word.lower()
                if (
                    w not in stop_words
                    and w not in question_words
                    and w not in context_set
                    and re.match(r"^\w+$", w)  
                    and is_new_or_wrong(w, context_set)  
                ):
                    result.append(word)

    return result

def find_modifications_and_score(text, modified_text, gemini_answer, question, language="english"):

    differences = get_differences_filtered(text, modified_text, question, language=language)
    differences = [normalize_entity(w) for w in differences]
    

    modified_text_tokens_all: List[str] = tokenize(modified_text)
    repeated_sequences = []
    for i in range(len(modified_text_tokens_all) - 1):
        if modified_text_tokens_all[i].lower() == modified_text_tokens_all[i + 1].lower():
            repeated_seq = f"{modified_text_tokens_all[i]} {modified_text_tokens_all[i+1]}"
            repeated_sequences.append(repeated_seq)

    differences = list(set(differences + repeated_sequences))

    differences = clean_differences(differences)
    
    if not differences:
        return {
            "Hallucination Score": -1.0,
            "random_word_in_gemini": []
        }

    stop_words = set(stopwords.words(language))


    gemini_tokens_nostop = [
    normalize_entity(w) for w in tokenize(gemini_answer)
    if w.lower() not in stop_words and re.match(r"^\w+$", w)
    ]
   
    gemini_answer_norm = normalize_text(gemini_answer)
    present = [diff for diff in differences 
            if diff in gemini_tokens_nostop or _contains_phrase(gemini_answer_norm, diff)]


    hallucination_score = len(present)
    
    return {
        "Hallucination Score": round(hallucination_score, 2),
        "random_word_in_gemini": present
    }

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re
import spacy
from typing import List, Dict, Any

nlp = spacy.load("en_core_web_sm")

def merge_lists(list1, list2):
    merged = set(list1 + list2)

    single_words = {w for w in merged if " " not in w and "'" not in w}
    phrases = {w for w in merged if w not in single_words}

    cleaned_phrases = set()
    for phrase in phrases:
        keep = True
        for word in single_words:
            if word in phrase:
                keep = False
                break
        if keep:
            cleaned_phrases.add(phrase)

    return list(single_words | cleaned_phrases)


# --- final function combining the two outputs ---
def analyze_results(entity_cov: Dict[str, Any], hallucination: Dict[str, Any]) -> Dict[str, Any]:
    missing = entity_cov.get("missing", [])
    randoms = hallucination.get("random_word_in_gemini", [])

    merged = merge_lists(missing, randoms)

    randoms_norm = {normalize_entity(r) for r in randoms}

    direct_modifications_list = []
    for item in merged:
        item_norm = normalize_entity(item)
        if " " in item:  # phrase
            words_in_item = [normalize_entity(w) for w in item.split()]
            if any(w in randoms_norm for w in words_in_item):
                direct_modifications_list.append(item)
        else:  # single word
            if item_norm in randoms_norm:
                direct_modifications_list.append(item)

    cleaned_merge = [m for m in merged if m not in direct_modifications_list]

    num_modifications = len(cleaned_merge)

    return {
        "Addition": cleaned_merge,
        "Num_addition": num_modifications,
        "Replicas": direct_modifications_list,
        "Num_replica_errors": len(direct_modifications_list)
    }


MAIN FUNCTION THAT CALLS THE OTHERS + LOADING OF THE VARIOUS FILES

In [None]:
# ----------------------------
# Cache embeddings
# ----------------------------
_embedding_cache: Dict[str, np.ndarray] = {}

def get_embedding(text: str):
    if not isinstance(text, str):
        text = str(text)  
    if text not in _embedding_cache:
        _embedding_cache[text] = model.encode(
            text, normalize_embeddings=True, convert_to_tensor=True
        )
    return _embedding_cache[text]

def score_response(reference_text: str,
                   question: str,
                   golden_answer: str,
                   gemini_answer: str,
                   key_ideas: List[str],
                   context_sentences: List[str],
                   key_concept: str,
                   modified_text: str) -> Dict[str, Any]:
    
    if not isinstance(gemini_answer, str):
        if gemini_answer is None or (isinstance(gemini_answer, float) and pd.isna(gemini_answer)):
            gemini_answer = ""   
        else:
            gemini_answer = str(gemini_answer)

    if not isinstance(reference_text, str):
        reference_text = str(reference_text or "")
    if not isinstance(question, str):
        question = str(question or "")
    if not isinstance(golden_answer, str):
        golden_answer = str(golden_answer or "")
    if not isinstance(modified_text, str):
        modified_text = str(modified_text or "")
        
    _embedding_cache.clear()

    # --- 1. Calcolo entity coverage ---
    entity_cov = entity_coverage(gemini_answer, reference_text, question)

    # --- 2. Calcolo hallucination detection ---
    hallucination = find_modifications_and_score(reference_text, modified_text, gemini_answer, question)

    # --- 3. Analisi combinata ---
    analysis = analyze_results(entity_cov, hallucination)
    print("Analysis:", analysis)

    # --- 4. Uso della lista merge come merge_errors ---
    accuracy_score = evaluate_answer_accuracy(
        gemini_answer,
        context_sentences,   
        analysis["Num_replica_errors"], 
        golden_answer,
        penalty_error=0.1,
        short_token_limit=4
    )

    completeness = completeness_score(key_ideas , gemini_answer, golden_answer)

    coverage_key_concept = evaluate_key_concept(gemini_answer , key_concept)

    conc = conciseness_score (gemini_answer , golden_answer)

    return {
        "acc_median_cosine": accuracy_score["median_cosine"],
        "acc_median_fuzzy": accuracy_score["median_fuzzy"],
        "acc_avg_median": accuracy_score["avg_median"],
        "acc_median_final": accuracy_score["median_final"], 
        "key_concept_coverage": coverage_key_concept,
        "comp_semantic_score": completeness.get("comp_semantic_score", 0.0),
        "comp_entity_score": completeness.get("comp_entity_score", 0.0),
        "comp_final_median": completeness.get("comp_final_median", 0.0),
        "comp_final_mean": completeness.get("comp_final_mean", 0.0),
        "Conciseness": conc,
        "Num_addition" : analysis["Num_addition"],
        "Num_replica_errors": analysis["Num_replica_errors"],
        "Addition": analysis["Addition"],
        "Replicas":analysis["Replicas"]
    }

In [10]:
!pip install pandas





[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import re
import json
import pandas as pd

# ---- FUNZIONE DI SUPPORTO ----
def load_data_from_jsonl(file_path):
    data_list = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            question = data.get("input", "")
            passages = data.get("passages", [])
            if passages:
                passage = passages[0]
                text = passage.get("text", "")
                sentences = passage.get("sentences", [])
            else:
                text, sentences = "", []
            output = data.get("output", [{}])[0]
            answer = output.get("answer", "")
            selected_sentences = output.get("selected_sentences", [])
            data_list.append({
                "question": question,
                "text": text,
                "sentences": sentences,
                "answer": answer,
                "selected_sentences": selected_sentences,
            })
    return data_list


clapnq_file = r"clapnq_train_answerable.jsonl"
data = load_data_from_jsonl(clapnq_file)

#FILE WITH THE EXTRACTED KEY CONCEPT
generated_file = r"generated_answers.csv"
generated_df = pd.read_csv(generated_file)

#FOLDER CONTAINING THE CSV FILES WITH THE ANSWERS
folder_path = r"GPT\GR"

for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        print(f"\n Start processing file: {file_name}")

        manip_df = pd.read_csv(file_path)
        all_scores = []

        for idx, row in manip_df.iterrows():
            test_id = row["test_id"]
            example = data[test_id]  
            key_concept = generated_df.iloc[test_id]["generated_answer"] if test_id < len(generated_df) else ""
            
            gemini_answer_clean = re.sub(r'^\s*(\d+\.\s*|\-\s+)', '', row["gemini_answer"], flags=re.MULTILINE)
            gemini_answer_clean = gemini_answer_clean.replace('*', '')

            scores = score_response(
                reference_text=example["text"],
                question=example["question"],
                golden_answer=example["answer"],
                gemini_answer=gemini_answer_clean,
                key_ideas=example["selected_sentences"],
                context_sentences=example["sentences"],
                key_concept=key_concept,
                modified_text=row["modified_text"]
            )
            
            all_scores.append(scores)
            print(f"Step {idx+1}/{len(manip_df)} done")

        scores_df = pd.DataFrame(all_scores)
        manip_df = pd.concat([manip_df, scores_df], axis=1)

        out_file = file_path.replace(".csv", "_NEW.csv")
        manip_df.to_csv(out_file, index=False)
        print(f"Save: {out_file}")


context_words: ['`', '`', 'Love', 'the', 'One', 'You', "'", 're', 'With', "'", "'", 'is', 'a', 'song', 'by', 'folk', 'rocker', 'Stephen', 'Stills', '.', 'It', 'was', 'released', 'as', 'the', 'lead', 'single', 'from', 'his', 'debut', 'self', '-', 'titled', 'studio', 'album', 'in', 'November', '1970', '.', 'The', 'song', ',', 'inspired', 'by', 'a', 'remark', 'Stills', 'heard', 'from', 'musician', 'Billy', 'Preston', ',', 'became', 'his', 'biggest', 'hit', 'single', ',', 'peaking', 'at', 'No', '.', '14', 'on', 'the', 'Billboard', 'Hot', '100', 'in', 'early', '1971', '.', 'David', 'Crosby', 'and', 'Graham', 'Nash', ',', 'Stills', "'", 'fellow', 'members', 'of', 'Crosby', ',', 'Stills', '&', 'Nash', ',', 'provide', 'background', 'vocals', 'on', 'the', 'song', '.', 'The', 'song', 'was', 'also', 'covered', 'by', 'a', 'number', 'of', 'artists', ',', 'including', 'The', 'Isley', 'Brothers', ',', 'Bucks', 'Fizz', ',', 'and', 'Luther', 'Vandross', '.']
Analysis: {'merge': [], 'num_modifications':