### Fuzzy Matching beim Chunk-Ranking -> wurde geändert zu Cosine Similarity (semantisch und nicht mehr stringbasiert)

In [None]:
# Rank the paragraphs of `context` by their aggregate relevance to all
# evaluation questions, keep the best `top_k`, and write them to
# 'reduced_context.txt'. Expects `json` and `context` to be defined already.

# Cell-local import so this cell does not depend on a later cell having been
# executed first.
from thefuzz import fuzz

# 1) Load all evaluation questions
with open("question-sets/q_v2.json", "r", encoding="utf-8") as f:
    all_questions = json.load(f)
print(f"Total questions: {len(all_questions)}")

# 2) Split and filter context into chunks
context_chunks = context.split("\n\n")
print(f"Total chunks: {len(context_chunks)}")

min_words = 20
filtered_chunks = [ch for ch in context_chunks if len(ch.split()) >= min_words]
print(f"Chunks ≥ {min_words} words: {len(filtered_chunks)}")

# 3) Compute aggregate relevance score per chunk
# Lower-case every question once instead of once per (chunk, question) pair.
lowered_questions = [q["question"].lower() for q in all_questions]

chunk_scores = []
for idx, chunk in enumerate(filtered_chunks):
    lowered_chunk = chunk.lower()

    # Sum relevance scores across all questions
    total_score = 0
    for question_text in lowered_questions:
        # NOTE(review): this loop previously had no body (only a comment), which
        # made the whole cell fail with an IndentationError. The string-based
        # scorer from the earlier experiment is restored so the cell runs; the
        # semantic (cosine-similarity) scorer should be plugged in here.
        total_score += fuzz.partial_ratio(question_text, lowered_chunk)

    # Optionally, average = total_score / len(all_questions)
    chunk_scores.append((idx, total_score, chunk))

# 4) Sort chunks by descending total_score
chunk_scores.sort(key=lambda x: x[1], reverse=True)

# 5) Select top_k most relevant chunks overall
top_k = 5
top_chunks = chunk_scores[:top_k]
print(f"Selected top {top_k} chunks based on aggregate relevance:")
for rank, (idx, score, _) in enumerate(top_chunks, 1):
    print(f"  {rank}. Chunk #{idx} — Total Score: {score}")

# 6) Build the reduced context and save
reduced_context = "\n\n".join(chunk for (_, _, chunk) in top_chunks)
print(f"\nReduced context char length: {len(reduced_context)}")

with open("reduced_context.txt", "w", encoding="utf-8") as f:
    f.write(reduced_context)

# Guard against an empty context, which would otherwise divide by zero.
reduction_pct = (1 - len(reduced_context) / len(context)) * 100 if context else 0.0
print(f"Context length reduced by {reduction_pct:.2f}%")
print("Reduced context saved to 'reduced_context.txt'")


### Ursprüngliche Evaluationsmethode war stringbasiert (Fuzzy Matching) -> wurde geändert zu semantischer Methode (dauert etwas länger, ist aber deutlich sinnvoller: kein manueller Zusatzaufwand, skalierbar)

In [None]:
from thefuzz import fuzz

def evaluate_qa(
    data: list,
    context: str,
    nlp_callable,
    threshold: int,
) -> float:
    """Score a QA callable against reference answers via fuzzy string matching.

    For every item in ``data`` the ``"question"`` value is passed, together
    with ``context``, to ``nlp_callable`` as a ``{"question", "context"}``
    dict. The ``"answer"`` field of the result is compared case-insensitively
    with the item's ``"answer"`` value using ``fuzz.ratio``; a prediction
    counts as correct when that ratio is at least ``threshold``.

    A per-question line and a final summary line are printed.

    Args:
        data: Items providing ``"question"`` and ``"answer"`` keys.
        context: Text handed to the QA callable for every question.
        nlp_callable: Callable taking the input dict and returning a mapping
            with an ``"answer"`` entry.
        threshold: Minimum ``fuzz.ratio`` score for a prediction to count.

    Returns:
        Percentage of correct predictions, or 0 when ``data`` is empty.
    """
    n_total = len(data)
    n_hits = 0

    for entry in data:
        question_text = entry["question"]
        reference = entry["answer"]

        output = nlp_callable({"question": question_text, "context": context})
        predicted = output["answer"]

        # Evaluate the threshold check once and reuse it for display and count.
        score = fuzz.ratio(predicted.lower(), reference.lower())
        is_match = score >= threshold

        marker = "✅" if is_match else "❌"
        print(f"{marker} Question: {question_text}")
        print(f"   Expected: {reference}         Received: {predicted}\n")

        if is_match:
            n_hits += 1

    # Avoid dividing by zero when there is nothing to evaluate.
    if n_total:
        accuracy = (n_hits / n_total) * 100
    else:
        accuracy = 0

    print(f"\n✅ Accuracy: {accuracy:.2f}% ({n_hits}/{n_total} correct)")
    return accuracy
