In [None]:
import os
import csv
import pickle
from typing import List, Tuple

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# NLTK is an optional dependency. If the import fails for any reason, both
# names are bound to None; evaluate_generation checks `corpus_bleu is not None`
# and reports BLEU as None instead of failing.
try:
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
except Exception:
    corpus_bleu = None  # type: ignore
    SmoothingFunction = None  # type: ignore

In [None]:
# rouge-score is an optional dependency. If the import fails for any reason,
# `rouge_scorer` is bound to None; evaluate_generation checks for that and
# reports every ROUGE value as None instead of failing.
try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None  # type: ignore

In [None]:
def read_conversation_csv(csv_path: str) -> Tuple[List[str], List[str]]:
    """Read question/answer pairs from a CSV file with a header row.

    The file must have ``question`` and ``answer`` columns. Both values are
    stripped of surrounding whitespace, and rows where either one is missing
    or empty are skipped.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        A ``(questions, answers)`` tuple of equally long lists, in file order.

    Raises:
        ValueError: If no row yields a non-empty question and answer.
        OSError: If the file cannot be opened.
    """
    questions: List[str] = []
    answers: List[str] = []
    # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading BOM
    # (as written by Excel's "CSV UTF-8" export). With plain 'utf-8' the BOM
    # sticks to the first header ('\ufeffquestion'), so every row is skipped.
    with open(csv_path, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # DictReader fills missing trailing fields with None, hence `or ''`.
            q = (row.get('question') or '').strip()
            a = (row.get('answer') or '').strip()
            if not q or not a:
                continue
            questions.append(q)
            answers.append(a)
    if not questions:
        # Report the file that was actually read, not a hard-coded name.
        raise ValueError(f"No question/answer rows found in {csv_path}")
    return questions, answers

In [None]:
def load_artifacts(base_dir: str):
    """Load the three pickled retrieval artifacts stored in ``base_dir``.

    Reads, in this order, ``tfidf_vectorizer.pkl``, ``qa_matrix.pkl`` and
    ``qa_answers.pkl`` and returns the unpickled objects as a
    ``(vectorizer, qa_matrix, answers)`` tuple.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only point this at artifact files you trust.
    """
    def _unpickle(filename: str):
        # Open, deserialise and close one artifact file.
        with open(os.path.join(base_dir, filename), "rb") as handle:
            return pickle.load(handle)

    vectorizer = _unpickle("tfidf_vectorizer.pkl")
    qa_matrix = _unpickle("qa_matrix.pkl")
    answers = _unpickle("qa_answers.pkl")
    return vectorizer, qa_matrix, answers

In [None]:
def evaluate_classification(
    questions: List[str],
    gold_answers: List[str],
    vectorizer,
    qa_matrix,
    index_to_answer: List[str],
    threshold: float = 0.35,
):
    """Score top-1 retrieval (with abstention) by exact match on answer text.

    For each question the most cosine-similar row of ``qa_matrix`` is taken as
    the prediction; if its similarity is below ``threshold`` the system
    abstains. A prediction is correct when the retrieved answer string equals
    the gold answer string.

    Metrics:
        precision = correct / number of questions that were answered
        recall    = correct / number of questions whose gold answer appears
                    in ``index_to_answer``
        f1        = harmonic mean of the two
        support   = the recall denominator

    Any ratio with a zero denominator is reported as 0.0.

    The earlier implementation marked a prediction positive only when it was
    already correct, so false positives could not occur and precision was
    always 1.0 (or 0.0). Recall and support are numerically unchanged.

    Args:
        questions: Questions to evaluate.
        gold_answers: Expected answer text, aligned with ``questions``; pairs
            are formed with ``zip``, so extra items in the longer list are
            ignored.
        vectorizer: Object whose ``transform([text])`` result is accepted by
            ``cosine_similarity`` together with ``qa_matrix``.
        qa_matrix: Matrix with one row per entry of ``index_to_answer``.
        index_to_answer: Answer text for each row of ``qa_matrix``.
        threshold: Minimum similarity (inclusive) required to answer.

    Returns:
        Dict with float ``precision``, ``recall``, ``f1`` and int ``support``.
    """
    # Only membership is needed to decide whether a gold answer is retrievable.
    known_answers = set(index_to_answer)

    answered = 0    # questions where the system did not abstain
    answerable = 0  # questions whose gold answer exists in the index
    correct = 0     # answered questions whose retrieved text equals gold

    for q, gold in zip(questions, gold_answers):
        user_vec = vectorizer.transform([q])
        sims = cosine_similarity(user_vec, qa_matrix).flatten()
        best_idx = int(np.argmax(sims))
        best_score = float(sims[best_idx])

        if gold in known_answers:
            answerable += 1
        if best_score >= threshold:
            answered += 1
            if index_to_answer[best_idx] == gold:
                correct += 1

    precision = correct / answered if answered else 0.0
    recall = correct / answerable if answerable else 0.0
    denom = precision + recall
    f1 = 2.0 * precision * recall / denom if denom else 0.0

    return {
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "support": int(answerable),
    }

In [None]:
def evaluate_generation(
    questions: List[str],
    gold_answers: List[str],
    vectorizer,
    qa_matrix,
    index_to_answer: List[str],
):
    """Compare top-1 retrieved answers with gold answers using BLEU and ROUGE.

    Every question is answered with the ``index_to_answer`` entry whose
    ``qa_matrix`` row has the highest cosine similarity (no threshold is
    applied). The returned dict has the keys ``bleu``, ``rouge1_f``,
    ``rouge2_f`` and ``rougeL_f``:

    * ``bleu`` is the corpus-level BLEU over whitespace-split tokens, or None
      when ``corpus_bleu`` is unavailable or raises.
    * the ROUGE entries are mean F-measures over all pairs, or None when
      ``rouge_scorer`` is unavailable or there are no pairs.
    """
    def _retrieve(question: str) -> str:
        # Top-1 answer text for one question.
        scores = cosine_similarity(vectorizer.transform([question]), qa_matrix).flatten()
        return index_to_answer[int(np.argmax(scores))]

    preds: List[str] = [_retrieve(question) for question in questions]

    metrics = {}

    # --- BLEU (needs nltk) ---
    if corpus_bleu is None:
        metrics["bleu"] = None
    else:
        if SmoothingFunction:
            smoothing = SmoothingFunction().method3
        else:
            smoothing = None
        # One single-reference list per hypothesis, all whitespace-tokenised.
        tokenised_refs = [[gold.split()] for gold in gold_answers]
        tokenised_hyps = [pred.split() for pred in preds]
        try:
            metrics["bleu"] = float(
                corpus_bleu(tokenised_refs, tokenised_hyps, smoothing_function=smoothing)
            )
        except Exception:
            # Best effort: a BLEU failure must not abort the ROUGE computation.
            metrics["bleu"] = None

    # --- ROUGE (needs rouge-score) ---
    if rouge_scorer is None:
        metrics["rouge1_f"] = None
        metrics["rouge2_f"] = None
        metrics["rougeL_f"] = None
    else:
        scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
        pair_scores = [scorer.score(ref, hyp) for ref, hyp in zip(gold_answers, preds)]

        def _mean_f(rouge_type: str):
            # Mean F-measure of one ROUGE variant; None when there are no pairs.
            f_values = [scores[rouge_type].fmeasure for scores in pair_scores]
            return float(np.mean(f_values)) if f_values else None

        metrics["rouge1_f"] = _mean_f("rouge1")
        metrics["rouge2_f"] = _mean_f("rouge2")
        metrics["rougeL_f"] = _mean_f("rougeL")

    return metrics

In [None]:
def main():
    """Run the full evaluation and print both metric dictionaries.

    ``Conversation.csv`` and the pickled artifacts are looked up next to this
    file. When ``__file__`` is not defined (for example when the code runs as
    a notebook cell), the current working directory is used instead.
    """
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # No __file__ in this execution context: fall back to the working dir.
        base_dir = os.getcwd()
    csv_path = os.path.join(base_dir, "Conversation.csv")
    questions, gold_answers = read_conversation_csv(csv_path)

    vectorizer, qa_matrix, answers = load_artifacts(base_dir)

    cls_metrics = evaluate_classification(questions, gold_answers, vectorizer, qa_matrix, answers)
    gen_metrics = evaluate_generation(questions, gold_answers, vectorizer, qa_matrix, answers)

    print("Classification (exact-match on answer text):")
    print(cls_metrics)
    print("\nGeneration metrics:")
    print(gen_metrics)

In [None]:
# Entry point: run the evaluation only when this module is executed as the
# main program, not when it is imported.
if __name__ == "__main__":
    main()