In [None]:
!pip install google-generativeai pandas numpy scikit-learn rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e33b09bc44f3c3a19868f842f0c8dc9ceeb3ec007f9ada25885366e4ecc8f1be
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.23.1-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.23.1-py3-none-any.whl (127 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.23.1


In [None]:
import json
import os
import re
from typing import List, Dict, Tuple
import pandas as pd
from rouge_score import rouge_scorer
import time
from groq import Groq
from tabulate import tabulate
from google.colab import userdata

# --- Groq API Configuration ---
# Model used as the LLM judge for every scoring request in this notebook.
GROQ_MODEL = "llama3-70b-8192"
try:
    # Never hardcode the key in the notebook: read it from the environment
    # first, then fall back to Colab Secrets (the two sources the error
    # message below promises to check).
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
    if not GROQ_API_KEY:
        try:
            GROQ_API_KEY = userdata.get("GROQ_API_KEY") or ""
        except Exception:
            # A missing secret or denied notebook access is reported through
            # the single, clearer ValueError raised below.
            GROQ_API_KEY = ""
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not set in Colab Secrets or environment.")
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    # Surface the reason, then re-raise so later cells do not run without a client.
    print(f"Error configuring Groq API: {e}")
    raise

# --- Prompt Templates ---
# Template for one LLM-as-a-judge request. It is filled with str.format() using
# the fields criterion_name, criterion_description, question, answer_a and
# answer_b. The doubled braces ({{ and }}) are format escapes that render as
# literal { and } in the JSON examples shown to the model.
# The judge is asked to reply with a JSON object holding 'reasoning' and a
# 0-10 'score'; parse_score() is the consumer of that reply.
NUMERIC_SCORING_PROMPT = """
You are an impartial AI grader. Your task is to evaluate Answer B relative to Answer A (the reference answer) based on the following criterion.

**Criterion:** {criterion_name}: {criterion_description}

**Question:**
{question}

**Answer A (Reference):**
{answer_a}

**Answer B (System Response):**
{answer_b}

**Instructions:**
1. Analyze both answers step-by-step, comparing how well Answer B satisfies the criterion relative to Answer A.
2. Assign a score to Answer B from 0 (poor) to 10 (excellent).
3. Return *only* a valid JSON object with 'reasoning' and 'score' fields, with no additional text or markdown.
4. Ensure the output is valid JSON, e.g.:
{{
  "reasoning": "Your step-by-step analysis here",
  "score": 7
}}

**Output:**
{{
  "reasoning": "Your step-by-step analysis here",
  "score": <number>
}}
"""

# --- Criterion Definitions ---
# Judging criteria, keyed by name; each value is the description inserted into
# the scoring prompt. Insertion order determines the order criteria are scored.
CRITERIA = dict(
    comprehensiveness="How much detail does the answer provide to cover all aspects and details of the question?",
    diversity="How varied is the answer in perspectives or examples?",
    empowerment="Does the answer help the user understand or act confidently?",
    directness="Is the answer concise and directly answering the question?",
)

# --- Groq API Call ---
def call_groq(prompt: str, retries: int = 3) -> str:
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Every attempt, successful or not, is followed by the same pause so that
    consecutive requests (including retries and the next caller's request)
    stay within the 6-requests-per-minute budget this notebook assumes.

    Args:
        prompt: Full user message to send to the model.
        retries: Maximum number of attempts. With ``0`` no request is made and
            the error payload is returned immediately.

    Returns:
        The model's message content, or, when every attempt fails or yields an
        empty reply, a JSON string with an error ``reasoning`` and ``score`` -1
        that ``parse_score`` maps to a failed score.
    """
    error_response = '{"reasoning": "Error: Failed to get response from Groq.", "score": -1}'
    request_interval_s = 10  # Rate limit: 6 RPM = 10 seconds per request

    for attempt in range(retries):
        content = None
        try:
            completion = client.chat.completions.create(
                model=GROQ_MODEL,
                messages=[{"role": "user", "content": prompt}],
                # NOTE(review): a non-zero temperature makes the judge's scores
                # vary between runs; consider 0 if reproducibility matters.
                temperature=0.7,
                max_tokens=1024
            )
            content = completion.choices[0].message.content
        except Exception as e:
            print(f"Groq API error on attempt {attempt + 1}: {e}")
        else:
            if not content:
                # The API answered but without text; treat it like a failure
                # so the caller never receives None or an empty string.
                print(f"Groq API returned an empty response on attempt {attempt + 1}.")

        # Pause after every request, failed ones included: a failed request
        # still counts against the rate limit, and retrying sooner tends to
        # fail again for the same reason.
        time.sleep(request_interval_s)

        if content:
            return content

    return error_response

# --- Score Parser ---
def parse_score(response: str) -> Tuple[float, str]:
    """Extract the judge's score and reasoning from a model response.

    The response is scanned for the first substring that decodes as a JSON
    object. Decoding with a real JSON parser (instead of a brace-matching
    regex) keeps working when the reasoning text itself contains braces or
    when prose with stray braces precedes the object. If no usable numeric
    score is found in JSON, a narrative fallback looks for phrases such as
    "score of 7".

    Args:
        response: Raw text returned by the model.

    Returns:
        ``(score, reasoning)``. ``score`` is within 0-10 on success and
        ``-1.0`` on failure, in which case ``reasoning`` carries an error
        message. Unparseable responses are appended to ``parsing_errors.log``.
    """
    if not isinstance(response, str):
        # Defensive: an upstream failure may hand over None instead of text.
        response = "" if response is None else str(response)

    # Try each '{' as a potential start of a JSON object.
    decoder = json.JSONDecoder()
    data = None
    brace_at = response.find("{")
    if brace_at == -1:
        print(f"No JSON block found in response: {response}")
    else:
        while brace_at != -1:
            try:
                candidate, _ = decoder.raw_decode(response, brace_at)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                data = candidate
                break
            brace_at = response.find("{", brace_at + 1)
        if data is None:
            print(f"Invalid JSON in response: {response}")

    if data is not None:
        try:
            score = float(data.get("score", -1))
        except (TypeError, ValueError):
            # e.g. "score": "7/10" or "score": null - fall through to the
            # narrative fallback instead of crashing the whole evaluation.
            print(f"Non-numeric score in response: {data.get('score')!r}")
        else:
            if not (0 <= score <= 10):
                return -1.0, "Invalid score: Not in range 0–10"
            return score, data.get("reasoning", "No reasoning provided")

    # Fallback: Try to extract score from narrative text. The trailing (?!\d)
    # stops "score of 100" from being read as a score of 10.
    score_match = re.search(
        r"(?:score of|assigned a score of)\s*(\d{1,2}(?:\.\d+)?)(?!\d)",
        response,
        re.IGNORECASE,
    )
    if score_match:
        score = float(score_match.group(1))
        if 0 <= score <= 10:
            return score, "Fallback: Score extracted from narrative text"

    # Log full response for debugging
    with open("parsing_errors.log", "a", encoding="utf-8") as f:
        f.write(f"Invalid response: {response}\n\n")
    return -1.0, "Error: Failed to parse JSON or extract score"

# --- ROUGE Score Calculator ---
def calculate_rouge(answer_a: str, answer_b: str) -> Dict[str, float]:
    """Return the ROUGE-L F-measure between two answers.

    ``answer_a`` is passed to the scorer first and ``answer_b`` second, with
    stemming enabled. The result is a one-entry dict keyed by ``"rougeL"``.
    """
    metric = 'rougeL'
    rouge = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    rouge_l = rouge.score(answer_a, answer_b)[metric]
    return {"rougeL": rouge_l.fmeasure}

# --- Answer Validation ---
def validate_jsons(original_data: Dict, compare_data: Dict) -> List[Tuple[Dict, Dict]]:
    """Validate two PDF-keyed answer sets and pair up their matching questions.

    Both inputs map a PDF name to a list of entries, each entry being a dict
    with at least ``"question"`` and ``"answer"``. For every PDF present in
    both inputs, entries whose ``"question"`` text matches exactly are paired.

    Pairs are produced in a deterministic order: PDFs in the order they appear
    in ``original_data``, questions in the order they appear in that PDF's
    original entries. If a question is repeated within one PDF, the last
    occurrence supplies the entry.

    Args:
        original_data: Reference answers, keyed by PDF name.
        compare_data: System answers, keyed by PDF name.

    Returns:
        A list of ``(original_entry, compare_entry)`` tuples. Each entry is a
        shallow copy of the input entry with a ``"pdf"`` key added. An empty
        list is returned (after printing the reason) when the inputs are not
        dicts, share no PDFs, contain a malformed entry, or share no questions.
    """
    required_fields = {"question", "answer"}
    pairs = []

    if not isinstance(original_data, dict) or not isinstance(compare_data, dict):
        print("Error: Expected PDF-keyed JSON objects at the top level.")
        return []

    # Get common PDF names, keeping the original file's order so that runs
    # are reproducible (set iteration order is not stable across runs).
    common_pdfs = [pdf for pdf in original_data if pdf in compare_data]
    if not common_pdfs:
        print("Error: No common PDF names found.")
        return []

    for pdf in common_pdfs:
        original_entries = original_data[pdf]
        compare_entries = compare_data[pdf]

        # Validate fields BEFORE indexing by "question"; otherwise a malformed
        # entry raises KeyError instead of producing the error message below.
        for entries, name in [(original_entries, "original"), (compare_entries, "AURA")]:
            for item in entries:
                if not isinstance(item, dict) or not required_fields.issubset(item):
                    print(f"Error: Missing fields in {name} JSON for PDF {pdf}: {item}")
                    return []

        # Create question-to-entry mappings
        original_map = {entry["question"]: entry for entry in original_entries}
        compare_map = {entry["question"]: entry for entry in compare_entries}

        # Find common questions (in original order)
        common_questions = [q for q in original_map if q in compare_map]
        if not common_questions:
            print(f"Error: No matching questions found for PDF {pdf}.")
            continue

        # Create pairs with PDF context; copy so the caller's data is not mutated.
        for question in common_questions:
            original_entry = original_map[question].copy()
            compare_entry = compare_map[question].copy()
            original_entry["pdf"] = pdf  # Add pdf field for downstream compatibility
            compare_entry["pdf"] = pdf
            pairs.append((original_entry, compare_entry))

    if not pairs:
        print("Error: No matching question-PDF pairs found.")
        return []
    return pairs
# --- Evaluation Runner ---
def evaluate_answers_with_scores(original_json_path: str, compare_json_path: str) -> Dict:
    """Score system answers against reference answers and summarise the results.

    For every question present in both files, each criterion in ``CRITERIA``
    is scored by the LLM judge and a ROUGE-L F-measure is computed.

    Side effects:
        * writes ``evaluation_numeric_scores.csv`` (one row per question),
        * writes ``evaluation_detailed_results.json`` (scores plus reasoning),
        * prints progress lines and a summary table.

    Failed judgements are stored as ``-1`` in both output files, but they are
    excluded from the averages: ``-1`` is an error sentinel, not a score, and
    averaging it in would silently deflate the reported means.

    Args:
        original_json_path: Path to the reference (Answer A) JSON file.
        compare_json_path: Path to the system (Answer B) JSON file.

    Returns:
        A dict with ``avg_<criterion>_score`` for every criterion plus
        ``avg_rougeL``. A criterion average is NaN if every judgement for it
        failed. An empty dict is returned if a file is missing, is not valid
        JSON, or no question pairs could be matched.
    """
    # Load JSON files
    try:
        with open(original_json_path, 'r', encoding="utf-8") as f:
            original_data = json.load(f)
        with open(compare_json_path, 'r', encoding="utf-8") as f:
            compare_data = json.load(f)
    except FileNotFoundError as e:
        print(f"Error: JSON file not found: {e}")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return {}

    # Validate and match pairs
    pairs = validate_jsons(original_data, compare_data)
    if not pairs:
        return {}
    print(f"Found {len(pairs)} matching question-PDF pairs for evaluation.")

    rows = []
    detailed_results = []
    parse_errors = 0

    for i, (original_entry, compare_entry) in enumerate(pairs, 1):
        question = original_entry["question"]
        answer_a = original_entry["answer"]
        answer_b = compare_entry["answer"]
        pdf = original_entry["pdf"]
        print(f"Processing question {i}/{len(pairs)}: {question[:50]}...")

        row = {"question": question, "pdf": pdf}
        detailed_entry = {
            "question": question,
            "pdf": pdf,
            "answer_a": answer_a,
            "answer_b": answer_b,
            "scores": {}
        }

        # LLM-as-a-Judge Scoring: one request per criterion.
        for criterion, description in CRITERIA.items():
            prompt = NUMERIC_SCORING_PROMPT.format(
                question=question,
                criterion_name=criterion.capitalize(),
                criterion_description=description,
                answer_a=answer_a,
                answer_b=answer_b
            )
            response = call_groq(prompt)
            score, reasoning = parse_score(response)
            row[f"{criterion}_score"] = score
            detailed_entry["scores"][criterion] = {"score": score, "reasoning": reasoning}
            if score == -1.0:
                parse_errors += 1

        # ROUGE Score
        rouge_scores = calculate_rouge(answer_a, answer_b)
        row["rougeL"] = rouge_scores["rougeL"]
        detailed_entry["rougeL"] = rouge_scores["rougeL"]

        rows.append(row)
        detailed_results.append(detailed_entry)

    # Create DataFrame (the CSV keeps the raw -1 sentinels for traceability)
    df = pd.DataFrame(rows)
    df.to_csv("evaluation_numeric_scores.csv", index=False)

    # Save Detailed Results
    with open("evaluation_detailed_results.json", "w", encoding="utf-8") as f:
        json.dump(detailed_results, f, indent=2)

    # Summary: average only valid scores (0-10); drop the -1 failure sentinel.
    summary = {}
    for c in CRITERIA:
        criterion_scores = df[f"{c}_score"]
        summary[f"avg_{c}_score"] = criterion_scores[criterion_scores >= 0].mean()
    summary["avg_rougeL"] = df["rougeL"].mean()

    # Create Formatted Table
    table_data = [
        [k.replace("_score", "").capitalize(), f"{v:.2f}" + ("/10" if "rougeL" not in k else "")]
        for k, v in summary.items()
    ]
    print("\n--- Numeric Scoring Summary ---")
    print(tabulate(table_data, headers=["Metric", "Average Score"], tablefmt="grid"))
    if parse_errors > 0:
        print(f"\nWarning: {parse_errors} responses failed to parse correctly. See 'parsing_errors.log' for details.")

    return summary

# --- Run Evaluation ---
# Entry point: scores the generated answers against the reference answers and
# keeps the summary dict in `results` for inspection in later cells.
if __name__ == "__main__":
    # Reference (Answer A) file; absolute path under /content.
    original_json_path = "/content/question_answers.json"
    # System (Answer B) file; the " (2)" suffix is part of the file name.
    compare_json_path = "/content/generated_rag_answers (2).json"
    results = evaluate_answers_with_scores(original_json_path, compare_json_path)

Found 7 matching question-PDF pairs for evaluation.
Processing question 1/7: What mechanisms does BOT-anist use to ensure respo...
Processing question 2/7: How does BOT-anist handle cross-platform deploymen...
Processing question 3/7: What techniques are used to adapt the user interfa...
Processing question 4/7: In what ways does BOT-anist utilize blend shapes f...
Processing question 5/7: How does the JointPinSystem and JointPinComponent ...
Processing question 6/7: Explain how BOT-anist differentiates its window ma...
Processing question 7/7: How does BOT-anist implement dynamic lighting and ...

--- Numeric Scoring Summary ---
+-----------------------+-----------------+
| Metric                | Average Score   |
| Avg_comprehensiveness | 2.29/10         |
+-----------------------+-----------------+
| Avg_diversity         | 1.71/10         |
+-----------------------+-----------------+
| Avg_empowerment       | 2.57/10         |
+-----------------------+-----------------+
| Avg_dire