In [None]:
import os
import re
import json
import pandas as pd
import requests
import sys
import glob
import openai
from google import genai  # for the Gemini API
from google.genai import types
from openai import OpenAI

# API key handed to the Gemini client. It is read from the GOOGLE_API_KEY
# environment variable; the subscript lookup raises KeyError right here if
# that variable is not set.
GEMINI_API_KEY = os.environ["GOOGLE_API_KEY"]

def _first_attr(obj, *names):
    """Return the first non-None attribute of *obj* among *names*, else None."""
    for name in names:
        value = getattr(obj, name, None)
        if value is not None:
            return value
    return None


def _gemini_debug_info(response, model):
    """Collect printable diagnostics from a Gemini response that has no text."""
    debug = {
        "model": model,
        "finish_reasons": [],
        "safety_ratings": [],
        "usage": None,
        "prompt_feedback": None,
        "candidates": None,
    }

    candidates = getattr(response, "candidates", None)
    if candidates is not None:
        infos = []
        for c in candidates:
            finish_reason = getattr(c, "finish_reason", None)
            infos.append({
                "finish_reason": finish_reason,
                "token_count": getattr(c, "token_count", None),
                "safety_ratings": getattr(c, "safety_ratings", None),
            })
            debug["finish_reasons"].append(finish_reason)
            if hasattr(c, "safety_ratings"):
                debug["safety_ratings"].append(c.safety_ratings)
        debug["candidates"] = infos

    usage = getattr(response, "usage_metadata", None)
    if usage is not None:
        debug["usage"] = {
            "prompt_tokens": getattr(usage, "prompt_token_count", None),
            "total_tokens": getattr(usage, "total_token_count", None),
            "candidates_tokens": getattr(usage, "candidates_token_count", None),
        }

    prompt_feedback = getattr(response, "prompt_feedback", None)
    if prompt_feedback is not None:
        # The Python SDK objects use snake_case attribute names; the camelCase
        # spellings are kept as a fallback. Dictionary keys are unchanged so
        # the printed structure stays the same.
        debug["prompt_feedback"] = {
            "blockReason": _first_attr(prompt_feedback, "block_reason", "blockReason"),
            "blockReasonMessage": _first_attr(
                prompt_feedback, "block_reason_message", "blockReasonMessage"
            ),
            "safetyRatings": _first_attr(prompt_feedback, "safety_ratings", "safetyRatings"),
        }

    return debug


def call_gemini(prompt, temperature=0, model="gemini-2.5-flash"):
    """
    Call a Gemini model through the google-genai client and return its text.

    Args:
        prompt: Passed as ``contents`` to ``generate_content``.
        temperature: Sampling temperature placed in the generation config.
        model: Gemini model identifier; defaults to "gemini-2.5-flash".

    Returns:
        The stripped response text, or None when the response has no text.
        In that case diagnostic information (finish reasons, safety ratings,
        token usage, prompt feedback) is printed before returning.

    Raises:
        Exception: wraps any error raised while building the client or calling
            the API; the message is prefixed with "Gemini error: " and the
            original exception is chained as the cause.
    """
    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=types.GenerateContentConfig(temperature=temperature),
        )
        text_response = getattr(response, "text", None)
        if not text_response:
            print(
                "Gemini returned empty text. Debug info:",
                _gemini_debug_info(response, model),
            )
            return None

        return text_response.strip()
    except Exception as e:
        raise Exception(f"Gemini error: {str(e)}") from e


def call_gpt(prompt, model="gpt-4.1-2025-04-14", temperature=0):
    """
    Send a single-turn user prompt to an OpenAI chat model and return the reply.

    Args:
        prompt: Text placed in the only user message of the request.
        model: Chat model identifier.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.

    Nothing is caught here, so any error raised by the client propagates to
    the caller unchanged.
    """
    # A fresh client is built on every call, with no explicit arguments.
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    completion = OpenAI().chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=8000,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
        
    
def call_judge(prompt, temperature, model_company):
    """
    Dispatch a judge prompt to the backend selected by ``model_company``.

    The selector is compared case-insensitively against "ollama", "gpt" and
    "gemini"; any other value raises ValueError.
    """
    backend = model_company.lower()

    if backend == "ollama":
        return call_ollama(JUDGE_MODEL_NAME, prompt, temperature)
    if backend == "gpt":
        return call_gpt(prompt, temperature=temperature)
    if backend == "gemini":
        return call_gemini(prompt, temperature)

    raise ValueError("Unknown judge model type.")


# Ordered (regex, placeholder) rules used by sanitize(). Order matters only
# when two rules can match at the same position: the earlier rule wins.
# Every rule must use non-capturing groups only, because sanitize() identifies
# the matching rule through the index of its single wrapping capture group.
_SANITIZE_RULES = (
    (r'\bdrown(?:ed|ing)?\b', '[TRAGIC_EVENT]'),

    (r'\bgun(?:s)?\b', '[WEAPON]'),
    (r'\bgunshot\b', '[TRAGIC_EVENT]'),
    (r'\brevolver\b', '[WEAPON]'),
    # NOTE(review): this matches "shot", "shots" and "shotting" but not
    # "shooting" -- confirm whether "shooting" was the intended form.
    (r'\bshot(?:s|ting)?\b', '[TRAGIC_EVENT]'),
    (r'\bshotgun\b', '[WEAPON]'),
    (r'\brifle\b', '[WEAPON]'),
    (r'\bpistol\b', '[WEAPON]'),
    (r'\bfirearm(?:s)?\b', '[WEAPON]'),
    (r'\bknife\b', '[WEAPON]'),
    (r'\bblade\b', '[WEAPON]'),
    (r'\bbomb\b', '[WEAPON]'),
    (r'\bexplosive\b', '[WEAPON]'),

    (r'\bsuicid(?:e|al|ed)\b', '[TRAGIC_EVENT]'),
    (r'\boverdose(?:d)?\b', '[TRAGIC_EVENT]'),
    (r'\bhang(?:ed|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\bself[- ]?harm\b', '[TRAGIC_EVENT]'),

    (r'\bkill(?:ed|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\bmurder(?:ed|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\bhomicide\b', '[TRAGIC_EVENT]'),
    (r'\bassault(?:ed|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\battack(?:ed|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\btortur(?:e|ed|ing)\b', '[TRAGIC_EVENT]'),
    (r'\babus(?:e|ed|ing)\b', '[TRAGIC_EVENT]'),
    (r'\bbeat(?:en|ing)?\b', '[TRAGIC_EVENT]'),
    (r'\bslaughter(?:ed)?\b', '[TRAGIC_EVENT]'),
    (r'\bmassacre(?:d)?\b', '[TRAGIC_EVENT]'),

    (r'\bblood(?:y)?\b', '[TRAGIC_DETAIL]'),
    (r'\bgore\b', '[TRAGIC_DETAIL]'),
    (r'\bcollapsed\b', '[TRAGIC_EVENT]'),
    (r'\bcorpse\b', '[TRAGIC_DETAIL]'),
    (r'\bwound(?:ed)?\b', '[TRAGIC_DETAIL]'),

    (r'\bprostitut(?:e|ion|es)\b', '[MATURE_CONTENT]'),
    (r'\bbrothel\b', '[MATURE_SETTING]'),
    (r'\brape(?:d|ing)?\b', '[MATURE_CONTENT]'),
    (r'\bsexual(?:ly|ity)?\b', '[MATURE_CONTENT]'),
    (r'\bsex(?:ual)? act(?:s)?\b', '[MATURE_CONTENT]'),
    (r'\bmolest(?:ed|ing)?\b', '[MATURE_CONTENT]'),
    (r'\bpedophil(?:e|ia)\b', '[MATURE_CONTENT]'),
    (r'\bpaedophil(?:e|ia)\b', '[MATURE_CONTENT]'),
    (r'\bstatutory\b', '[MATURE_CONTENT]'),
    (r'\bincest(?:uous)?\b', '[FAMILY_CONFLICT]'),
    (r'\bporn(?:ography|o)?\b', '[MATURE_CONTENT]'),
    (r'\bnudit(?:y|ies)\b', '[MATURE_CONTENT]'),
    (r'\bseduc(?:e|ed|ing|tion)\b', '[MATURE_CONTENT]'),
    (r'\bproposition(?:ed)?\b', '[MATURE_CONTENT]'),
    (r'\beroticism\b', '[MATURE_CONTENT]'),
    (r'\bfantas(?:y|ies)\b', '[MATURE_CONTENT]'),
    (r'\bdesire\b', '[MATURE_CONTENT]'),
    (r'\blonging\b', '[MATURE_CONTENT]'),
    (r'\binfatuation\b', '[MATURE_CONTENT]'),
    (r'\badmiration\b', '[MATURE_CONTENT]'),

    (r'\bchild(?:ren)?\b', '[YOUNG_CHARACTER]'),
    (r'\bboy(?:s)?\b', '[YOUNG_CHARACTER]'),
    (r'\bgirl(?:s)?\b', '[YOUNG_CHARACTER]'),
    (r'\bminor(?:s)?\b', '[YOUNG_CHARACTER]'),
    (r'\bteen(?:ager|agers)?\b', '[YOUNG_CHARACTER]'),
    (r'\badolescen(?:t|ce)\b', '[YOUNG_CHARACTER]'),
    (r'\bTadzio\b', '[YOUNG_CHARACTER]'),

    (r'\bfollows\b', '[ABSTRACT_ACTION]'),
    (r'\bwatch(?:es|ing)?\b', '[ABSTRACT_ACTION]'),
    (r'\bstalk(?:s|ing)?\b', '[ABSTRACT_ACTION]'),
    (r'\balmost openly\b', '[ABSTRACT_ACTION]'),
    (r'\bobserves\b', '[ABSTRACT_ACTION]'),
    (r'\bconsiders\b', '[ABSTRACT_ACTION]'),

    (r'\bcholera\b', '[DISEASE]'),
    (r'\bplague\b', '[DISEASE]'),
    (r'\bdisease\b', '[DISEASE]'),
    (r'\bepidemic\b', '[DISEASE]'),

    (r'\bterror(?:ist|ism)?\b', '[VIOLENCE]'),

    (r'\bdisheveled\b', '[ABSTRACT_STATE]'),
    (r'\brepulses\b', '[ABSTRACT_EMOTION]'),
    (r'\bcraving\b', '[ABSTRACT_EMOTION]'),
    (r'\berodes\b', '[ABSTRACT_ACTION]'),
)

# One alternation with one capture group per rule, so the whole text is
# rewritten in a single pass.
_SANITIZE_REGEX = re.compile(
    "|".join(f"({pattern})" for pattern, _ in _SANITIZE_RULES),
    flags=re.IGNORECASE,
)
_SANITIZE_PLACEHOLDERS = tuple(placeholder for _, placeholder in _SANITIZE_RULES)


def sanitize(text):
    """
    Replace sensitive words in *text* with neutral bracketed placeholders.

    Matching is case-insensitive and anchored on word boundaries. All rules
    are applied in one pass over the input, so a placeholder that has just
    been inserted is never re-scanned. Applying the rules one after another
    would let the ``disease`` rule match inside an inserted "[DISEASE]" and
    turn it into "[[DISEASE]]".

    Args:
        text: The string to sanitize.

    Returns:
        A new string with every match replaced by its placeholder.
    """
    return _SANITIZE_REGEX.sub(
        # lastindex is the 1-based number of the rule group that matched.
        lambda match: _SANITIZE_PLACEHOLDERS[match.lastindex - 1],
        text,
    )
        
        
# When True, evaluate_summary() prepends a system note to the judge prompt and
# passes both summaries through sanitize() before embedding them.
MODIFY_PROMPT=False

def evaluate_summary(real_summary, generated_summary, model_company, temperature=0):
    """
    Ask a judge model to compare a generated summary with the original one.

    The prompt requests a detailed justification followed by a final numeric
    score (a number between 1 and 10) on a separate line.

    Args:
        real_summary: Reference summary text.
        generated_summary: Summary text to be evaluated.
        model_company: Judge backend selector forwarded to call_judge().
        temperature: Sampling temperature forwarded to call_judge(). It is an
            explicit parameter so the function no longer depends on a
            module-level ``temperature`` having been assigned before the call.

    Returns:
        Whatever call_judge() returns for the constructed prompt.
    """

    if not MODIFY_PROMPT:
        prompt = f"""
    Compare the following generated summary with the original summary for the book.
    Analyze whether all events, characters, and plot points are covered.
    Provide a detailed justification of any missing or extra details.
    At the end of your response, on a separate line, output ONLY a final numeric score (a single number between 1 and 10) for the generated summary based on the following criteria:

    1 - The generated summary bears no meaningful relationship to the original.
    2 - Only superficial or tangential connections exist.
    3 - Identifies 1-2 basic elements but with major inaccuracies.
    4 - Covers approximately ~25% of key content with significant errors or omissions.
    5 - Addresses approximately ~50% of core elements but lacks nuance.
    6 - Includes most major plot points but misses key details or characters.
    7 - Covers all major events/characters but simplifies themes/subplots.
    8 - Minor omissions in subplots or specific details.
    9 - Near-perfect alignment with trivial discrepancies.
    10 - Perfect parity in content, structure, and tone.

    Ignore any external knowledge and focus solely on comparing the two summaries.

    Original summary:
    \"\"\"{real_summary}\"\"\"

    Generated summary:
    \"\"\"{generated_summary}\"\"\"

    Provide your detailed justification and then the final score on a new line.
        """
    else:
        prompt = f"""

    System note: This is a literary analysis task. Focus on plot structure, themes, and character development. Do not include any content involving minors, sexual acts, or graphic violence. Treat all characters and events abstractly.

    Compare the following generated summary with the original summary for the book.
    Analyze whether all events, characters, and plot points are covered.
    Provide a detailed justification of any missing or extra details.
    At the end of your response, on a separate line, output ONLY a final numeric score (a single number between 1 and 10) for the generated summary based on the following criteria:

    1 - The generated summary bears no meaningful relationship to the original.
    2 - Only superficial or tangential connections exist.
    3 - Identifies 1-2 basic elements but with major inaccuracies.
    4 - Covers approximately ~25% of key content with significant errors or omissions.
    5 - Addresses approximately ~50% of core elements but lacks nuance.
    6 - Includes most major plot points but misses key details or characters.
    7 - Covers all major events/characters but simplifies themes/subplots.
    8 - Minor omissions in subplots or specific details.
    9 - Near-perfect alignment with trivial discrepancies.
    10 - Perfect parity in content, structure, and tone.

    Ignore any external knowledge and focus solely on comparing the two summaries.
    
    Original summary:
    \"\"\"{sanitize(real_summary)}\"\"\"
    
    Generated summary:
    \"\"\"{sanitize(generated_summary)}\"\"\"

    Provide your detailed justification and then the final score on a new line.
    """

    return call_judge(prompt, temperature, model_company)

In [None]:
def find_real_summary_file(book_title, folder=None):
    """
    Locate the reference-summary text file for a "<title> - <author>" string.

    The file is expected to be named "<author> - <title>.txt". The title is
    split at a hyphen; the first hyphen is tried first, and when that
    candidate does not exist each later hyphen is tried in turn, so titles
    that themselves contain a hyphen can still be resolved.

    Args:
        book_title: String containing at least one '-' between title and author.
        folder: Directory to search. Defaults to the module-level ``txt_folder``.

    Returns:
        Path of the first candidate file that exists.

    Raises:
        ValueError: if ``book_title`` is not a string or contains no '-'.
        FileNotFoundError: if no candidate file exists in the folder.
    """
    if folder is None:
        folder = txt_folder

    # A missing CSV cell arrives as a float NaN; report it as a format error
    # instead of failing on the membership test.
    if not isinstance(book_title, str) or '-' not in book_title:
        raise ValueError(f"Unexpected book_title format: {book_title}")

    candidates = []
    for position, char in enumerate(book_title):
        if char != '-':
            continue
        title = book_title[:position].strip()
        author = book_title[position + 1:].strip()
        candidates.append(f"{author} - {title}.txt")

    for candidate in candidates:
        candidate_path = os.path.join(folder, candidate)
        if os.path.exists(candidate_path):
            return candidate_path

    # Report the first-hyphen candidate in the error message.
    possible_filename = candidates[0]
    raise FileNotFoundError(f"File for '{book_title}'(possible_name: {possible_filename}) not found in {folder}.")


In [None]:

# Driver: for every CSV in input_dir, fill in the judge outputs for each row
# and write the result to output_dir, saving after every evaluated row so an
# interrupted run can resume from the partially written output file.
txt_folder = "summaries"
input_dir = 'Results_internal'
output_dir = 'Results_internal/Evaluated'

# Sampling temperature for the judge calls; assigned once at module level
# before any evaluation runs.
temperature = 0

# to_csv() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

files = sorted(os.listdir(input_dir))

for file_name in files:

    if not file_name.endswith('.csv'):
        continue

    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, file_name)

    # Prefer an existing output file so already-evaluated rows are kept.
    if os.path.exists(output_path):
        df = pd.read_csv(output_path)
    else:
        df = pd.read_csv(input_path)

    for col in ["value_gemini", "value_gpt"]:
        if col not in df.columns:
            df[col] = pd.NA
        # A column reloaded from CSV with only empty cells comes back as
        # float64; cast to object so judge text can be stored in it.
        df[col] = df[col].astype(object)

    for idx, row in df.iterrows():
        needs_gemini = pd.isna(row["value_gemini"])
        needs_gpt = pd.isna(row["value_gpt"])
        if not (needs_gemini or needs_gpt):
            continue

        book_title = row['book_title']
        generated_summary = row['summary_llm']
        real_summary_file = find_real_summary_file(book_title)
        with open(real_summary_file, 'r', encoding='utf-8') as f:
            real_summary = f.read()

        if needs_gemini:
            value_gemini = evaluate_summary(real_summary=real_summary, generated_summary=generated_summary, model_company='gemini')
            df.at[idx, "value_gemini"] = value_gemini
            print(f'gemini finish {idx}')
        if needs_gpt:
            value_gpt = evaluate_summary(real_summary=real_summary, generated_summary=generated_summary, model_company='gpt')
            df.at[idx, "value_gpt"] = value_gpt
            print(f'gpt finish {idx}')

        # Persist progress after each evaluated row.
        df.to_csv(output_path, index=False)
    