In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import csv
import time
from typing import Dict

# Load variables from .env into the process environment.
load_dotenv()

# The key is looked up under API_KEY; os.getenv returns None when it is unset,
# in which case None is what gets handed to the client below.
api_key = os.getenv("API_KEY")

# Shared OpenAI client used by the LLM helper functions in this notebook.
client = OpenAI(api_key=api_key)

### Response format

In [None]:
from pydantic import BaseModel

# One entry of ValidationResponse.true_positives: a pair of fact strings.
# NOTE(review): "A"/"B" presumably map to the "Summary A"/"Summary B" sections
# of the prompt built in GetLLMValidation — confirm against the judge prompt file.
class TruePositive(BaseModel):
    A_fact: str  # fact text attributed to summary A
    B_fact: str  # fact text attributed to summary B


# One entry of ValidationResponse.false_positives; carries only an A-side fact.
# NOTE(review): presumably a fact found in summary A with no counterpart in
# summary B — the exact meaning is defined by the judge prompt, confirm there.
class FalsePositive(BaseModel):
    A_fact: str  # fact text attributed to summary A


# One entry of ValidationResponse.false_negatives; carries only a B-side fact.
# NOTE(review): presumably a fact found in summary B with no counterpart in
# summary A — the exact meaning is defined by the judge prompt, confirm there.
class FalseNegative(BaseModel):
    B_fact: str  # fact text attributed to summary B


# Integer totals reported by the model alongside the itemised lists.
# These three values are what the CSV-writing helpers feed into calculate_metrics.
# NOTE(review): nothing here checks that the counts equal the list lengths in
# ValidationResponse — verify if that consistency matters.
class SummaryCounts(BaseModel):
    TP: int  # number of true positives
    FP: int  # number of false positives
    FN: int  # number of false negatives


# Top-level structured answer requested from the model; this class is passed as
# `response_format` in GetLLMValidation, and the parsed instance is what that
# function returns.
class ValidationResponse(BaseModel):
    true_positives: list[TruePositive]  # itemised A/B fact pairs
    false_positives: list[FalsePositive]  # itemised A-only facts
    false_negatives: list[FalseNegative]  # itemised B-only facts
    summary_counts: SummaryCounts  # TP / FP / FN totals




In [None]:
def GetLLMValidation(instruction_prompt, summary_A, summary_B, print_prompt=False,
                     model="o3-mini", reasoning_effort="high", seed=1234):
    """Ask the judge model to compare two summaries and return the parsed result.

    The instruction prompt and both summaries are concatenated into a single
    user message, with the summaries labelled "Summary A" and "Summary B".

    Parameters:
    instruction_prompt (str): Judge instructions placed at the top of the prompt.
    summary_A (str): Text inserted under the "Summary A:" label.
    summary_B (str): Text inserted under the "Summary B:" label.
    print_prompt (bool): When True, print the assembled prompt before sending it.
    model (str): Model name passed to the API (default "o3-mini").
    reasoning_effort (str): Reasoning effort passed to the API (default "high").
    seed (int): Seed passed to the API (default 1234).

    Returns:
    The `parsed` attribute of the first choice's message, i.e. the response
    parsed against ValidationResponse. Callers in this notebook treat a falsy
    value as "no usable response".
    """
    full_prompt = f"{instruction_prompt}\nSummary A:\n{summary_A}\nSummary B:\n{summary_B}"
    if print_prompt:
        print(full_prompt)

    # temperature / top_p are deliberately not sent: per the original author's
    # notes, o3-mini rejects those parameters.
    validation_response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "user", "content": full_prompt}
        ],
        seed=seed,
        reasoning_effort=reasoning_effort,
        response_format=ValidationResponse,  # response is parsed into this model
    )
    return validation_response.choices[0].message.parsed


def ReadFile(file_path):
    """Return the UTF-8 text content of `file_path` with surrounding whitespace stripped.

    Parameters:
    file_path (str): Path of the file to read.

    Raises:
    FileNotFoundError: If `file_path` does not exist.
    """
    # Guard clause: fail early with the notebook's own error message.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Error: {file_path} not found!")

    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read().strip()
    


def SaveValidation(result, folder_path, doctor, patient, prompt_number, model):
    """Write one validation result into `folder_path` as a .json or .txt file.

    The file is named
    ``arst_<doctor>_patsient_<patient>_võrdlus_prompt_<prompt_number>_<model>``
    with doctor and patient zero-padded to two digits. If the result can be
    represented as JSON it is saved pretty-printed with a ``.json`` extension;
    otherwise its string form is saved with a ``.txt`` extension.

    Parameters:
    result: A string, a JSON-serializable object, or an object exposing
        ``model_dump()`` (such as the parsed Pydantic response).
    folder_path (str): Existing directory to write into.
    doctor (int): Doctor index used in the file name.
    patient (int): Patient index used in the file name.
    prompt_number: Prompt identifier used in the file name.
    model (str): Model label used in the file name.
    """
    base_name = f"arst_{doctor:02}_patsient_{patient:02}_võrdlus_prompt_{prompt_number}_{model}"

    # Objects with model_dump() are converted to plain data first. json.dumps
    # cannot serialize such objects directly, and falling back to str() yields
    # a repr that is not JSON, which would force the .txt branch below.
    if not isinstance(result, str):
        to_plain = getattr(result, "model_dump", None)
        if callable(to_plain):
            result = to_plain()

    if isinstance(result, str):
        result_str = result
    else:
        try:
            result_str = json.dumps(result, ensure_ascii=False, indent=4)
        except TypeError:
            # Not JSON-serializable: keep the object's string representation.
            result_str = str(result)

    # Decide the output format by checking whether the text parses as JSON.
    # A flag is used because a valid JSON document may itself parse to None.
    try:
        payload = json.loads(result_str)
        is_json = True
    except json.JSONDecodeError:
        payload = None
        is_json = False

    extension = "json" if is_json else "txt"
    output_file_path = os.path.join(folder_path, f"{base_name}.{extension}")

    with open(output_file_path, 'w', encoding='utf-8') as file:
        if is_json:
            json.dump(payload, file, ensure_ascii=False, indent=4)
        else:
            file.write(result_str)

    print(f"Validation saved to {output_file_path}")

def CreateDirectory(path, folder_name):
    """
    Ensure that the directory `folder_name` exists under `path`.

    Parameters:
    path (str): The base path where the directory should be created (can be relative).
    folder_name (str): The name of the directory to create.

    Returns:
    str | None: The joined directory path on success. On any error a message
    is printed and None is returned instead of raising.
    """
    target_dir = os.path.join(path, folder_name)

    try:
        # exist_ok=True: an already existing directory counts as success.
        os.makedirs(target_dir, exist_ok=True)
    except PermissionError:
        print(f"Permission denied: Unable to create '{target_dir}'.")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None

    print(f"Directory '{target_dir}' created successfully.")
    return target_dir


def calculate_metrics(tp: int, fp: int, fn: int) -> Dict[str, float]:
    """Compute precision, recall and F1-score from TP / FP / FN counts.

    Any ratio whose denominator is not positive is reported as 0.0.

    Returns:
    Dict[str, float]: Keys "Precision", "Recall" and "F1-score".
    """
    predicted_total = tp + fp
    actual_total = tp + fn

    precision = tp / predicted_total if predicted_total > 0 else 0.0
    recall = tp / actual_total if actual_total > 0 else 0.0

    pr_sum = precision + recall
    if pr_sum > 0:
        f1_score = (2 * precision * recall) / pr_sum
    else:
        f1_score = 0.0

    return {"Precision": precision, "Recall": recall, "F1-score": f1_score}

def average_metrics(metrics1: Dict[str, float], metrics2: Dict[str, float]) -> Dict[str, float]:
    """Return the element-wise mean of two metric dictionaries.

    Only the keys "Precision", "Recall" and "F1-score" are read; both inputs
    must contain all three.
    """
    metric_names = ("Precision", "Recall", "F1-score")
    return {name: (metrics1[name] + metrics2[name]) / 2 for name in metric_names}

### Compare a specific summary

In [None]:

# Settings for a one-off comparison of a single doctor/patient/prompt combination.
input_prompt = ReadFile("LLM_as_judge_prompt_5.txt")
model = "o3-mini-high"
doctor_index = 1
patient_index = 1
prompt_index = 1

# prompt_index selects both the prompt folder and the file-name suffix.
generated = ReadFile(f"summaries/arst_{doctor_index}_patsient_{patient_index}/kokkuvõtted/prompt_{prompt_index}/arst_{doctor_index:02}_patsient_{patient_index:02}_kokkuvõte_prompt_{prompt_index}_a.txt")
original = ReadFile(f"Arst_{doctor_index:03}/Patsient_{patient_index:03}/toorfailid/arsti_kokkuvote_orig_{doctor_index:02}_{patient_index:02}.txt")
print(f"Processing doctor: {doctor_index}, patient: {patient_index}.")

# The generated summary is sent as "Summary A", the doctor's original as "Summary B".
response = GetLLMValidation(input_prompt, generated, original, print_prompt=False)
print(response)

# Only read the counts when a parsed response actually came back.
if response is not None:
    print(response.summary_counts.TP)
    print(response.summary_counts.FP)
    print(response.summary_counts.FN)




## Run validation one way

In [None]:
# Function to process LLM response and append results to CSV
def process_and_save_results(doctor_index: int, patient_index: int, prompt_index: int, response, csv_filename="o3-mini-high-validations.csv"):
    """Turn one validation response into metrics and append them as a CSV row.

    The row holds the doctor and patient indices followed by five
    (precision, recall, F1) triples, one per prompt 1..5. Only the triple that
    belongs to `prompt_index` is filled in (rounded to 3 decimals); the other
    cells are empty strings.

    Parameters:
    doctor_index (int): Doctor identifier written to the first column.
    patient_index (int): Patient identifier written to the second column.
    prompt_index (int): Which prompt's columns receive the values.
    response: Object exposing `summary_counts.TP`, `.FP` and `.FN`.
    csv_filename (str): Tab-separated file to append to.
    """
    counts = response.summary_counts
    metrics = calculate_metrics(counts.TP, counts.FP, counts.FN)

    rounded = [round(metrics[key], 3) for key in ("Precision", "Recall", "F1-score")]
    blanks = ["", "", ""]

    row = [doctor_index, patient_index]
    for column_prompt in range(1, 6):
        # Each prompt owns three consecutive columns.
        row.extend(rounded if prompt_index == column_prompt else blanks)

    # Append mode: the header is expected to have been written by the caller.
    with open(csv_filename, mode="a", newline="") as csv_file:
        csv.writer(csv_file, delimiter="\t").writerow(row)

# Function to run evaluation for all doctors, patients, and prompts
def run_evaluation(csv_filename="o3-mini-high-validations.csv",):
    """Validate every AI summary against the doctor's summary, one direction only.

    Iterates over prompts 1-5, doctors 1-10 and patients 1-10. For each
    combination the AI summary is sent as summary A and the doctor's summary
    as summary B; a truthy response is saved under `validations/` and its
    metrics are appended to `csv_filename`. Combinations with a falsy
    response are skipped without output.

    Parameters:
    csv_filename (str): Tab-separated results file; it is overwritten at start.
    """
    judge_prompt = ReadFile("LLM_as_judge_prompt_4.txt")

    # Header: two id columns, then a Precision/Recall/F1-score triple per prompt.
    header = ["Doctor", "Patient"]
    for prompt_no in range(1, 6):
        for metric_name in ("Precision", "Recall", "F1-score"):
            header.append(f"{metric_name} (Prompt {prompt_no})")

    # Mode "w" truncates any existing file, so the header is written once.
    with open(csv_filename, mode="w", newline="") as csv_file:
        csv.writer(csv_file, delimiter="\t").writerow(header)

    for prompt_index in range(1, 6):  # Prompts 1 to 5
        print(f"Processing Prompt {prompt_index}...")

        for doctor_index in range(1, 11):  # Doctors 1 to 10
            for patient_index in range(1, 11):  # Patients 1 to 10
                started_at = time.time()
                print(f"Evaluating Doctor {doctor_index}, Patient {patient_index}, Prompt {prompt_index}")

                # Zero-padded tag shared by the AI summary file name and the output folder.
                pair_tag = f"arst_{doctor_index:02}_patsient_{patient_index:02}"

                ai_summary = ReadFile(
                    f"summaries/arst_{doctor_index}_patsient_{patient_index}/kokkuvõtted/prompt_{prompt_index}/"
                    f"{pair_tag}_kokkuvõte_prompt_{prompt_index}_a.txt"
                )
                doctor_summary = ReadFile(
                    f"Arst_{doctor_index:03}/Patsient_{patient_index:03}/toorfailid/"
                    f"arsti_kokkuvote_orig_{doctor_index:02}_{patient_index:02}.txt"
                )

                response = GetLLMValidation(judge_prompt, ai_summary, doctor_summary, print_prompt=False)
                if not response:
                    continue

                save_path = CreateDirectory("validations", pair_tag)
                SaveValidation(response, save_path, doctor_index, patient_index, prompt_index, "o3-mini-high")
                process_and_save_results(doctor_index, patient_index, prompt_index, response, csv_filename)
                print(f"Time elapsed: {time.time() - started_at}")

# Run the one-way evaluation for all doctors, patients, and prompts.
# This overwrites the default CSV file and issues one LLM request per
# doctor/patient/prompt combination (5 x 10 x 10 iterations).
run_evaluation()

## Run validation twice, once in each direction (summaries swapped)

In [None]:
# Function to process LLM response and append results to CSV
def process_and_save_results(doctor_index: int, patient_index: int, prompt_index: int, response1, response2, csv_filename="o3-mini-high-validations.csv"):
    """Average the metrics of two validation responses and append them as a CSV row.

    `response2` is expected to come from a run in which the two summaries were
    swapped, so its FP and FN counts trade places before metrics are computed.
    The row holds the doctor and patient indices followed by five
    (precision, recall, F1) triples, one per prompt 1..5; only the triple for
    `prompt_index` is filled in (rounded to 3 decimals), the rest are empty
    strings.

    Parameters:
    doctor_index (int): Doctor identifier written to the first column.
    patient_index (int): Patient identifier written to the second column.
    prompt_index (int): Which prompt's columns receive the values.
    response1: Response of the first run; exposes `summary_counts.TP/FP/FN`.
    response2: Response of the swapped run; same shape as `response1`.
    csv_filename (str): Tab-separated file to append to.
    """
    first = response1.summary_counts
    metrics1 = calculate_metrics(first.TP, first.FP, first.FN)

    # Swapped run: its false negatives act as false positives and vice versa.
    second = response2.summary_counts
    metrics2 = calculate_metrics(second.TP, second.FN, second.FP)

    avg_metrics = average_metrics(metrics1, metrics2)

    rounded = [round(avg_metrics[key], 3) for key in ("Precision", "Recall", "F1-score")]
    blanks = ["", "", ""]

    row = [doctor_index, patient_index]
    for column_prompt in range(1, 6):
        # Each prompt owns three consecutive columns.
        row.extend(rounded if prompt_index == column_prompt else blanks)

    # Append mode: the header is expected to have been written by the caller.
    with open(csv_filename, mode="a", newline="") as csv_file:
        csv.writer(csv_file, delimiter="\t").writerow(row)

# Function to run evaluation for all doctors, patients, and prompts
def run_evaluation(csv_filename="o3-mini-high-validations10.csv",):
    """Validate every AI summary against the doctor's summary in both directions.

    Iterates over prompts 1-5, doctors 1-10 and patients 1-10. Each
    combination is sent to the judge twice: first with the AI summary as
    summary A, then with the two summaries swapped. When both responses are
    truthy they are saved under `validations/` and their averaged metrics are
    appended to `csv_filename`; otherwise the combination is skipped without
    output.

    Parameters:
    csv_filename (str): Tab-separated results file; it is overwritten at start.
    """
    judge_prompt = ReadFile("LLM_as_judge_prompt_5.txt")

    # Header: two id columns, then a Precision/Recall/F1-score triple per prompt.
    header = ["Doctor", "Patient"]
    for prompt_no in range(1, 6):
        for metric_name in ("Precision", "Recall", "F1-score"):
            header.append(f"{metric_name} (Prompt {prompt_no})")

    # Mode "w" truncates any existing file, so the header is written once.
    with open(csv_filename, mode="w", newline="") as csv_file:
        csv.writer(csv_file, delimiter="\t").writerow(header)

    for prompt_index in range(1, 6):  # Prompts 1 to 5
        print(f"Processing Prompt {prompt_index}...")

        for doctor_index in range(1, 11):  # Doctors 1 to 10
            for patient_index in range(1, 11):  # Patients 1 to 10
                started_at = time.time()
                print(f"Evaluating Doctor {doctor_index}, Patient {patient_index}, Prompt {prompt_index}")

                # Zero-padded tag shared by the AI summary file name and the output folder.
                pair_tag = f"arst_{doctor_index:02}_patsient_{patient_index:02}"

                ai_summary = ReadFile(
                    f"summaries/arst_{doctor_index}_patsient_{patient_index}/kokkuvõtted/prompt_{prompt_index}/"
                    f"{pair_tag}_kokkuvõte_prompt_{prompt_index}_a.txt"
                )
                doctor_summary = ReadFile(
                    f"Arst_{doctor_index:03}/Patsient_{patient_index:03}/toorfailid/"
                    f"arsti_kokkuvote_orig_{doctor_index:02}_{patient_index:02}.txt"
                )

                # Both requests are always issued, even if the first one comes back empty.
                print("Running first validation")
                response1 = GetLLMValidation(judge_prompt, ai_summary, doctor_summary, print_prompt=False)
                print("Running with summaries swapped")
                response2 = GetLLMValidation(judge_prompt, doctor_summary, ai_summary, print_prompt=False)

                if not (response1 and response2):
                    continue

                save_path = CreateDirectory("validations", pair_tag)
                SaveValidation(response1, save_path, doctor_index, patient_index, prompt_index, "o3-mini-high-AI10-first")
                SaveValidation(response2, save_path, doctor_index, patient_index, prompt_index, "o3-mini-high-doctor10-first")
                process_and_save_results(doctor_index, patient_index, prompt_index, response1, response2, csv_filename)
                print(f"Time elapsed: {time.time() - started_at}")

# Run the two-way evaluation for all doctors, patients, and prompts.
# This overwrites the default CSV file and issues two LLM requests per
# doctor/patient/prompt combination (5 x 10 x 10 iterations).
run_evaluation()