In [None]:
%pip install openai

In [None]:
import getpass
import os
import re

import openai

### Helper functions

In [None]:
def save_result(path_name, file_name, result):
    """
    Save the result to a Markdown file, creating parent directories as needed.

    Args:
        path_name (str): Directory to save the file in. An empty string means
            the current working directory.
        file_name (str): Name of the file, without the ".md" extension
        result (str): Content to save
    """
    file_path = os.path.join(path_name, f"{file_name}.md")

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the target path actually has a directory component.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write as UTF-8 explicitly: the platform default encoding may not be able
    # to represent every character in the content.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(result)
        
def setup_environment():
    """
    Configure the OpenAI API key.

    Reads the key from the GPT_KEY environment variable; if the variable is
    not set, prompts for it interactively. In both cases the key is assigned
    to ``openai.api_key``.
    """
    try:
        api_key = os.environ["GPT_KEY"]
    except KeyError:
        print("Please set the environment variable GPT_KEY")
        # getpass keeps the typed key out of the saved notebook output.
        api_key = getpass.getpass("Enter your OpenAI API key: ")

    # Must run for both branches; otherwise a key supplied through the
    # environment is read but never applied.
    openai.api_key = api_key

def generate_response(messages):
    """
    Send a chat conversation to the model and return the reply text.

    Prints the total token count reported for the request as a side effect.

    Args:
        messages (list): Chat messages passed through to the completion call

    Returns:
        The content of the first choice's message.
    """
    request_options = {
        "model": "gpt-4o",
        "messages": messages,
        "temperature": 0.3,
        "seed": 25,
    }
    completion = openai.ChatCompletion.create(**request_options)

    # Report token consumption for this call.
    print(completion.usage["total_tokens"])

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def extract_false_positive(text):
    """
    Extract the false-positive count reported after an "FP:" marker.

    Scans the text line by line and returns the first integer that follows
    an "FP:" marker. Markdown emphasis around the marker or the value
    (e.g. "**FP:** 3") and trailing text (e.g. "FP: 3 (no matches)") are
    tolerated. Lines that contain the marker but no integer are skipped.

    Args:
        text (str): Text to scan

    Returns:
        int or None: The parsed count, or None when no "FP:" marker is
        followed by an integer (including empty input).
    """
    if not text:
        return None

    for line in text.splitlines():
        # Allow whitespace and Markdown emphasis characters between the
        # marker and the digits.
        match = re.search(r"FP:[\s*_`]*(\d+)", line)
        if match:
            return int(match.group(1))

    return None


## 1. Set up the model

In [None]:
# Load the OpenAI API key (GPT_KEY environment variable or interactive prompt).
setup_environment()

## 2. Set file directories

### Sample files and templates for the LLM

In [None]:
# Case and document to evaluate.
case_name = "norauto"
file_name = "model-card-updated"

# Model cards live in a flat directory with the case name as a file prefix;
# every other document lives in a per-case sub-directory.
if file_name == "model-card-updated":
    relative_path = f'files/model-cards-final/{case_name}-{file_name}.md'
else:
    relative_path = f'files/cases/{case_name}/{file_name}.md'

# The ground truth is read relative to the working directory; the generated
# analysis sits at the same relative path under ../Steps-v3-thesis.
# Files are read as UTF-8 explicitly so decoding does not depend on the
# platform default encoding.
with open(relative_path, 'r', encoding='utf-8') as file:
    ground_truth = file.read()

with open(f'../Steps-v3-thesis/{relative_path}', 'r', encoding='utf-8') as file:
    generated_analysis = file.read()

# Grading rules are passed to the model in the comparison prompts.
with open('files/grading-rules/grading_rules.md', 'r', encoding='utf-8') as file:
    grading_rules = file.read()

# 1. Summarize texts

In [None]:
# Conversation skeleton for the summarization step: a fixed system prompt plus
# a user message whose {text} placeholder is filled in before each call.
# NOTE(review): "summarize the provided it" in the system prompt reads like a
# typo; left unchanged because editing the prompt changes model behaviour.
messages = [
    dict(
        role="system",
        content=(
            "You are an expert in converting detailed texts into concise, granular bullet points. "
            "Avoid creating sub-lists, all points must be on the same indentation level. "
            "You will be given a text and your task is to summarize the provided it into a numerical bullet point list. "
            "Ensure that each bullet point is clear, complete, and independent, capturing all critical details without losing essential information."
        ),
    ),
    dict(role="user", content="Text: \n{text}"),
]

In [None]:
# Store the original (unformatted) user-message template so the {text}
# placeholder can be filled with a different document for each summarization.
original_template = "Text: \n{text}"

## Summarize ground truth

In [None]:
# Put the ground-truth document into the user message.
messages[1]["content"] = original_template.format(text=ground_truth)

In [None]:
%%time
# Ask the model to summarize the ground truth and show the returned text.
bullet_list_ground = generate_response(messages)
print(bullet_list_ground)

## Summarize generated analysis

In [None]:
# Put the generated analysis into the user message (overwrites the previous content).
messages[1]["content"] = original_template.format(text=generated_analysis)

In [None]:
%%time
# Ask the model to summarize the generated analysis and show the returned text.
bullet_list_generated = generate_response(messages)
print(bullet_list_generated)

# 2. Compare texts

In [None]:
# System prompt for the comparison step. Placeholders:
#   {file_name}    - title under which the additional grading rules are found
#   {special_mode} - extra instructions appended at the end (differs per pass)
# NOTE(review): "an unedited on the same topic" / "with the unedited" appear to
# be missing the word "text"; left unchanged because the prompt is runtime input.
system_template = """You are an expert in fairness evaluation. You will be provided with a summarized list and an unedited on the same topic. Your task is to compare each element of the list with the unedited to determine if there is a semantically similar match.
     Follow additional rules that are under the {file_name} title when matching them.
     **Definition of Similar:** Two points are considered "similar" if they convey the same core idea or meaning, even if the wording is different. A point is semantically similar if the text addresses the same subject, intent, or concept as the corresponding list element.

     For each element point in the list, follow these steps:
     1. Identify if a semantically similar point exists in the unedited text.
     2. If a match is found, classify it as a full match or partial match based on the similarity.
     3. If no match is found, mark it accordingly and justify.

     Each ground truth point should be evaluated independently; avoid merging or separating points. Keep the enumeration format.
     Use the following structure for reporting:

     1. List element / Semantically similar text point (Full Match)
     2. List element / Same topic but semantically different text point (Partial Match)
     3. List element / No match in the text analysis (No Match)

     After evaluation, count the number of total points in the list, Full Match, Partial Match and No Match.
     {special_mode}
     """

# User message for the comparison step: grading rules, the summarized bullet
# list, and the unedited text it is compared against.
user_template = "Rules:\n{grading_rules}\nSummarized list:\n{bullet_list}\nUnedited text:\n{unedited_text}"

In [None]:
# First comparison pass: the summarized generated analysis is matched against
# the unedited ground truth, and the model is asked to report the number of
# "No Match" items on an 'FP:' line.
messages = [
    dict(
        role="system",
        content=system_template.format(
            file_name=file_name,
            special_mode="Add a new field called 'FP' which is equal to the number of No Match found. In a new line, use the following format: 'FP:'",
        ),
    ),
    dict(
        role="user",
        content=user_template.format(
            grading_rules=grading_rules,
            bullet_list=bullet_list_generated,
            unedited_text=ground_truth,
        ),
    ),
]

## Summarized generated analysis vs ground truth

In [None]:
%time
matches = generate_response(messages)
print(matches)

In [None]:
# Display the FP count parsed from the comparison output as a quick check.
extract_false_positive(matches)

In [None]:
# Keep the parsed FP count for the second comparison prompt.
false_positive = extract_false_positive(matches)

# Fail loudly if no count was found; otherwise "FP = None" would be silently
# interpolated into the next prompt and corrupt the computed metrics.
if false_positive is None:
    raise ValueError(
        "No 'FP:' count found in the comparison output; re-run the comparison cell."
    )

## Summarized ground truth vs generated analysis

In [None]:
# Extra instructions for the second comparison pass: the model is asked to
# compute precision, recall and F1, with FP fixed to the count parsed from
# the first pass.
special_mode = f"""After the evaluation, calculate precision, recall and F1 score by using these definitions:
     - TP = Number of Full Match + 0.5 * Number of Partial Match
     - FP = {false_positive}
     - FN = Number of No Match
     """

In [None]:
# Second comparison pass: the summarized ground truth is matched against the
# unedited generated analysis, with the metric instructions appended.
messages = [
    dict(
        role="system",
        content=system_template.format(
            file_name=file_name,
            special_mode=special_mode,
        ),
    ),
    dict(
        role="user",
        content=user_template.format(
            grading_rules=grading_rules,
            bullet_list=bullet_list_ground,
            unedited_text=generated_analysis,
        ),
    ),
]

In [None]:
%%time
eval_result = generate_response(messages)
print(eval_result)

case_path = "files/evaluations/" + case_name + "/"
evaluation_name = case_name + "-" + file_name + "-evaluation"

save_result(case_path, evaluation_name, eval_result)