# Validation of the gender representation bias quantification method

Validate the LLM-based gender representation bias quantification method on an annotated dataset.

This notebook queries the model through the OpenAI API.

In [None]:
import os
import pandas as pd
import re
import csv
from openai import OpenAI
from tqdm.notebook import tqdm

In [None]:
# Parameters
# Paste your OpenAI key into `api_key`. While the placeholder is left untouched,
# an OPENAI_API_KEY that is already set in the environment is kept instead of
# being overwritten with a dummy value.
api_key = "paste-your-api-key-here"
if api_key != "paste-your-api-key-here":
    os.environ["OPENAI_API_KEY"] = api_key
model = "gpt-4-turbo"  # model name sent with every chat completion request
analysis_file_path = "../../data/validation/"  # directory that receives the final Excel report
analysis_file_id = f"{model}.01"  # suffix shared by the auxiliary CSVs and the Excel report
gt_file = "../../data/validation/gt-es.txt"  # annotated ground-truth file
examples_pathname = "../../data/dataset-analysis/examples-es.txt"  # examples placed at the start of each prompt
num_sentences = 100 # number of sentences to be analyzed

In [None]:
# Read the ground truth file, process its content, and return a DataFrame
def df_func(gt_file):
    """Parse the ground-truth file into a DataFrame of raw annotations.

    The file is treated as blocks separated by blank lines. Each block holds
    a double-quoted sentence, a space, and a double-quoted annotation string.

    Args:
        gt_file: Path of the UTF-8 encoded ground-truth text file.

    Returns:
        A DataFrame with one row per block and the columns 'Ejemplo'
        (sentence) and 'Respuesta' (annotation string, unmodified).

    Raises:
        ValueError: If a non-empty block has no ' "' separator.
    """
    data = []

    with open(gt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    for number, block in enumerate(content.split('\n\n'), start=1):
        block = block.strip()
        if not block:
            # A trailing blank line or extra blank lines produce empty blocks
            continue
        # Split on the last ' "' only, so quotation marks inside the
        # sentence do not break the sentence/annotation pairing
        parts = block.rsplit(' "', 1)
        if len(parts) != 2:
            raise ValueError(
                f"Block {number} of {gt_file!r} lacks the space-plus-quote "
                "separator between sentence and annotations"
            )
        sentence, words = parts
        data.append((sentence.strip('"'), words.strip('"')))

    return pd.DataFrame(data, columns=['Ejemplo', 'Respuesta'])

# Ground-truth table (columns 'Ejemplo' and 'Respuesta') used as a global by later cells
df = df_func(gt_file)

In [None]:
# Read the examples
# The whole file is kept as a single string; it is inserted at the start of
# every prompt sent to the model.
with open(examples_pathname, 'r', encoding='utf-8') as file:
    examples = file.read()

In [None]:
# Extract sentence from DataFrame
def extract_sentence(df, sentence_id):
    """Return the 'Ejemplo' entry stored under `sentence_id`, converted to `str`."""
    sentences = df['Ejemplo']
    return str(sentences[sentence_id])

In [None]:
# Add the model response to the DataFrame
def final_df_func(df, response, i):
    """Return a copy of `df` whose 'Respuesta modelo' cell in row `i` holds `response`.

    The input DataFrame is left untouched; the column is created on the copy
    when it does not exist yet.
    """
    annotated = df.copy()
    annotated.loc[i, 'Respuesta modelo'] = response
    return annotated

In [None]:
# Transform the model response into a list of words
def response_trans_func(final_df, i):
    """Tokenise the model response of row `i` and group the tokens three at a time.

    The response is split on " - ", ", " and newlines; the last group may be
    shorter than three items.
    """
    raw_response = final_df["Respuesta modelo"][i]
    tokens = re.split(" - |, |\n", raw_response)

    groups = []
    for start in range(0, len(tokens), 3):
        groups.append(tokens[start:start + 3])
    return groups


In [None]:
# Read the ground truth file, process its content, and return a DataFrame
def df_trans_func(gt_file):
    """Parse the ground-truth file into a DataFrame of tokenised annotations.

    The file is treated as blocks separated by blank lines. Each block holds
    a double-quoted sentence, a space, and a double-quoted annotation string.
    The annotation string is upper-cased, split on " - ", ", " and newlines,
    and grouped three tokens at a time.

    Args:
        gt_file: Path of the UTF-8 encoded ground-truth text file.

    Returns:
        A DataFrame with one row per block and the columns 'Ejemplo'
        (sentence) and 'Respuesta' (list of token groups).

    Raises:
        ValueError: If a non-empty block has no ' "' separator.
    """
    data = []

    with open(gt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    for number, block in enumerate(content.split('\n\n'), start=1):
        block = block.strip()
        if not block:
            # A trailing blank line or extra blank lines produce empty blocks
            continue
        # Split on the last ' "' only, so quotation marks inside the
        # sentence do not break the sentence/annotation pairing
        parts = block.rsplit(' "', 1)
        if len(parts) != 2:
            raise ValueError(
                f"Block {number} of {gt_file!r} lacks the space-plus-quote "
                "separator between sentence and annotations"
            )
        sentence, words = parts
        sentence = sentence.strip('"')
        tokens = re.split(" - |, |\n", words.strip('"').upper())
        word_list = [tokens[start:start + 3] for start in range(0, len(tokens), 3)]
        data.append((sentence, word_list))

    return pd.DataFrame(data, columns=['Ejemplo', 'Respuesta'])

# The result is not assigned to a name; presumably this call only previews the
# parsed ground truth as the cell output.
df_trans_func(gt_file)

In [None]:
# Compare the model response with the ground truth
def analysis(index, word_list, gt):
    """Compare the model annotations of one sentence with its ground truth.

    Every ground-truth entry is matched against at most one model entry:
    first by exact equality, otherwise by the word alone (first item of the
    entry). A matched model entry is consumed so it cannot be matched twice.

    Args:
        index: Key of the sentence inside `gt`.
        word_list: Model annotations, a list of token lists whose first item
            is the word. It is not modified.
        gt: Indexable collection; `gt[index]` is the list of ground-truth
            token lists for the sentence.

    Returns:
        A tuple `(identified, not_identified, wrongly_analyzed,
        wrongly_identified)` of lists of token lists. The first three hold
        ground-truth entries, the last one the unmatched model entries.
    """
    identified = []  # Words correctly identified and correctly classified in both attributes (n_c)
    not_identified = []  # Words missed (not identified) by the model (n_m)
    wrongly_analyzed = []  # Words correctly identified but incorrectly classified in at least one attribute (n_i)
    # Extra words that do not appear in the ground truth but were returned by the model (n_e);
    # starts as a shallow copy of the model output and shrinks as entries are matched
    wrongly_identified = word_list.copy()

    for expected in gt[index]:
        if expected in wrongly_identified:
            identified.append(expected)
            wrongly_identified.remove(expected)
            continue

        # First remaining model entry that names the same word, if any
        same_word = next(
            (candidate for candidate in wrongly_identified if candidate[0] == expected[0]),
            None,
        )
        if same_word is None:
            not_identified.append(expected)
        else:
            wrongly_analyzed.append(expected)
            # Consume exactly one model entry. Removing inside a loop over the
            # same list skipped elements, so the number of entries dropped
            # depended on where the duplicates happened to sit.
            wrongly_identified.remove(same_word)

    return identified, not_identified, wrongly_analyzed, wrongly_identified

In [None]:
# Analysis wrapper
def analysis_wrapper(final_df, i):
    """Build the ' | '-separated result line for sentence `i`.

    Args:
        final_df: DataFrame whose 'Respuesta modelo' column holds the model
            response for row `i`.
        i: Zero-based sentence index.

    Returns:
        A string with twelve ' | '-separated fields: one-based ID, sentence,
        ground-truth entries, model entries, the four counts, and the four
        lists returned by `analysis`.
    """
    word_list = response_trans_func(final_df, i)
    # Parse the ground-truth file once and reuse it for the report field
    # instead of reading and parsing the file a second time
    gt = df_trans_func(gt_file)["Respuesta"]
    identified, not_identified, wrongly_analyzed, wrongly_identified = analysis(i, word_list, gt)

    fields = [
        i + 1,
        extract_sentence(df, i),  # reads the module-level ground-truth DataFrame
        gt[i],
        word_list,
        len(identified),
        len(not_identified),
        len(wrongly_analyzed),
        len(wrongly_identified),
        identified,
        not_identified,
        wrongly_analyzed,
        wrongly_identified,
    ]
    return ' | '.join(str(field) for field in fields)

In [None]:
# Connect to OpenAI API
client = OpenAI()

# Analyze the sentences with the model
# One CSV row is written (and flushed) per sentence so partial results survive an interruption.
with open(f"aux-model-output-{analysis_file_id}.csv", mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for i in tqdm(range(num_sentences), desc="Processing sentences"):

        sentence = extract_sentence(df, i) # Sentence to analyze

        # NOTE(review): the "\\n" sequences in the prompt put a literal backslash
        # followed by "n" into the text, not a line break; confirm this is intended.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{examples}\\nFrase: {sentence}\\nInstrucciones: Identifica todos los sustantivos y pronombres en la frase proporcionada. Para cada uno, determina si se refiere a un ser humano (S) o no (N), y especifica su género gramatical: masculino (M) o femenino (F). Excluye los apellidos. Sigue el formato de los ejemplos proporcionados sin añadir texto adicional."}
            ]
        )

        # Fall back to an empty string when the message carries no text content
        response = response.choices[0].message.content or ""
        # Clean the response outside the f-string: backslashes and reused quote
        # characters inside f-string expressions are a SyntaxError before Python 3.12
        cleaned_response = re.sub(r'\s+\n', '\n', response.strip().upper())
        writer.writerow([f'{i+1} | {sentence} | {df["Respuesta"][i]} | {cleaned_response}'])
        file.flush()

response_model = f"aux-model-output-{analysis_file_id}.csv"


In [None]:
# Process the model response
def df_model_process(response_model):
    """Load the auxiliary model-output CSV into a four-column DataFrame.

    Each CSV row holds a single field of the form
    'ID | sentence | ground truth | model response'.

    Args:
        response_model: Path of the CSV file written by the model query loop.

    Returns:
        A DataFrame with the columns 'ID', 'Frase', 'GT' and 'Respuesta',
        all values stripped of surrounding whitespace.
    """
    df_model = pd.read_csv(response_model, header=None)
    # n=3 caps the split at four fields, so a '|' inside the model response
    # (the last field) stays in 'Respuesta' instead of creating extra columns.
    # A '|' inside one of the first three fields would still misalign the row.
    df_model = df_model[0].str.split('|', n=3, expand=True)
    df_model.columns = ['ID', 'Frase', 'GT', 'Respuesta']

    # Remove whitespaces (column-wise Series.map replaces the deprecated DataFrame.applymap)
    def strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    df_model = df_model.apply(lambda column: column.map(strip_if_str))
    return df_model

# Parsed model output (columns 'ID', 'Frase', 'GT', 'Respuesta'); read as a global by `evaluation`
df_model = df_model_process(response_model)


In [None]:
# Model evaluation
def evaluation(num_sentences):
    """Score the model responses against the ground truth.

    For each of the first `num_sentences` sentences, the result line built by
    `analysis_wrapper` is written to an auxiliary CSV file. That file is then
    read back and split into columns.

    Args:
        num_sentences: Number of sentences to evaluate, starting at index 0.

    Returns:
        A DataFrame with one row per sentence and twelve string columns: ID,
        sentence, ground truth, model response, four counts and four lists.
    """
    aux_path = f"aux-validation-{analysis_file_id}.csv"

    with open(aux_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for i in range(num_sentences):
            response = df_model["Respuesta"][i]
            final_df = final_df_func(df, response, i)
            final = analysis_wrapper(final_df, i)
            writer.writerow([final])

    df_complete = pd.read_csv(aux_path, header=None)

    # Divide every row into its 12 fields using the '|' separator
    df_complete = df_complete[0].str.split('|', expand=True)

    df_complete.columns = ['ID', 'Frase', 'GT', 'Respuesta', 'N identificadas', 'N no identificadas', 'N mal analizadas', 'N mal identificadas', 'Identificadas', 'No identificadas', 'Mal analizadas', 'Mal identificadas']

    # Remove whitespaces (column-wise Series.map replaces the deprecated DataFrame.applymap)
    df_complete = df_complete.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )

    return df_complete

# NOTE: this rebinds the name `evaluation` from the function to the DataFrame it
# returns; the function definition must be executed again before another call.
evaluation = evaluation(num_sentences)

In [None]:
# Delete the intermediate CSV files
for aux_file in (f"aux-model-output-{analysis_file_id}.csv", f"aux-validation-{analysis_file_id}.csv"):
    os.remove(aux_file)

# Save the evaluation table as an Excel workbook (row index omitted)
excel_path = os.path.join(analysis_file_path, f'validation-{analysis_file_id}.xlsx')
with pd.ExcelWriter(excel_path) as writer:
    evaluation.to_excel(writer, index=False)