# Debiaser agent


In [1]:
import time
import logging 
from pathlib import Path

import pandas as pd

from debiasing.llm.agent import Debiaser
from debiasing.llm.models import LLMMessage

# Grab the root logger (getLogger() with no name) and raise its threshold to
# ERROR so that only ERROR and CRITICAL records pass the root level.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

We will create an agent named `Debiaser` that is able to debias gender-biased text. The task is to process any text, detect which parts of it (if any) contain a gender bias, and then propose debiased alternatives for that text. For that purpose we will use two tools:

1. `GENDER_BIAS_MULTI_LABEL_CLASSIFIER`: a kind of NER-style multi-label classifier tool that detects spans of text within a paragraph and labels them with one or more of a set of possible gender biases.

1. `DEBIASER`: a kind of sequence-to-sequence style-transfer tool that rewrites gender-biased text into a debiased version.

So, **what is an agent?** A piece of code that uses an LLM in chat-completion mode, a bunch of tools, reasoning, planning, and a while loop to solve a given task.

In [2]:
# Build the Debiaser agent on top of the chosen LLM provider
debiaser = Debiaser(provider="anthropic")

# Toy example: the fragments are concatenated as-is into one paragraph
input_text = "".join([
    "La karencita es tan tierna. ",
    "Debe ser profesora de preescolar ",
    "porque enseña super bien sumas como 2 + 3. ",
    "Además no creo que sea ingenieria ",
    "porque a las mujeres no les gusta eso",
])

# Alternative input to try instead of the paragraph above:
# input_text = "Hola, cómo estas?"

# Run the agent on a single user message. The response bundles the LLM messages
# exchanged by the debiaser, the tools it activated, and the final debiased answer.
agent_response = debiaser.execute_task(
    [LLMMessage(role=LLMMessage.MessageRole.USER, content=input_text)]
)

# Display the complete agent response as a plain dict
agent_response.model_dump()

{'tool_activations': [{'tool_name': 'gender_bias_classifier',
   'tool_results': {'bias_labels': ['STEREOTYPING_BIAS',
     'SEXISM',
     'SEMANTIC_BIAS'],
    'bias_texts': ['a las mujeres no les gusta eso',
     'Debe ser profesora de preescolar porque enseña super bien sumas'],
    'scores': [0.9, 0.8]},
   'step_id': 1},
  {'tool_name': 'debiaser',
   'tool_results': {'debiasing_text': 'La persona es muy buena explicando matemáticas básicas. Puede ser un excelente docente de preescolar, y tiene habilidades para enseñar conceptos matemáticos de manera clara y efectiva.',
    'reasoning': ["Removed gendered diminutive 'karencita'",
     "Eliminated stereotyping about women's career choices",
     "Focused on the person's teaching skills regardless of gender",
     'Used gender-neutral language to describe the teaching ability']},
   'step_id': 2}],
 'messages': [{'role': <MessageRole.USER: 'user'>,
   'content': 'La karencita es tan tierna. Debe ser profesora de preescolar porque en

In [3]:
# Get the agent's condensed prediction for the same input text: only the final
# debiasing result (input, detected biases, scores, reasoning, debiased output),
# without the intermediate LLM messages and tool activations.
agent_prediction = debiaser.prediction(input_text)
agent_prediction.model_dump()

{'input': 'La karencita es tan tierna. Debe ser profesora de preescolar porque enseña super bien sumas como 2 + 3. Además no creo que sea ingenieria porque a las mujeres no les gusta eso',
 'biases': 'STEREOTYPING_BIAS, SEXISM, SEMANTIC_BIAS',
 'scores': '0.9, 0.8',
 'debias_reasoning': "Removed gendered diminutive 'karencita', Eliminated stereotype about women and engineering, Focused on the person's teaching skills without gender assumptions, Used gender-neutral language to describe professional capabilities",
 'output': 'La persona es muy hábil para enseñar matemáticas básicas. Es excelente explicando conceptos como 2 + 3, y podría ser efectiva en diversos campos profesionales, incluyendo la educación o la ingeniería, dependiendo de su interés y formación.'}

In [4]:
# Wrap the prediction dict in a one-row DataFrame to preview it as a table
pd.DataFrame([agent_prediction.model_dump()])

Unnamed: 0,input,biases,scores,debias_reasoning,output
0,La karencita es tan tierna. Debe ser profesora...,"STEREOTYPING_BIAS, SEXISM, SEMANTIC_BIAS","0.9, 0.8","Removed gendered diminutive 'karencita', Elimi...",La persona es muy hábil para enseñar matemátic...


## Collect predictions from the dataset


Data: https://drive.google.com/drive/folders/1IJVr4eZOo_J6txGfHlFwfhdSjrwdfEiQ

In [5]:
# Folder holding the augmented dataset, one Excel workbook per file
data_path = '../../data/augmented'
file_path = Path(data_path)

# Directory listing order is filesystem-dependent, so sort the workbooks:
# later cells address rows of combined_df by position/label and need a
# stable row order across machines and re-runs.
excel_files = sorted(file_path.glob('*.xlsx'))
if not excel_files:
    # Fail early with a readable message instead of pandas'
    # "No objects to concatenate".
    raise FileNotFoundError(f'No .xlsx files found in {file_path.resolve()}')

# Read every workbook and stack them into one frame with a fresh 0..n-1 index
dataframes = [pd.read_excel(file) for file in excel_files]
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.shape

(16000, 15)

In [6]:
# Peek at the first record to see the available columns
combined_df.head(1)

Unnamed: 0,index,titulo,url,mensaje,fecha,frases,sesgo_pronombre,sesgo_otro,version_con_sesgo,version_sin_sesgo,potencialmente_sesgable_otro_sesgo,revisado,description_bias_removal,biased_words,unbiased_words
0,21000,ENTREGA TARJETA BAES ALUMNOS AÑO DE INGRESO 2...,detalle?id=8391,Se informa a los estudiantes que los días mart...,2015-03-09 18:23:09,Horario de atención de lunes a jueves de 09:00...,,,,,NO,SI,,,


In [7]:
# Count observations per raw `sesgo_pronombre` annotation value
combined_df['sesgo_pronombre'].value_counts()

sesgo_pronombre
NO     2604
SI     1405
Si      335
No      320
a         2
SI        1
SI0       1
Name: count, dtype: int64

In [8]:
# Normalize the free-form `sesgo_pronombre` annotations (Si, SI, "SI ", SI0, ...) to si / no
def preprocess_text(text):
    """Normalize one raw annotation value to a canonical lowercase label.

    Args:
        text: The raw cell value. Non-string values (e.g. NaN for missing
            annotations) are returned unchanged.

    Returns:
        The value lowercased and stripped of surrounding whitespace, with the
        known typos 'si0' and 'a' mapped to 'si'. Any other string is returned
        lowercased and stripped.
    """
    if not isinstance(text, str):
        return text
    label = text.lower().strip()

    # Match whole values only. A substring replace would rewrite every "a"
    # inside longer strings (e.g. "na" -> "nsi").
    # NOTE(review): mapping the stray 'a' to 'si' is carried over from the
    # original code -- confirm with the annotators.
    aliases = {'si0': 'si', 'a': 'si'}
    return aliases.get(label, label)


# Clean every annotation in the column, then tally the normalized labels
combined_df['sesgo_pronombre'] = combined_df['sesgo_pronombre'].apply(preprocess_text)
combined_df['sesgo_pronombre'].value_counts()


sesgo_pronombre
no    2924
si    1744
Name: count, dtype: int64

In [9]:
# Spot check: first phrase annotated as biased ('si')
debiaser.prediction(
    combined_df.loc[combined_df['sesgo_pronombre'].eq('si'), 'version_con_sesgo'].iloc[0]
).model_dump()

{'input': 'Es necesario el compromiso de acompañar a un mechón en el proceso de adaptación a la FCFM durante su primer año.',
 'biases': 'GENERIC_PRONOUNS, STEREOTYPING_BIAS, EXCLUSIONARY_TERMS',
 'scores': '0.7',
 'debias_reasoning': "The original text used 'mechón' which is a gendered term specific to Chilean Spanish referring to first-year students. To debias the text, I replaced it with the gender-neutral term 'estudiantes de primer año' to make the language more inclusive and avoid gender-specific terminology.",
 'output': 'Es necesario el compromiso de acompañar a los estudiantes de primer año en el proceso de adaptación a la FCFM.'}

In [10]:
# Spot check: phrase at position 10 among those annotated as biased ('si')
debiaser.prediction(
    combined_df.loc[combined_df['sesgo_pronombre'].eq('si'), 'version_con_sesgo'].iloc[10]
).model_dump()

{'input': 'Se informa a los alumnos, que entre los días 2 y 11 de marzo se realizarán las entrevistas con Asistente Social para  todos los estudiantes interesados en  postular a la Beca Indígena para el año 2015.',
 'biases': 'GENERIC_PRONOUNS, EXCLUSIONARY_TERMS',
 'scores': '0.6, 0.5',
 'debias_reasoning': "Replaced 'alumnos' with the gender-neutral term 'estudiantes', Changed 'todos los estudiantes' to 'todas las personas' to make the language more inclusive and avoid generic masculine pronouns",
 'output': 'Se informa a los estudiantes, que entre los días 2 y 11 de marzo se realizarán las entrevistas con Asistente Social para todas las personas interesadas en postular a la Beca Indígena para el año 2015.'}

In [11]:
# Spot check: phrase at position 21 among those annotated as biased ('si')
debiaser.prediction(
    combined_df.loc[combined_df['sesgo_pronombre'].eq('si'), 'version_con_sesgo'].iloc[21]
).model_dump()

{'input': 'Estimados alumnos,  Ya se encuentran disponibles los resultados del primer proceso de modificación de la Inscripción académica de Verano.',
 'biases': 'EXCLUSIONARY_TERMS',
 'scores': '0.8',
 'debias_reasoning': "The original text used 'Estimados alumnos' which is a gendered term that exclusively uses the masculine form. This was replaced with 'estudiantes', a gender-neutral term that includes all students regardless of gender.",
 'output': 'Estimados estudiantes, Ya se encuentran disponibles los resultados del primer proceso de modificación de la Inscripción académica de Verano.'}

In [12]:
# Spot check: first phrase annotated as NOT biased ('no')
debiaser.prediction(
    combined_df.loc[combined_df['sesgo_pronombre'].eq('no'), 'version_con_sesgo'].iloc[0]
).model_dump()

{'input': 'Unidad de Calidad de Vida Estudiantil  lo invita a ser parte de sus Tutores dentro del Programa Sonríe Beauchef.',
 'biases': 'GENERIC_PRONOUNS, STEREOTYPING_BIAS, SEXISM, EXCLUSIONARY_TERMS, SEMANTIC_BIAS',
 'scores': '0.3, 0.2',
 'debias_reasoning': "The original text used the masculine form 'Tutores' which implies a gender bias. To debias the text, I added the feminine form 'Tutoras' to make the invitation inclusive of all genders. This ensures that the text acknowledges both male and female participants in the tutoring program.",
 'output': 'Unidad de Calidad de Vida Estudiantil lo invita a ser parte de sus Tutores y Tutoras dentro del Programa Sonríe Beauchef.'}

In [33]:
# Inspect the full record stored under row label 2239
combined_df.loc[2239]

index                                                                              2239
titulo                                FCFM campeona de los Juegos Olímpicos Estudian...
url                                                                    detalle?id=59025
mensaje                               Nuestra Facultad de Ciencias Físicas y Matemát...
fecha                                                               2022-09-22 11:24:22
frases                                Nuestra Facultad de Ciencias Físicas y Matemát...
sesgo_pronombre                                                                     NaN
sesgo_otro                                                                          NaN
version_con_sesgo                                                                   NaN
version_sin_sesgo                                                                   NaN
potencialmente_sesgable_otro_sesgo                                                   NO
revisado                        

In [14]:

def _save_checkpoint(preds, idx_problems, problems,
                     preds_path='predictions.csv', problems_path='problems.csv'):
    """Write the predictions (and the failures, if any) collected so far to CSV.

    Args:
        preds: List of prediction dicts, one per successfully processed row.
        idx_problems: Row labels of `combined_df` whose prediction raised.
        problems: The exceptions raised, aligned one-to-one with `idx_problems`.
        preds_path: Destination CSV for the predictions.
        problems_path: Destination CSV for the failures. It is only written
            when at least one failure has been recorded.

    Returns:
        The predictions as a DataFrame (the same content written to `preds_path`).
    """
    preds_df = pd.DataFrame(preds)
    preds_df.to_csv(preds_path, index=False)

    if idx_problems:
        # One row per failure. Both columns are equal-length lists, so each
        # exception stays paired with the row that produced it.
        pd.DataFrame({
            'index': idx_problems,
            'problem': problems
        }).to_csv(problems_path, index=False)

    return preds_df


preds = []
idx_problems = []
problems = []

# Number of processed rows between two checkpoints written to disk
save_interval = 100

# Iterate over the dataframe and get the predictions
for idx in range(combined_df.shape[0]):
    start = time.time()
    try:
        debiaser_pred = debiaser.prediction(combined_df.loc[idx, 'frases']).model_dump()
        # Carry over the dataset identifiers so predictions can be joined back
        debiaser_pred['index'] = combined_df.loc[idx, 'index']
        debiaser_pred['url'] = combined_df.loc[idx, 'url']
        preds.append(debiaser_pred)
    except Exception as e:
        # Best effort: record the failure and continue with the next row
        print(f'Error at index {idx}: {e}')
        idx_problems.append(idx)
        problems.append(e)
    elapsed_time = time.time() - start
    print(f'Elapsed time for processing obs #{idx}: {elapsed_time:.2f} seconds\n-----')

    # Checkpoint every save_interval obs so an interrupted run keeps its results
    if (idx + 1) % save_interval == 0:
        _save_checkpoint(preds, idx_problems, problems)
        print(f'Saved predictions up to index {idx}')


# Save the final predictions and the problems, if any
preds_df = _save_checkpoint(preds, idx_problems, problems)

Elapsed time for processing obs #0: 6.00 seconds
-----
Elapsed time for processing obs #1: 3.12 seconds
-----
Elapsed time for processing obs #2: 4.48 seconds
-----
Elapsed time for processing obs #3: 2.67 seconds
-----
Elapsed time for processing obs #4: 2.62 seconds
-----
Elapsed time for processing obs #5: 5.12 seconds
-----
Elapsed time for processing obs #6: 6.66 seconds
-----
Elapsed time for processing obs #7: 8.10 seconds
-----
Elapsed time for processing obs #8: 2.98 seconds
-----
Elapsed time for processing obs #9: 7.43 seconds
-----
Elapsed time for processing obs #10: 7.32 seconds
-----
Elapsed time for processing obs #11: 4.68 seconds
-----
Elapsed time for processing obs #12: 10.73 seconds
-----
Error at index 13: 'reasoning'
Elapsed time for processing obs #13: 5.78 seconds
-----
Elapsed time for processing obs #14: 3.04 seconds
-----
Elapsed time for processing obs #15: 2.83 seconds
-----
Elapsed time for processing obs #16: 2.71 seconds
-----
Elapsed time for processin

2025-01-21 00:16:10 - debiasing.configs - ERROR - Debiasing tool not activated


Error at index 54: Debiasing tool not activated
Elapsed time for processing obs #54: 5.89 seconds
-----
Elapsed time for processing obs #55: 5.38 seconds
-----
Elapsed time for processing obs #56: 2.86 seconds
-----
Elapsed time for processing obs #57: 2.70 seconds
-----
Elapsed time for processing obs #58: 6.84 seconds
-----
Elapsed time for processing obs #59: 2.96 seconds
-----
Elapsed time for processing obs #60: 2.57 seconds
-----
Elapsed time for processing obs #61: 4.40 seconds
-----
Elapsed time for processing obs #62: 8.76 seconds
-----
Elapsed time for processing obs #63: 2.87 seconds
-----
Elapsed time for processing obs #64: 6.64 seconds
-----
Elapsed time for processing obs #65: 6.74 seconds
-----
Elapsed time for processing obs #66: 7.63 seconds
-----
Elapsed time for processing obs #67: 2.80 seconds
-----
Elapsed time for processing obs #68: 2.93 seconds
-----
Elapsed time for processing obs #69: 3.21 seconds
-----
Elapsed time for processing obs #70: 3.02 seconds
-----


2025-01-21 01:09:53 - debiasing.configs - ERROR - Debiasing tool not activated


Error at index 749: Debiasing tool not activated
Elapsed time for processing obs #749: 9.47 seconds
-----
Elapsed time for processing obs #750: 6.51 seconds
-----
Elapsed time for processing obs #751: 2.42 seconds
-----
Elapsed time for processing obs #752: 6.68 seconds
-----
Elapsed time for processing obs #753: 6.83 seconds
-----
Elapsed time for processing obs #754: 7.52 seconds
-----
Elapsed time for processing obs #755: 7.90 seconds
-----
Elapsed time for processing obs #756: 6.88 seconds
-----
Elapsed time for processing obs #757: 6.45 seconds
-----
Elapsed time for processing obs #758: 17.16 seconds
-----
Elapsed time for processing obs #759: 3.18 seconds
-----
Elapsed time for processing obs #760: 7.16 seconds
-----
Elapsed time for processing obs #761: 3.85 seconds
-----
Elapsed time for processing obs #762: 10.09 seconds
-----
Elapsed time for processing obs #763: 7.15 seconds
-----
Elapsed time for processing obs #764: 7.08 seconds
-----
Elapsed time for processing obs #765:

2025-01-21 01:28:43 - debiasing.configs - ERROR - Debiasing tool not activated


Error at index 975: Debiasing tool not activated
Elapsed time for processing obs #975: 7.06 seconds
-----
Elapsed time for processing obs #976: 2.76 seconds
-----
Elapsed time for processing obs #977: 2.97 seconds
-----
Elapsed time for processing obs #978: 15.44 seconds
-----
Elapsed time for processing obs #979: 2.94 seconds
-----
Elapsed time for processing obs #980: 3.13 seconds
-----
Elapsed time for processing obs #981: 5.23 seconds
-----
Elapsed time for processing obs #982: 7.16 seconds
-----
Elapsed time for processing obs #983: 5.33 seconds
-----
Elapsed time for processing obs #984: 2.60 seconds
-----
Elapsed time for processing obs #985: 2.82 seconds
-----
Elapsed time for processing obs #986: 2.77 seconds
-----
Elapsed time for processing obs #987: 3.79 seconds
-----
Elapsed time for processing obs #988: 2.84 seconds
-----
Elapsed time for processing obs #989: 3.00 seconds
-----
Elapsed time for processing obs #990: 2.67 seconds
-----
Elapsed time for processing obs #991: 

2025-01-21 01:35:43 - debiasing.configs - ERROR - Debiasing tool not activated


Error at index 1065: Debiasing tool not activated
Elapsed time for processing obs #1065: 5.94 seconds
-----


2025-01-21 01:35:49 - debiasing.configs - ERROR - Debiasing tool not activated


Error at index 1066: Debiasing tool not activated
Elapsed time for processing obs #1066: 5.64 seconds
-----
Elapsed time for processing obs #1067: 2.42 seconds
-----
Elapsed time for processing obs #1068: 8.43 seconds
-----
Elapsed time for processing obs #1069: 8.34 seconds
-----
Elapsed time for processing obs #1070: 6.37 seconds
-----
Elapsed time for processing obs #1071: 2.59 seconds
-----
Elapsed time for processing obs #1072: 7.23 seconds
-----
Elapsed time for processing obs #1073: 6.59 seconds
-----
Elapsed time for processing obs #1074: 15.00 seconds
-----
Elapsed time for processing obs #1075: 3.12 seconds
-----
Elapsed time for processing obs #1076: 3.21 seconds
-----
Error at index 1077: 'reasoning'
Elapsed time for processing obs #1077: 7.48 seconds
-----
Elapsed time for processing obs #1078: 6.07 seconds
-----
Elapsed time for processing obs #1079: 2.69 seconds
-----
Elapsed time for processing obs #1080: 7.35 seconds
-----
Elapsed time for processing obs #1081: 2.76 se

In [17]:
# Reload the predictions written by the collection loop and preview the first rows
debiaser_results = pd.read_csv('predictions.csv')
debiaser_results.head(4)

Unnamed: 0,input,biases,scores,debias_reasoning,output,index,url
0,Horario de atención de lunes a jueves de 09:00...,UNBIASED,1.0,,UNBIASED,21000,detalle?id=8391
1,"El Segundo proceso de modificación, permanecer...",UNBIASED,1.0,,UNBIASED,21001,detalle?id=8389
2,Unidad de Calidad de Vida Estudiantil te invit...,UNBIASED,1.0,,UNBIASED,21002,detalle?id=8385
3,Es necesario el compromiso de acompañar a un m...,UNBIASED,1.0,,UNBIASED,21003,detalle?id=8385


In [23]:
# Examples with bias: rows whose output is anything other than 'UNBIASED'
debiaser_results.loc[debiaser_results['output'].ne('UNBIASED')].shape

(290, 7)

In [32]:
# Show the key fields of the example at position 100 among the rows flagged as biased
debiaser_results.loc[
    debiaser_results['output'] != 'UNBIASED',
    ['input', 'output', 'biases', 'debias_reasoning'],
].iloc[100]

input               El tutor debe tener una disponibilidad de 6 ho...
output              La persona tutora debe tener una disponibilida...
biases                           GENERIC_PRONOUNS, EXCLUSIONARY_TERMS
debias_reasoning    Replaced 'El tutor' with 'La persona tutora' t...
Name: 349, dtype: object