# Tuning LLM-judge

In [1]:
# Loading train eval sets
import pandas as pd

# Read the train and eval splits from their Excel files.
df_train = pd.read_excel("../../data/train_val_splits/train_data.xlsx")
df_eval = pd.read_excel("../../data/train_val_splits/val_data.xlsx")

# Report the number of rows in each split, then preview the eval split.
print("Length of train set: ", df_train.shape[0])
print("Length of eval set: ", df_eval.shape[0])
df_eval.head()

Length of train set:  14400
Length of eval set:  1600


Unnamed: 0,index,titulo,url,mensaje,fecha,frases,sesgo_pronombre,sesgo_otro,version_con_sesgo,version_sin_sesgo,potencialmente_sesgable_otro_sesgo,revisado,description_bias_removal,biased_words,unbiased_words
0,14186,Atención Secretaría de Estudios jueves 4 de oc...,detalle?id=22733,Se informa a la comunidad estudiantil que el d...,2018-10-03 10:09:03,Se informa a la comunidad estudiantil que el d...,NO,SI,Se informa a todos los estudiantes que el día...,Se informa a la comunidad estudiantil que el d...,SI,SI,"Se reemplazó ""estudiantes"" por ""comunidad estu...",estudiantes,"comunidad estudiantil, Alumn@s"
1,2906,Comunicado FCFM: Extensión vacaciones invierno...,detalle?id=55461,Viernes 17 de junio de 2022 \nEstimada comunid...,2022-06-17 11:22:17,Ésta permite el teletrabajo para funcionarias/...,NO,NO,Ésta permite el teletrabajo para funcionarios ...,Ésta permite el teletrabajo para funcionarias/...,SI,SI,"Se incluyó la perspectiva femenina en ""funcion...","funcionarios, hijos","funcionarias/os, hijas/os"
2,13936,Mañana en la FCFM - lunes 5 de noviembre,detalle?id=23285,"WORKSHOP: \n\n""Workshop en Macroeconomía"". \n\...",2018-10-31 17:07:31,Hora: 16:30 hrs Lugar: Auditorio DIE.,,,,,,SI,,,
3,14121,MAÑANA EN LA FCFM- Miércoles 10 de octubre de ...,detalle?id=22855,"CHARLAS: \n_x000D_\n""Dinámica controlada por l...",2018-10-09 17:19:09,"Lugar: Patio central, Beauchef 851.",,,,,NO,SI,,,
4,23320,"Cierre SEMDA, viernes 18 de enero",detalle?id=5512,Se comunica que el SEMDA Central y los consult...,2013-01-16 16:11:16,En caso de presentar alguna patología de urgen...,,,,,NO,SI,,,


In [2]:
# How many rows of each split carry an unbiased version, and what share of the split that is.
len_notna_train = int(df_train['version_sin_sesgo'].notna().sum())
pct_train = 100*len_notna_train/len(df_train)
print('Rows with unbiased text train: ',len_notna_train,f' ({pct_train}%)')

len_notna_eval = int(df_eval['version_sin_sesgo'].notna().sum())
pct_eval = 100*len_notna_eval/len(df_eval)
print('Rows with unbiased text eval: ',len_notna_eval,f' ({pct_eval}%)')

Rows with unbiased text train:  4140  (28.75%)
Rows with unbiased text eval:  443  (27.6875%)


In [3]:
# Same count as for the unbiased column, now for rows that carry a biased version.
biased_train = df_train['version_con_sesgo'].dropna()
len_notna_train = biased_train.shape[0]
print('Rows with biased text train: ',len_notna_train,f' ({100*len_notna_train/df_train.shape[0]}%)')

biased_eval = df_eval['version_con_sesgo'].dropna()
len_notna_eval = biased_eval.shape[0]
print('Rows with biased text eval: ',len_notna_eval,f' ({100*len_notna_eval/df_eval.shape[0]}%)')

Rows with biased text train:  4140  (28.75%)
Rows with biased text eval:  443  (27.6875%)


In [4]:
# List the distinct raw labels in `sesgo_pronombre` to spot inconsistent spellings.
df_eval['sesgo_pronombre'].unique()

array(['NO', nan, 'SI', 'No', 'Si'], dtype=object)

In [5]:
# Map the mixed-case label variants onto the canonical "SI" / "NO" spellings.
pronoun_label_fixes = {"Si": "SI", "No": "NO"}
df_eval["sesgo_pronombre"] = df_eval["sesgo_pronombre"].replace(pronoun_label_fixes)

# Label distribution after normalisation (missing values are not counted).
df_eval["sesgo_pronombre"].value_counts()

sesgo_pronombre
NO    283
SI    160
Name: count, dtype: int64

In [6]:
# List the distinct raw labels in `sesgo_otro` to spot inconsistent spellings.
df_eval['sesgo_otro'].unique()

array(['SI', 'NO', nan, 'No', 'Si', 'nO', 'si', 's'], dtype=object)

In [7]:
# Every non-canonical spelling seen in this column, mapped to "SI" / "NO".
other_bias_label_fixes = {
    "Si": "SI",
    "si": "SI",
    "s": "SI",
    "No": "NO",
    "nO": "NO",
}
df_eval["sesgo_otro"] = df_eval["sesgo_otro"].replace(other_bias_label_fixes)

# Label distribution after normalisation (missing values are not counted).
df_eval["sesgo_otro"].value_counts()

sesgo_otro
NO    265
SI    179
Name: count, dtype: int64

## Three sets for LLM-Judge evaluation

We will evaluate the capacity of the judge in three different categories:
1) distinguish between biased and unbiased text
2) distinguish between a completely unbiased and a partially unbiased text
3) distinguish between an output that keeps the semantics of the input, an output where part of the input has been cut out, and an output that has completely lost the semantics of the input.

For the first problem we will create an eval set made of three groups: 100 rows that could have bias but don't (the unbiased text is given as both input and output), 323 rows that are biased (the biased text is given as input and the unbiased text as output), and 200 rows where the text does not admit bias in the first place (the input is the same as the output).

In [8]:
def assign_input_output(df_in, n_biased=323, n_unbiasable=200, random_state=42):
    """Fill `input`, `output` and `expected_outcome` to build judge test cases.

    Works on a deep copy of ``df_in``; the caller's frame is left untouched.
    Two disjoint groups of rows are labelled, and every other row keeps
    ``None`` in the three new columns:

    * ``n_biased`` rows sampled among those with a non-null
      ``version_con_sesgo``: ``input`` is the biased text, ``output`` is
      ``version_sin_sesgo`` and ``expected_outcome`` is ``"(Y)"``.
    * ``n_unbiasable`` rows sampled among those where both
      ``version_con_sesgo`` and ``version_sin_sesgo`` are null: ``input``
      and ``output`` are both ``mensaje`` and ``expected_outcome`` is
      ``"(Z)"``.

    NOTE: the group of 100 "could have bias but don't" rows described in the
    notebook text is not produced by this function.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain the columns ``version_con_sesgo``, ``version_sin_sesgo``
        and ``mensaje``.
    n_biased : int, default 323
        Number of biased rows to sample.
    n_unbiasable : int, default 200
        Number of rows without any biased/unbiased version to sample.
    random_state : int, default 42
        Seed passed to both ``DataFrame.sample`` calls.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` with the three extra columns.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when a group holds fewer rows than requested.
    """
    # `deep` expects a boolean; the previous string 'True' only worked because it is truthy.
    df = df_in.copy(deep=True)
    df["input"] = None
    df["output"] = None
    df["expected_outcome"] = None

    unbiased = "version_sin_sesgo"
    biased = "version_con_sesgo"

    # Rows where the text is biased and needs correction (input = biased, output = unbiased)
    biased_rows = df[df[biased].notna()].sample(n_biased, random_state=random_state)
    df.loc[biased_rows.index, "input"] = biased_rows[biased]
    df.loc[biased_rows.index, "output"] = biased_rows[unbiased]
    df.loc[biased_rows.index, "expected_outcome"] = "(Y)"

    # Rows where the text is unbiasable (input = output = message)
    unbiasable_mask = df[biased].isna() & df[unbiased].isna()
    unbiasable_rows = df[unbiasable_mask].sample(n_unbiasable, random_state=random_state)
    df.loc[unbiasable_rows.index, "input"] = unbiasable_rows["mensaje"]
    df.loc[unbiasable_rows.index, "output"] = unbiasable_rows["mensaje"]
    df.loc[unbiasable_rows.index, "expected_outcome"] = "(Z)"

    return df

# Rebind df_eval to the copy returned by the helper, which carries the
# `input`, `output` and `expected_outcome` columns.
df_eval = assign_input_output(df_eval)

Example row

In [9]:
# Show one labelled row (fixed seed so the same row is displayed on every run).
df_eval[df_eval["expected_outcome"].notna()].sample(1, random_state=42)

Unnamed: 0,index,titulo,url,mensaje,fecha,frases,sesgo_pronombre,sesgo_otro,version_con_sesgo,version_sin_sesgo,potencialmente_sesgable_otro_sesgo,revisado,description_bias_removal,biased_words,unbiased_words,input,output,expected_outcome
1590,1347,Matrícula 2023 Estudiantes Antiguos  proceso ...,detalle?id=63125,El proceso de Matrícula 2023 para Estudiantes ...,2023-01-31 15:19:31,El proceso de Matrícula 2023 para Estudiantes ...,SI,NO,El proceso de Matrícula 2023 para Estudiantes ...,El proceso de Matrícula 2023 para Estudiantes ...,SI,SI,Se incluyó la forma femenina de los adjetivos ...,Estudiantes Antiguos Rezagados,Estudiantes Antiguas/os Rezagadas/os,El proceso de Matrícula 2023 para Estudiantes ...,El proceso de Matrícula 2023 para Estudiantes ...,(Y)


## Judge evaluation

In [10]:
from mistralai import Mistral, UserMessage

# Name of the model used as judge; it is also stored with every judge answer.
model = "mistral-large-latest"

# The API key is read from the first line of a local file, so it never appears in the notebook.
with open('Mistral_key', 'r') as key_file:
    mistral_key = key_file.readline().strip()

client = Mistral(api_key=mistral_key)

In [13]:
from judge import make_message, JUDGE_PROMPT
import time

# Create the three judge columns, initialised to None: the judge's answer,
# the model that produced it and the prompt that was sent.
df_eval['judge_answer'] = None
df_eval['judge_model'] = None
df_eval['judge_prompt'] = None

# Only rows that were turned into a test case carry an input/output pair;
# all other rows hold None there, so they are skipped instead of spending
# an API call on an empty prompt.
rows_to_judge = df_eval[df_eval['input'].notna() & df_eval['output'].notna()]

# Ask the judge about each test case and store its answer on the same row of df_eval
for index, row in rows_to_judge.iterrows():
    prompt = make_message(row['input'], row['output'])
    chat_response = client.chat.complete(
        model = model,
        messages = prompt,
    )
    judge_eval = chat_response.choices[0].message.content
    df_eval.at[index, 'judge_answer'] = judge_eval
    df_eval.at[index, 'judge_model'] = model
    df_eval.at[index, 'judge_prompt'] = prompt

    time.sleep(0.2)  # fixed pause between requests, intended to stay under the API rate limit

In [14]:
# Save the full frame, judge columns included, without the pandas index.
df_eval.to_csv('judge_eval/20250203_mistral-large-judge_eval.csv', index=False)

In [15]:
# Number of rows in the frame that was just written to disk.
len(df_eval)

1600

---

In [1]:
import pandas as pd

# Reload the judge results from the CSV written earlier in this notebook
# (execution counts restart at 1 from here on).
df_eval = pd.read_csv("judge_eval/20250203_mistral-large-judge_eval.csv")
len(df_eval)

1600

In [2]:
# Drop every row that was never turned into a judge test case.
df_eval = df_eval.dropna(subset=["expected_outcome"])
df_eval.shape[0]

523

In [3]:
# Number of test cases per expected judge outcome.
df_eval["expected_outcome"].value_counts()

expected_outcome
(Y)    323
(Z)    200
Name: count, dtype: int64

In [4]:
# Fix for an omission: the test cases whose input is already unbiased were never created.
# Sample 100 rows that have an unbiased version and use that text as both input and output.
has_unbiased_text = df_eval["version_sin_sesgo"].notna()
unbiased_rows = df_eval[has_unbiased_text].sample(100, random_state=42)

# New frame with the three test-case columns overwritten; all other columns are carried over.
new_rows = unbiased_rows.assign(
    input=unbiased_rows["version_sin_sesgo"],
    output=unbiased_rows["version_sin_sesgo"],
    expected_outcome="(Z)",
)

In [5]:
from mistralai import Mistral, UserMessage

# Judge model identifier, recorded next to each answer.
model = "mistral-large-latest"

# Read the API key from the first line of the local key file.
with open('Mistral_key', 'r') as key_handle:
    mistral_key = key_handle.readline().strip()

client = Mistral(api_key=mistral_key)

In [6]:
import time
from judge import make_message, JUDGE_PROMPT


# Reset the three judge columns to None before filling them in below
for judge_column in ('judge_answer', 'judge_model', 'judge_prompt'):
    new_rows[judge_column] = None

# Query the judge once per new row and record its answer next to the test case
for index, row in new_rows.iterrows():
    prompt = make_message(row['input'], row['output'])
    judge_eval = client.chat.complete(model=model, messages=prompt).choices[0].message.content

    new_rows.at[index, 'judge_answer'] = judge_eval
    new_rows.at[index, 'judge_model'] = model
    new_rows.at[index, 'judge_prompt'] = prompt

    time.sleep(0.2)  # short fixed pause between consecutive API calls

In [7]:
# Append the new rows to the existing dataframe
# (ignore_index=True renumbers the combined frame from 0).
# NOTE(review): re-running this cell appends new_rows again — run it once per session.
df_eval = pd.concat([df_eval, new_rows], ignore_index=True)

In [8]:
# Number of test cases per expected judge outcome, now including the appended rows.
df_eval["expected_outcome"].value_counts()

expected_outcome
(Y)    323
(Z)    300
Name: count, dtype: int64

In [9]:
# NOTE(review): this writes to the same path the frame was loaded from in this session,
# so the original file is replaced by the filtered and extended frame — confirm this is intended.
df_eval.to_csv('judge_eval/20250203_mistral-large-judge_eval.csv', index=False)

In [10]:
# Columns that describe a judge test case; the judge's own answers are left out.
judge_case_columns = [
    'index',
    'mensaje',
    'sesgo_pronombre',
    'sesgo_otro',
    'version_con_sesgo',
    'version_sin_sesgo',
    'input',
    'output',
    'expected_outcome',
]
df_eval_judge = df_eval[judge_case_columns]

df_eval_judge.head()

Unnamed: 0,index,mensaje,sesgo_pronombre,sesgo_otro,version_con_sesgo,version_sin_sesgo,input,output,expected_outcome
0,14186,Se informa a la comunidad estudiantil que el d...,NO,SI,Se informa a todos los estudiantes que el día...,Se informa a la comunidad estudiantil que el d...,Se informa a todos los estudiantes que el día...,Se informa a la comunidad estudiantil que el d...,(Y)
1,1755,Estimad@s estudiantes: \nLes esperamos hoy jue...,NO,NO,Estimados alumnos: Los esperamos hoy jueves 24...,Estimad@s estudiantes: Les esperamos hoy jueve...,Estimados alumnos: Los esperamos hoy jueves 24...,Estimad@s estudiantes: Les esperamos hoy jueve...,(Y)
2,17807,Estimada Comunidad: \n\n\n_x000D_\nLa Unidad d...,SI,NO,Para inscribirse pueden llamar al 229780730 o ...,Para inscribirse pueden llamar al 229780730 o ...,Para inscribirse pueden llamar al 229780730 o ...,Para inscribirse pueden llamar al 229780730 o ...,(Y)
3,20964,Se informa a los estudiantes año de ingreso 20...,SI,SI,Se informa a los estudiantes año de ingreso 20...,Se informa a las/os estudiantes año de ingreso...,Se informa a los estudiantes año de ingreso 20...,Se informa a las/os estudiantes año de ingreso...,(Y)
4,920,"Estimada Comunidad, \n\nCompartimos con ustede...",NO,NO,"Estimados alumnos, Compartimos con ustedes inf...","Estimada Comunidad, Compartimos con ustedes in...","Estimados alumnos, Compartimos con ustedes inf...","Estimada Comunidad, Compartimos con ustedes in...",(Y)


In [11]:
# Store the judge test cases in the same directory the train/val splits were read from.
df_eval_judge.to_csv('../../data/train_val_splits/val_judge.csv', index=False)