# Evaluation using BERTScore

[Source](https://huggingface.co/spaces/evaluate-metric/bertscore)

In [1]:
import pandas as pd

# Load the processed metrics table; later cells read its 'target',
# 'generation' and 'input' columns.
df = pd.read_csv("../../data/processed/20231220_metrics_CAUSAL.csv")

In [2]:
import ast
from collections import Counter

# 'target' is stored as the string form of a Python list; parse it back.
df['target'] = df['target'].apply(ast.literal_eval)

# Remember which source row every exploded row came from.
df['ID_row'] = df.index

# One row per (source row, candidate target) pair.
df = df.explode('target', ignore_index=True)


def get_gen(text):
    """Return the third line of a generation with the '    <assistant>: ' marker removed."""
    third_line = text.split('\n')[2]
    return third_line.replace('    <assistant>: ','')


df['output'] = df['generation'].apply(get_gen)

In [3]:
from evaluate import load
# Load the BERTScore metric through the `evaluate` loader imported above.
bertscore = load("bertscore")

Full evaluation

In [4]:
# Score every generation against its (exploded) target, one pair per row,
# with the multilingual cased BERT checkpoint selected via `model_type`.
results = bertscore.compute(predictions=df['output'], references=df['target'], model_type="bert-base-multilingual-cased")

In [5]:
# Attach the per-pair BERTScore components to the exploded frame,
# in the same column order as before: precision, recall, f1.
for metric in ('precision', 'recall', 'f1'):
    df[metric] = results[metric]

In [6]:
def filter_best_scores(df, id_column, score_column):
    """
    Reduces the dataframe to exactly one row per ID: the row with the best score.

    When several rows of the same ID tie on the best score (for example
    because the same reference appears twice), only the first of them, in the
    dataframe's existing row order, is kept. This keeps later aggregates
    (mean/std over the result) from being weighted by the number of ties.

    IDs whose scores are all NaN do not appear in the result, because NaN
    never compares equal to the group maximum.

    Parameters:
    - df (pd.DataFrame): The original dataframe
    - id_column (str): Name of the column containing the IDs to group by
    - score_column (str): Name of the column containing the scores

    Returns:
    - pd.DataFrame: An independent copy holding the best-scoring row per ID,
      with the original index labels and row order preserved
    """
    # Broadcast each ID's maximum score back onto every row of that ID
    best_scores = df.groupby(id_column)[score_column].transform('max')

    # Rows whose score equals their ID's maximum (may still contain ties)
    is_best = df[score_column] == best_scores

    # Collapse ties to a single row per ID, then detach from the source frame
    # so callers can add columns without touching `df`
    filtered_df = (
        df[is_best]
        .drop_duplicates(subset=id_column, keep='first')
        .copy(deep=True)
    )

    return filtered_df

In [7]:
# For each original row (ID_row), keep only the best-F1 match among its
# exploded targets.
df_results = filter_best_scores(df, 'ID_row', 'f1')

In [8]:
import numpy as np

# Report mean and standard deviation of every score column over the
# best-match rows. The 'hashcode' entry of `results` is skipped.
for k in results:
    if k == 'hashcode':
        continue
    score_values = df_results[k]
    print(f'{k}',f'\n\tmean: {np.mean(score_values)}\n\tstd: {np.std(score_values)}\n')

precision 
	mean: 0.9984012772817441
	std: 0.008954700796637778

recall 
	mean: 0.997340931459461
	std: 0.0178454148136428

f1 
	mean: 0.9978308397943102
	std: 0.013742493489760968



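Subtracting the input baseline: the BERTScore of the unmodified input against the same target is subtracted from the generation's score.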

In [9]:
# Baseline: score the unmodified inputs against the same targets that were
# selected for the generations.
results_input = bertscore.compute(
    predictions=df_results['input'],
    references=df_results['target'],
    model_type="bert-base-multilingual-cased",
)

metric_names = ('precision', 'recall', 'f1')

# First store all baseline columns, then all difference columns, so the
# resulting column order is *_input followed by *_diff.
for metric in metric_names:
    df_results[metric + '_input'] = results_input[metric]

for metric in metric_names:
    # Generation score minus input score
    df_results[metric + '_diff'] = df_results[metric] - df_results[metric + '_input']

In [10]:
# Report mean and standard deviation of each '<metric>_diff' column
# (generation score minus input score). The 'hashcode' entry is skipped.
for k in results:
    if k == 'hashcode':
        continue
    diff_values = df_results[k+'_diff']
    print(f'{k}',f"\n\tmean (diff): {np.mean(diff_values)}\n\tstd (diff): {np.std(diff_values)}\n")

precision 
	mean (diff): -0.0005592012496860436
	std (diff): 0.0080156009561033

recall 
	mean (diff): -0.0004410390811198203
	std (diff): 0.017936119889381312

f1 
	mean (diff): -0.0005302721124780757
	std (diff): 0.013393888082191757



# Analysing cases

In [11]:
def display_lowest_text(df, score_col, k):
    """
    Print the k rows of the dataframe with the lowest value in `score_col`.

    For every selected row the 'input', 'target' and 'output' columns are
    printed together with the score, lowest score first.

    Parameters:
    - df (pd.DataFrame): Dataframe with 'input', 'target' and 'output'
      columns plus the score column
    - score_col (str): Name of the column to rank by
    - k (int): Maximum number of rows to print; if the dataframe has fewer
      rows, all of them are printed, and a non-positive k prints nothing

    Returns:
    - None: The rows are written to stdout
    """
    # Sort ascending and keep at most k rows. max(k, 0) makes a negative k
    # select nothing, since head() would otherwise read it as
    # "all but the last |k| rows".
    lowest_rows = df.sort_values(by=score_col, ascending=True).head(max(k, 0))

    # Convert each column once and walk the rows in lockstep; this also
    # avoids indexing past the end when fewer than k rows exist.
    rows = zip(
        lowest_rows['input'].tolist(),
        lowest_rows['target'].tolist(),
        lowest_rows['output'].tolist(),
        lowest_rows[score_col].tolist(),
    )
    for original, target, generation, score in rows:
        print("Original input:\t\t",original,'\n',
              "Closest target:\t",target,'\n',
              "Generation:\t\t",generation,'\n',
              f'BERTScore {score_col}:\t',score,'\n\n')
    
# Show the 10 best-match rows with the lowest BERTScore F1.
display_lowest_text(df_results, 'f1', k=10)

Original input:		 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb 
 Closest target:	 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb 
 Generation:		 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIz 
 BERTScore f1:	 0.7066790461540

In [12]:
# Show the 5 rows with the lowest f1_diff, i.e. where the generation
# scores furthest below the unmodified input.
display_lowest_text(df_results, 'f1_diff', k=5)

Original input:		 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb 
 Closest target:	 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb 
 Generation:		 Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIz 
 BERTScore f1_diff:	 -0.2933208

In [13]:
def display_highest_text(df, score_col, k):
    """
    Print the k rows of the dataframe with the highest value in `score_col`.

    For every selected row the 'input', 'target' and 'output' columns are
    printed together with the score, highest score first. The score line is
    labelled with the name of the column that was actually ranked by.

    Parameters:
    - df (pd.DataFrame): Dataframe with 'input', 'target' and 'output'
      columns plus the score column
    - score_col (str): Name of the column to rank by
    - k (int): Maximum number of rows to print; if the dataframe has fewer
      rows, all of them are printed, and a non-positive k prints nothing

    Returns:
    - None: The rows are written to stdout
    """
    # Sort descending and keep at most k rows. max(k, 0) makes a negative k
    # select nothing, since head() would otherwise read it as
    # "all but the last |k| rows".
    highest_rows = df.sort_values(by=score_col, ascending=False).head(max(k, 0))

    # Convert each column once and walk the rows in lockstep; this also
    # avoids indexing past the end when fewer than k rows exist.
    rows = zip(
        highest_rows['input'].tolist(),
        highest_rows['target'].tolist(),
        highest_rows['output'].tolist(),
        highest_rows[score_col].tolist(),
    )
    for original, target, generation, score in rows:
        print("Original input:\t\t",original,'\n',
              "Closest target:\t",target,'\n',
              "Generation:\t\t",generation,'\n',
              f'BERTScore {score_col}:\t',score,'\n\n')
    
# Show the 2 rows with the highest f1_diff, i.e. where the generation
# gains the most over the unmodified input.
display_highest_text(df_results, 'f1_diff', k=2)

Original input:		 Los cursos permiten a los futuros estudiantes extranjeros sentar unas bases sólidas para sus estudios. 
 Closest target:	 Los cursos permiten a las/os futuras/os estudiantes extranjeras/os sentar unas bases sólidas para sus estudios. 
 Generation:		 Los cursos permiten a las/os futuras/os estudiantes extranjeras/os sentar unas bases sólidas para sus estudios. 
 BERTScore f1:	 0.10054647922515869 


Original input:		 El número de consejero son 4. 
 Closest target:	 El número de consejero(a) son 4. 
 Generation:		 El número de consejero(a) son 4. 
 BERTScore f1:	 0.06845730543136597 


