In [1]:
import re
from collections import Counter

In [12]:
def B3_0(mentions, true_clusters, pred_clusters):
  '''
  Calculates precision, recall, and optionally F1 for the  B3(0) metric,
  based on formulation in https://aclanthology.org/W10-4305.pdf

  returns precision, recall and f1 as lists for the input sentence
  '''

  precision_scores = []
  recall_scores = []
  f1_scores = []

  for mention in mentions:
    precision = 0
    recall = 0

    # finding key and response clusters to look at (first cluster to come up that contains current mention)
    mention_key_cluster = None
    for cluster in true_clusters:
      if mention in cluster:
        mention_key_cluster = cluster
        break
    assert mention_key_cluster, "At least one true cluster must contain mention!"

    mention_pred_cluster = None
    for cluster in pred_clusters:
      if mention in cluster:
        mention_response_cluster = cluster
        break

    intersection_key_response = list((Counter(mention_key_cluster) & Counter(mention_response_cluster)).elements())
    overlap_count = len(intersection_key_response)

    # response cluster could be empty if mention was not predicted for any cluster (twinless mention); in this case precision and recall both at 0
    if mention_response_cluster:
      precision = overlap_count / len(mention_response_cluster)
      recall = overlap_count / len(mention_key_cluster)

    precision_scores.append(precision)
    recall_scores.append(recall)
# Check for zero division
    if precision + recall != 0:
        f1_scores.append((2 * precision * recall) / (precision + recall))
    else:
        f1_scores.append(0.0)
  return precision_scores, recall_scores, f1_scores
def global_B3_0(precision_scores, recall_scores, F1_scores):
  '''
  Calculates global precision, recall and F1 scores based on lists of
  individual B3_0 precision/recall/F1 scores per mention
  '''

  B3_0_precision = sum(precision_scores)/len(precision_scores)
  B3_0_recall = sum(recall_scores)/len(recall_scores)
  B3_0_F1 = sum(F1_scores)/len(F1_scores)

  return B3_0_precision, B3_0_recall, B3_0_F1

In [34]:
import pandas as pd

# Dummy GAP DataFrame
gap_df = pd.DataFrame({
    'ID': ['test-1', 'test-2'],
    'Pronoun': ['he', 'she'],
    'A': ['John', 'Anna'],
    'B': ['Mike', 'Emily']
})

# Dummy Predictions DataFrame
predictions_df = pd.DataFrame({
    'ID': ['test-1', 'test-2'],
    'A-coref': [False, True],
    'B-coref': [True, False]
})

# Merge the two DataFrames on the "ID" column
merged_df = pd.merge(gap_df, predictions_df, on='ID')


In [93]:
def generate_clusters_and_mentions(merged_df):
    # lists to store true and predicted clusters, and mentions for each sentence
    true_clusters_list = []
    pred_clusters_list = []
    mentions_list = []
    gender_list = []
    # Iterate through the merged DataFrame to generate clusters and mentions
    for _, row in merged_df.iterrows():
        pronoun = row['Pronoun']
        option_a = row['A']
        option_b = row['B']
        a_coref = row['A-coref']
        b_coref = row['B-coref']

        # mentions for this instance
        mentions = [pronoun, option_a, option_b]

        # true clusters 
        true_cluster = [[pronoun, option_a], [option_b]]
        gendered_pronouns = row['Pronoun_old'] if 'Pronoun_old' in merged_df.columns else row['Pronoun']
        gendered_pronouns = gendered_pronouns.lower()
        if gendered_pronouns == 'he' or gendered_pronouns == 'his' or gendered_pronouns == 'him':
            gender_list.append('M')
        elif gendered_pronouns == 'she' or gendered_pronouns == 'her' or gendered_pronouns == 'hers':
             gender_list.append('F')
        # pred clusters based on model output
        pred_cluster = []
        if a_coref:
            pred_cluster.append([pronoun, option_a])
        elif b_coref:
            pred_cluster.append([pronoun, option_b])
        else:
            pred_cluster.append([pronoun])

        true_clusters_list.append(true_cluster)
        pred_clusters_list.append(pred_cluster)
        mentions_list.append(mentions)

    return true_clusters_list, pred_clusters_list, mentions_list, gender_list


true_clusters_list, pred_clusters_list, mentions_list, gender_list = generate_clusters_and_mentions(merged_df)

# run B3_0 function for each sentence
for mentions, true_clusters, pred_clusters in zip(mentions_list, true_clusters_list, pred_clusters_list):
    precision_scores, recall_scores, f1_scores = B3_0(mentions, true_clusters, pred_clusters)
    print(f"For mentions {mentions}:")
    print(f"Precision: {precision_scores}, Recall: {recall_scores}, F1: {f1_scores}")


For mentions ['their', 'Bob Suter', 'Dehner']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['them', 'Alonso', 'Alfredo Di St*fano']:
Precision: [1.0, 1.0, 0.0], Recall: [0.5, 0.5, 0.0], F1: [0.6666666666666666, 0.6666666666666666, 0.0]
For mentions ['they', 'Ali Aladhadh', 'Saddam']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['their', 'Alliata', 'Pisciotta']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['their', 'Eddie', 'Rock Reilly']:
Precision: [1.0, 1.0, 0.0], Recall: [0.5, 0.5, 0.0], F1: [0.6666666666666666, 0.6666666666666666, 0.0]
For mentions ['they', 'Jewel Staite', 'Keller']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['they', 'Allison', 'Grace Smythe']:
Precision: [1.0, 1.0, 0.0], Recall: [0.5, 0.5, 0.0], F1: [0.6666666666666666, 0.6666666666666666, 0.0]
For mentions ['they', 'Sophie', 'Jeni']:
Precision: [

In [88]:
mentions

['they', 'Vicky Austin', "Polly O'Keefe"]

In [36]:
B3_0(mentions, true_clusters, pred_clusters)

([1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0])

In [135]:
gap_file = "gap-test.tsv"
predictions_file = "predictions-test.tsv"
# gap_file = "gap-test-gn.tsv"
# predictions_file = "predictions-test-gn.tsv"

In [136]:

# Reading GAP DataFrame from a TSV file
gap_df = pd.read_csv(gap_file, sep='\t')
gap_df.rename(columns={'A-coref': 'gold_A-coref', 'B-coref': 'gold_B-coref'},inplace=True)# Reading Predictions DataFrame from a TSV file
predictions_df = pd.read_csv(predictions_file, sep='\t')

# Merge the two DataFrames on the "ID" column
merged_df = pd.merge(gap_df, predictions_df, on='ID')

# Display the first few rows to check
merged_df.head(1)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,gold_A-coref,B,B-offset,gold_B-coref,URL,A-coref,B-coref
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner,False,True


In [137]:
gap_df.head(1)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,gold_A-coref,B,B-offset,gold_B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner


In [138]:
true_clusters_list, pred_clusters_list, mentions_list, gender_list = generate_clusters_and_mentions(merged_df)
# B3_0(mentions_list, true_clusters_list, pred_clusters_list)

In [139]:
merged_df

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,gold_A-coref,B,B-offset,gold_B-coref,URL,A-coref,B-coref
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner,False,True
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso,True,False
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh,True,False
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta,False,True
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,test-1996,"The sole exception was Wimbledon, where she pl...",She,479,Goolagong Cawley,400,True,Peggy Michel,432,False,http://en.wikipedia.org/wiki/Evonne_Goolagong_...,True,False
1996,test-1997,"According to news reports, both Moore and Fily...",her,338,Esther Sheryl Wood,263,True,Barbara Morgan,404,False,http://en.wikipedia.org/wiki/Hastings_Arthur_Wise,True,False
1997,test-1998,"In June 2009, due to the popularity of the Sab...",She,328,Kayla,364,True,Natasha Henstridge,412,False,http://en.wikipedia.org/wiki/Raya_Meddine,False,False
1998,test-1999,She was delivered to the Norwegian passenger s...,she,305,Irma,255,True,Bergen,274,False,http://en.wikipedia.org/wiki/SS_Irma_(1905),True,False


In [140]:
total_precision_scores = []
total_recall_scores = []
total_f1_scores = []
male_precision_scores = []
male_recall_scores = []
male_f1_scores = []

female_precision_scores = []
female_recall_scores = []
female_f1_scores = []

for mentions, true_clusters, pred_clusters, gender in zip(mentions_list, true_clusters_list, pred_clusters_list, gender_list):
    # print(f"Mentions {mentions}, true clusters {true_clusters}, predicted clusters {pred_clusters}")
    try:
        precision_scores, recall_scores, f1_scores = B3_0(mentions, true_clusters, pred_clusters)
        total_precision_scores += precision_scores
        total_recall_scores += recall_scores
        total_f1_scores += f1_scores

        if gender == 'M':
            male_precision_scores.extend(precision_scores)
            male_recall_scores.extend(recall_scores)
            male_f1_scores.extend(f1_scores)
        elif gender == 'F':
            female_precision_scores.extend(precision_scores)
            female_recall_scores.extend(recall_scores)
            female_f1_scores.extend(f1_scores)
    except:
        print(f"Mentions {mentions}, true clusters {true_clusters}, predicted clusters {pred_clusters}")

In [141]:
global_precision, global_recall, global_f1 = global_B3_0(total_precision_scores, total_recall_scores, total_f1_scores)

# format the results 
results_str = f"{gap_file} Global B3 Scores:\n"
results_str += f"Precision: {global_precision:.2f}\n"
results_str += f"Recall: {global_recall:.2f}\n"
results_str += f"F1 Score: {global_f1:.2f}"

print(results_str)

# save results to a text file
with open(gap_file+"_global_B3_scores.txt", "w") as f:
    f.write(results_str)

gap-test.tsv Global B3 Scores:
Precision: 0.60
Recall: 0.61
F1 Score: 0.59


In [142]:

# global scores
global_male_precision, global_male_recall, global_male_f1 = global_B3_0(male_precision_scores, male_recall_scores, male_f1_scores)
global_female_precision, global_female_recall, global_female_f1 = global_B3_0(female_precision_scores, female_recall_scores, female_f1_scores)

#F/M ratio for each metric
precision_ratio = global_female_precision / global_male_precision
recall_ratio = global_female_recall / global_male_recall
f1_ratio = global_female_f1 / global_male_f1

results_str = f"Global B3 Scores:\n"
results_str += f"Male Precision: {global_male_precision:.2f}, Female Precision: {global_female_precision:.2f}, Precision Ratio (F/M): {precision_ratio:.2f}\n"
results_str += f"Male Recall: {global_male_recall:.2f}, Female Recall: {global_female_recall:.2f}, Recall Ratio (F/M): {recall_ratio:.2f}\n"
results_str += f"Male F1: {global_male_f1:.2f}, Female F1: {global_female_f1:.2f}, F1 Ratio (F/M): {f1_ratio:.2f}"

print(results_str)

Global B3 Scores:
Male Precision: 0.60, Female Precision: 0.60, Precision Ratio (F/M): 1.00
Male Recall: 0.61, Female Recall: 0.61, Recall Ratio (F/M): 1.01
Male F1: 0.58, Female F1: 0.59, F1 Ratio (F/M): 1.01
