In [1]:
import re
from collections import Counter

In [12]:
def B3_0(mentions, true_clusters, pred_clusters):
  '''
  Calculates precision, recall, and optionally F1 for the  B3(0) metric,
  based on formulation in https://aclanthology.org/W10-4305.pdf

  returns precision, recall and f1 as lists for the input sentence
  '''

  precision_scores = []
  recall_scores = []
  f1_scores = []

  for mention in mentions:
    precision = 0
    recall = 0

    # finding key and response clusters to look at (first cluster to come up that contains current mention)
    mention_key_cluster = None
    for cluster in true_clusters:
      if mention in cluster:
        mention_key_cluster = cluster
        break
    assert mention_key_cluster, "At least one true cluster must contain mention!"

    mention_pred_cluster = None
    for cluster in pred_clusters:
      if mention in cluster:
        mention_response_cluster = cluster
        break

    intersection_key_response = list((Counter(mention_key_cluster) & Counter(mention_response_cluster)).elements())
    overlap_count = len(intersection_key_response)

    # response cluster could be empty if mention was not predicted for any cluster (twinless mention); in this case precision and recall both at 0
    if mention_response_cluster:
      precision = overlap_count / len(mention_response_cluster)
      recall = overlap_count / len(mention_key_cluster)

    precision_scores.append(precision)
    recall_scores.append(recall)
# Check for zero division
    if precision + recall != 0:
        f1_scores.append((2 * precision * recall) / (precision + recall))
    else:
        f1_scores.append(0.0)
  return precision_scores, recall_scores, f1_scores
def global_B3_0(precision_scores, recall_scores, F1_scores):
  '''
  Calculates global precision, recall and F1 scores based on lists of
  individual B3_0 precision/recall/F1 scores per mention
  '''

  B3_0_precision = sum(precision_scores)/len(precision_scores)
  B3_0_recall = sum(recall_scores)/len(recall_scores)
  B3_0_F1 = sum(F1_scores)/len(F1_scores)

  return B3_0_precision, B3_0_recall, B3_0_F1

In [34]:
import pandas as pd

# Dummy GAP DataFrame
gap_df = pd.DataFrame({
    'ID': ['test-1', 'test-2'],
    'Pronoun': ['he', 'she'],
    'A': ['John', 'Anna'],
    'B': ['Mike', 'Emily']
})

# Dummy Predictions DataFrame
predictions_df = pd.DataFrame({
    'ID': ['test-1', 'test-2'],
    'A-coref': [False, True],
    'B-coref': [True, False]
})

# Merge the two DataFrames on the "ID" column
merged_df = pd.merge(gap_df, predictions_df, on='ID')


In [53]:
def generate_clusters_and_mentions(merged_df):
    # Initialize lists to store true and predicted clusters, and mentions for each sentence
    true_clusters_list = []
    pred_clusters_list = []
    mentions_list = []

    # Iterate through the merged DataFrame to generate clusters and mentions
    for _, row in merged_df.iterrows():
        pronoun = row['Pronoun']
        option_a = row['A']
        option_b = row['B']
        a_coref = row['A-coref']
        b_coref = row['B-coref']

        # Mentions for this instance
        mentions = [pronoun, option_a, option_b]

        # True clusters (for demonstration, assuming pronoun co-refers with both options)
        true_cluster = [[pronoun, option_a], [option_b]]

        # Predicted clusters based on model output
        pred_cluster = []
        if a_coref:
            pred_cluster.append([pronoun, option_a])
        elif b_coref:
            pred_cluster.append([pronoun, option_b])
        else:
            pred_cluster.append([pronoun])

        true_clusters_list.append(true_cluster)
        pred_clusters_list.append(pred_cluster)
        mentions_list.append(mentions)

    return true_clusters_list, pred_clusters_list, mentions_list

# Use the function
true_clusters_list, pred_clusters_list, mentions_list = generate_clusters_and_mentions(merged_df)

# Now you can run your B3_0 function for each sentence
for mentions, true_clusters, pred_clusters in zip(mentions_list, true_clusters_list, pred_clusters_list):
    precision_scores, recall_scores, f1_scores = B3_0(mentions, true_clusters, pred_clusters)
    print(f"For mentions {mentions}:")
    print(f"Precision: {precision_scores}, Recall: {recall_scores}, F1: {f1_scores}")


For mentions ['His', 'Bob Suter', 'Dehner']:
Precision: [0.5, 0.5, 0.5], Recall: [0.5, 0.5, 1.0], F1: [0.5, 0.5, 0.6666666666666666]
For mentions ['him', 'Alonso', 'Alfredo Di St*fano']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['He', 'Ali Aladhadh', 'Saddam']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['his', 'Alliata', 'Pisciotta']:
Precision: [0.5, 0.5, 0.5], Recall: [0.5, 0.5, 1.0], F1: [0.5, 0.5, 0.6666666666666666]
For mentions ['his', 'Eddie', 'Rock Reilly']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['her', 'Jewel Staite', 'Keller']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['She', 'Allison', 'Grace Smythe']:
Precision: [1.0, 1.0, 0.0], Recall: [1.0, 1.0, 0.0], F1: [1.0, 1.0, 0.0]
For mentions ['her', 'Sophie', 'Jeni']:
Precision: [0.5, 0.5, 0.5], Recall: [0.5, 0.5, 1.0], F1: [0.5, 0.5, 0.66666666666666

In [36]:
B3_0(mentions, true_clusters, pred_clusters)

([1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0])

In [37]:
gap_file = "gap-test.tsv"
predictions_file = "predictions-test.tsv"

In [38]:
gap_df.head(1)

Unnamed: 0,ID,Pronoun,A,B
0,test-1,he,John,Mike


In [54]:

# Reading GAP DataFrame from a TSV file
gap_df = pd.read_csv(gap_file, sep='\t')
gap_df.rename(columns={'A-coref': 'gold_A-coref', 'B-coref': 'gold_B-coref'},inplace=True)# Reading Predictions DataFrame from a TSV file
predictions_df = pd.read_csv(predictions_file, sep='\t')

# Merge the two DataFrames on the "ID" column
merged_df = pd.merge(gap_df, predictions_df, on='ID')

# Display the first few rows to check
merged_df.head(1)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,gold_A-coref,B,B-offset,gold_B-coref,URL,A-coref,B-coref
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner,False,True


In [65]:
true_clusters_list, pred_clusters_list, mentions_list = generate_clusters_and_mentions(merged_df)
# B3_0(mentions_list, true_clusters_list, pred_clusters_list)

In [66]:
total_precision_scores = []
total_recall_scores = []
total_f1_scores = []
# Now you can run your B3_0 function for each sentence
for mentions, true_clusters, pred_clusters in zip(mentions_list, true_clusters_list, pred_clusters_list):
    # print(f"Mentions {mentions}, true clusters {true_clusters}, predicted clusters {pred_clusters}")
    try:
        precision_scores, recall_scores, f1_scores = B3_0(mentions, true_clusters, pred_clusters)
        total_precision_scores += precision_scores
        total_recall_scores += recall_scores
        total_f1_scores += f1_scores
    except:
        print(f"Mentions {mentions}, true clusters {true_clusters}, predicted clusters {pred_clusters}")

In [72]:
global_precision, global_recall, global_f1 = global_B3_0(total_precision_scores, total_recall_scores, total_f1_scores)

# Format the results as a string
results_str = f"{gap_file} Global B3 Scores:\n"
results_str += f"Precision: {global_precision:.2f}\n"
results_str += f"Recall: {global_recall:.2f}\n"
results_str += f"F1 Score: {global_f1:.2f}"

# Print and share the results
print(results_str)

# If you want to save the results to a text file
with open(gap_file+"_global_B3_scores.txt", "w") as f:
    f.write(results_str)

gap-test.tsv Global B3 Scores:
Precision: 0.60
Recall: 0.61
F1 Score: 0.59
