# SpanBert GAP Evaluation

- We load the spanbert model or download it if its missing. 
- We do data processing for GAP 
- Generate predictions for GAP data 
- Use official scorer from GAP repository to compute F1 and accuracy. 

In [17]:
import os
import pandas as pd
from allennlp.predictors.predictor import Predictor
from gap_scorer import run_scorer  
from tqdm import tqdm
import torch
# Check for CUDA
if torch.cuda.is_available():
    print("CUDA available, predictions will be faster.")
else:
    print("CUDA not available, predictions may be slower.")
    

CUDA available, predictions will be faster.


### Load model or download if needed

In [18]:
# Initialize the SpanBERT predictor
import os
import urllib.request

# Path to the file
path_spanbert = 'spanbert_local/'
filename = "coref-spanbert-large-2021.03.10.tar.gz"


save_path = os.path.join(path_spanbert, filename)
if os.path.exists(save_path):
    predictor = Predictor.from_path(save_path,cuda_device=0)
else:
        # Create directory if it doesn't exist
    if not os.path.exists(path_spanbert):
        os.makedirs(path_spanbert)
        # Full path to save the file

    # Download the file
    urllib.request.urlretrieve("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz", save_path)
    predictor = Predictor.from_path(save_path,cuda_device=0)

    # predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz",cuda_device=0) #use this if urllib didnt work

error loading _jsonnet (this is expected on Windows), treating C:\Users\pc-bae-2\AppData\Local\Temp\tmpy_0x34aq\config.json as plain json


### Read data
Data is in the same folder as notebook so should work as is

In [19]:

# Read GAP data
main_path = "../gap-coreference-master/gap-coreference-master"  
main_path = "." 
gap_file =  "gap-test.tsv"
# gap_file = "gap-test-gn.tsv"
gap_development_path = os.path.join(main_path,gap_file)
gap_df = pd.read_csv(gap_development_path, delimiter='\t')
print(gap_df[:5])


       ID                                               Text Pronoun  \
0  test-1  Upon their acceptance into the Kontinental Hoc...     His   
1  test-2  Between the years 1979-1981, River won four lo...     him   
2  test-3  Though his emigration from the country has aff...      He   
3  test-4  At the trial, Pisciotta said: ``Those who have...     his   
4  test-5  It is about a pair of United States Navy shore...     his   

   Pronoun-offset             A  A-offset  A-coref                   B  \
0             383     Bob Suter       352    False              Dehner   
1             430        Alonso       353     True  Alfredo Di St*fano   
2             312  Ali Aladhadh       256     True              Saddam   
3             526       Alliata       377    False           Pisciotta   
4             406         Eddie       421     True         Rock Reilly   

   B-offset  B-coref                                             URL  
0       366     True      http://en.wikipedia.org/w

### Model output example
We compute an example and process it to visualize results

In [20]:
# Sample output from AllenNLP coref model
example = gap_df.iloc[2]
text = example['Text']
print(example)
result = predictor.predict(document=text)
print(result['clusters'])
# Initialize an empty list with placeholders
text_visual = ['_'] * len(result['document'])

# Fill in the placeholders with tokens
for i, token in enumerate(result['document']):
    text_visual[i] = token

# Add brackets for coref clusters
for cluster in result['clusters']:
    for start, end in cluster:
        text_visual[start] = '[' + text_visual[start]
        text_visual[end] = text_visual[end] + ']'

# Combine into a string
text_visual_str = ' '.join(text_visual)
print(text)
print(text_visual_str)


ID                                                           test-3
Text              Though his emigration from the country has aff...
Pronoun                                                          He
Pronoun-offset                                                  312
A                                                      Ali Aladhadh
A-offset                                                        256
A-coref                                                        True
B                                                            Saddam
B-offset                                                        295
B-coref                                                       False
URL                           http://en.wikipedia.org/wiki/Aladhadh
Name: 2, dtype: object
[[[1, 1], [8, 8], [12, 12]], [[19, 20], [39, 40]], [[32, 34], [42, 47], [49, 50], [61, 61], [65, 65], [69, 69]]]
Though his emigration from the country has affected his leadership status, Kamel is still a respected elder of the c

In [21]:
# Extract words that correspond to each cluster
def extract_cluster_words(tokenized_document, clusters):
    total_clusters = []
    for cluster in clusters:
        txt_cluster = []
        for start, end in cluster:
            cluster_tokens = tokenized_document[start:end+1]
            txt_cluster.append(" ".join(cluster_tokens))
        total_clusters.append(txt_cluster)
    return total_clusters

# Example 
tokenized_document = ['It', 'was', 'reported', 'that', 'John', 'and', 'Jane', 'were', 'together', '.', 'He', 'said', 'it', 'was', 'true', '.']
clusters = [[[4, 4], [10, 10]], [[5, 5], [12, 12]]]
print(result['clusters'])
print(extract_cluster_words(result['document'], result['clusters']))


[[[1, 1], [8, 8], [12, 12]], [[19, 20], [39, 40]], [[32, 34], [42, 47], [49, 50], [61, 61], [65, 65], [69, 69]]]
[['his', 'his', 'Kamel'], ['the clan', 'the clan'], ['Dr. Ali Aladhadh', "A contributor to Iraq 's liberation", 'Ali Aladhadh', 'He', 'his', 'his']]


### Evaluation loop
Here is where we loop through the data and generate the predictions

In [22]:
# Create batches - not working ngl
batch_size = 16 
num_batches = len(gap_df) // batch_size + (1 if len(gap_df) % batch_size != 0 else 0)

predictions = []
num_rows = len(gap_df)
num_batches = (num_rows + batch_size - 1) // batch_size
for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_rows)
        batch = gap_df.iloc[start_idx:end_idx]
        for _, row in  batch.iterrows():
            # print(row)
            text = row['Text']
            result = predictor.predict(document=text)
            clusters = result['clusters']
            tokens = result['document']

            # Initialize coreference indicators for A and B to 0
            a_coref, b_coref = 0, 0

            # Find the cluster containing the pronoun
            pronoun_offset = row['Pronoun-offset']
            pronoun_length = len(row['Pronoun'])
            pronoun_cluster = None

            # Calculate the character offsets for each token
            char_offsets = []
            offset = 0
            token_idx = 0
            
            tokens_dict = {}
            for i, char in enumerate(text):
                if char == ' ':
                    continue
                if text[i:i+len(tokens[token_idx])] == tokens[token_idx]:
                    start_offset = i
                    end_offset = i + len(tokens[token_idx]) - 1
                    char_offsets.append((start_offset, end_offset))
                    tokens_dict[ (start_offset, end_offset)] =tokens[token_idx]

                    i = end_offset
                    token_idx += 1
                    if token_idx >= len(tokens):
                        break
                    
            # print(tokens_dict)
            # print('clusters',clusters)
            # clusters = sorted(clusters, key=len) #Sort base on how cluster size, smaller is more important!
            
            # print('sorted clusters',clusters)
            pronoun_clusters = []
            for cluster in clusters:
                for start, end in cluster:
                    # print(char_offsets[start])
                    try:
                        start_offset, end_offset = char_offsets[start]
                        if start_offset <= pronoun_offset and end_offset >= (pronoun_offset + pronoun_length - 1):
                            pronoun_cluster = cluster
                            # print("Pronoun cluster:",cluster)
                            pronoun_clusters.append(pronoun_cluster)
                        # break
                    except Exception as e:
                        print("Warning - Error with selecting clusters, gonna ignore it and continue but be ware")
                        print(e)
                        continue
                # if pronoun_cluster:
                #     break
            # print(extract_cluster_words(tokens,[pronoun_cluster]))
            # Check if 'A' or 'B' is in the same cluster as the pronoun
            for pronoun_cluster in pronoun_clusters:
                a_start, a_end = row['A-offset'], row['A-offset'] + len(row['A']) - 1
                b_start, b_end = row['B-offset'], row['B-offset'] + len(row['B']) - 1
                # print("A goal:",a_start,a_end)
                # print("B goal:",b_start,b_end)
                for start, end in pronoun_cluster:
                    # print("start,end",start,end)
                    start_offset, _ = char_offsets[start]
                    _, end_offset =char_offsets[end]
                    # print("start,end char:", start_offset, end_offset)
                    if start_offset <= a_start and end_offset >= a_end and b_coref == 0:
                        a_coref = 1
                        break
                    if start_offset <= b_start and end_offset >= b_end and a_coref == 0:
                        b_coref = 1
                        break

            predictions.append({
                'ID': row['ID'],
                'A-coref': a_coref,
                'B-coref': b_coref
            })
            # print(predictions)
            # break


Processing batches:  83%|████████▎ | 104/125 [01:51<00:21,  1.02s/it]

list index out of range
list index out of range


Processing batches: 100%|██████████| 125/125 [02:12<00:00,  1.06s/it]


## Create Scores
Here we compute the f1 scores using gap_scorer.py 

In [23]:
# Predictions have to be in shape: \\
#        ID  A-coref  B-coref \\
# 0  test-1        0        1 \\
# 1  test-2        1        0 \\

In [24]:
import pandas as pd
golden_path = gap_file #this should match with the file we used to generate predictions on top of notebook
# golden_path = 'gap-development.tsv'
# golden_path = 'gap-validation.tsv'
# golden_path = 'gap-test.tsv' 
predictions_path = golden_path.replace('gap','predictions')
# predictions_path = 'predictions-test.tsv'


In [25]:
# dictnary to df
predictions_df = pd.DataFrame(predictions)
print(predictions_df[:5])
# Convert 1 to True and 0 to False in the A-coref and B-coref columns
predictions_df['A-coref'] = predictions_df['A-coref'].astype(bool)
predictions_df['B-coref'] = predictions_df['B-coref'].astype(bool)
# DataFrame to a TSV file
predictions_df.to_csv(predictions_path, sep='\t', index=False)

       ID  A-coref  B-coref
0  test-1        0        1
1  test-2        1        0
2  test-3        1        0
3  test-4        0        1
4  test-5        1        0


In [26]:
from gap_scorer import run_scorer  
scores = run_scorer(golden_path, predictions_path)

print(scores)

with open("scores_"+predictions_path.replace('.tsv',".txt"), "w") as f:
    # json.dump(scores, f, indent=4)
    f.writelines(scores)


Unexpected label! A-coref
Unexpected label! B-coref
Overall recall: 85.4 precision: 90.8 f1: 88.0
		tp 1514	fp 154
		fn 259	tn 2073
Masculine recall: 86.4 precision: 93.3 f1: 89.7
		tp 768	fp 55
		fn 121	tn 1056
Feminine recall: 84.4 precision: 88.3 f1: 86.3
		tp 746	fp 99
		fn 138	tn 1017
Bias (F/M): 0.96



## Debugging stuff

In [27]:
clu = []
tks = result['document']
total_clusters = []
for cluster in result['clusters']:
    print('New Cluster')
    
    txt_cluster = []
    for start, end in cluster:
        print(start,end)
        print(tks[start:end+1])
        txt_cluster.append(" ".join(tks[start:end+1]) )
    total_clusters.append(txt_cluster)
print(total_clusters)

    

New Cluster
0 2
['Meg', 'and', 'Vicky']
14 14
['their']
New Cluster
0 0
['Meg']
24 24
['Meg']
New Cluster
2 2
['Vicky']
26 26
['Vicky']
39 40
['Vicky', 'Austin']
50 50
['her']
[['Meg and Vicky', 'their'], ['Meg', 'Meg'], ['Vicky', 'Vicky', 'Vicky Austin', 'her']]


In [28]:
# Initialize variables
char_offsets = []
offset = 0
token_idx = 0

# Iterate over the original text to calculate character-based offsets
for i, char in enumerate(text):
    # Skip spaces
    if char == ' ':
        continue
    
    # Check if the current character matches the start of the next token
    if text[i:i+len(result['document'][token_idx])] == result['document'][token_idx]:
        start_offset = i
        end_offset = i + len(result['document'][token_idx]) - 1
        char_offsets.append((start_offset, end_offset))
        
        # Move the pointer i to the end of the current token
        i = end_offset
        
        # Move to the next token
        token_idx += 1
        
        # Exit the loop if we've found all tokens
        if token_idx >= len(result['document']):
            break

# Debugging: Print each token next to its offset
count = 0
for (start, end), token in zip(char_offsets, result['document']):
    print(f"Token: {token}, Start Offset: {start}, End Offset: {end}, Count {count}")
    count +=1
    


Token: Meg, Start Offset: 0, End Offset: 2, Count 0
Token: and, Start Offset: 4, End Offset: 6, Count 1
Token: Vicky, Start Offset: 8, End Offset: 12, Count 2
Token: each, Start Offset: 14, End Offset: 17, Count 3
Token: have, Start Offset: 19, End Offset: 22, Count 4
Token: three, Start Offset: 24, End Offset: 28, Count 5
Token: siblings, Start Offset: 30, End Offset: 37, Count 6
Token: ,, Start Offset: 38, End Offset: 38, Count 7
Token: and, Start Offset: 40, End Offset: 42, Count 8
Token: have, Start Offset: 44, End Offset: 47, Count 9
Token: a, Start Offset: 45, End Offset: 45, Count 10
Token: closer, Start Offset: 51, End Offset: 56, Count 11
Token: relationship, Start Offset: 58, End Offset: 69, Count 12
Token: with, Start Offset: 71, End Offset: 74, Count 13
Token: their, Start Offset: 76, End Offset: 80, Count 14
Token: youngest, Start Offset: 82, End Offset: 89, Count 15
Token: brother, Start Offset: 91, End Offset: 97, Count 16
Token: than, Start Offset: 99, End Offset: 102, 

In [29]:
gold_annotations = []

for index, row in gap_df.iterrows():
    gold_annotation = {
        'ID': row['ID'],
        'A-coref': True if row['A-coref'] == True else False,
        'B-coref': True if row['B-coref'] == True else False,
        # Add other fields as needed
    }
    gold_annotations.append(gold_annotation)
gold_annotations[:5]

[{'ID': 'test-1', 'A-coref': False, 'B-coref': True},
 {'ID': 'test-2', 'A-coref': True, 'B-coref': False},
 {'ID': 'test-3', 'A-coref': True, 'B-coref': False},
 {'ID': 'test-4', 'A-coref': False, 'B-coref': True},
 {'ID': 'test-5', 'A-coref': True, 'B-coref': False}]