In [1]:
import os
import pandas as pd
from allennlp.predictors.predictor import Predictor
from gap_scorer import run_scorer  
from tqdm import tqdm
import torch
# Check for CUDA
if torch.cuda.is_available():
    print("CUDA available, predictions will be faster.")
else:
    print("CUDA not available, predictions may be slower.")
    

CUDA available, predictions will be faster.


In [2]:
# Initialize the SpanBERT predictor
import os
import urllib.request

# Path to the file
path_spanbert = 'spanbert_local/'
filename = "coref-spanbert-large-2021.03.10.tar.gz"


save_path = os.path.join(path_spanbert, filename)
if os.path.exists(save_path):
    predictor = Predictor.from_path(save_path,cuda_device=0)
else:
        # Create directory if it doesn't exist
    if not os.path.exists(path_spanbert):
        os.makedirs(path_spanbert)
        # Full path to save the file

    # Download the file
    urllib.request.urlretrieve("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz", save_path)
    predictor = Predictor.from_path(save_path,cuda_device=0)

    # predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz",cuda_device=0) #use this if urllib didnt work

error loading _jsonnet (this is expected on Windows), treating C:\Users\pc-bae-2\AppData\Local\Temp\tmp5catso87\config.json as plain json
Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _C._set_default_tensor_type(t)


In [15]:

# Read GAP data
main_path = "../gap-coreference-master/gap-coreference-master"  
main_path = "." 
gap_file =  "gap-test.tsv"
gap_development_path = os.path.join(main_path,gap_file)
gap_df = pd.read_csv(gap_development_path, delimiter='\t')
print(gap_df[:5])


             ID                                               Text Pronoun  \
0  validation-1  He admitted making four trips to China and pla...     him   
1  validation-2  Kathleen Nott was born in Camberwell, London. ...     She   
2  validation-3  When she returns to her hotel room, a Liberian...     his   
3  validation-4  On 19 March 2007, during a campaign appearance...      he   
4  validation-5  By this time, Karen Blixen had separated from ...     she   

   Pronoun-offset                   A  A-offset  A-coref              B  \
0             256  Jose de Venecia Jr       208    False         Abalos   
1             185               Ellen       110    False       Kathleen   
2             435     Jason Scott Lee       383    False          Danny   
3             333           Reucassel       300     True         Debnam   
4             427        Finch Hatton       290    False  Beryl Markham   

   B-offset  B-coref                                                URL  
0     

In [16]:
# Sample output from AllenNLP coref model
example = gap_df.iloc[2]
text = example['Text']
print(example)
result = predictor.predict(document=text)
print(result['clusters'])
# Initialize an empty list with placeholders
text_visual = ['_'] * len(result['document'])

# Fill in the placeholders with tokens
for i, token in enumerate(result['document']):
    text_visual[i] = token

# Add brackets for coref clusters
for cluster in result['clusters']:
    for start, end in cluster:
        text_visual[start] = '[' + text_visual[start]
        text_visual[end] = text_visual[end] + ']'

# Combine into a string
text_visual_str = ' '.join(text_visual)
print(text)
print(text_visual_str)


ID                                                     validation-3
Text              When she returns to her hotel room, a Liberian...
Pronoun                                                         his
Pronoun-offset                                                  435
A                                                   Jason Scott Lee
A-offset                                                        383
A-coref                                                       False
B                                                             Danny
B-offset                                                        406
B-coref                                                        True
URL               http://en.wikipedia.org/wiki/Hawaii_Five-0_(20...
Name: 2, dtype: object
[[[1, 1], [4, 4], [16, 16], [41, 41], [45, 45]], [[48, 49], [59, 60]], [[32, 37], [65, 65]], [[19, 25], [68, 70]], [[85, 85], [90, 90]], [[52, 53], [95, 95]]]
When she returns to her hotel room, a Liberian man (Tony Todd) forces 

In [17]:
# Define the function to extract words that correspond to each cluster
def extract_cluster_words(tokenized_document, clusters):
    total_clusters = []
    for cluster in clusters:
        txt_cluster = []
        for start, end in cluster:
            cluster_tokens = tokenized_document[start:end+1]
            txt_cluster.append(" ".join(cluster_tokens))
        total_clusters.append(txt_cluster)
    return total_clusters

# Example 
tokenized_document = ['It', 'was', 'reported', 'that', 'John', 'and', 'Jane', 'were', 'together', '.', 'He', 'said', 'it', 'was', 'true', '.']
clusters = [[[4, 4], [10, 10]], [[5, 5], [12, 12]]]
print(result['clusters'])
print(extract_cluster_words(result['document'], result['clusters']))


[[[1, 1], [4, 4], [16, 16], [41, 41], [45, 45]], [[48, 49], [59, 60]], [[32, 37], [65, 65]], [[19, 25], [68, 70]], [[85, 85], [90, 90]], [[52, 53], [95, 95]]]
[['she', 'her', 'her', 'She', 'she'], ['the flight', 'the plane'], ['fellow fight attendant and friend Angela', 'Angela'], ['$ 20 million worth of conflict diamonds', 'the confiscated diamonds'], ['Danny', 'his'], ['the team', 'Five-0']]


In [18]:
# Create batches - not working ngl
batch_size = 16 
num_batches = len(gap_df) // batch_size + (1 if len(gap_df) % batch_size != 0 else 0)

predictions = []
num_rows = len(gap_df)
num_batches = (num_rows + batch_size - 1) // batch_size
for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_rows)
        batch = gap_df.iloc[start_idx:end_idx]
        for _, row in  batch.iterrows():
            # print(row)
            text = row['Text']
            result = predictor.predict(document=text)
            clusters = result['clusters']
            tokens = result['document']

            # Initialize coreference indicators for A and B to 0
            a_coref, b_coref = 0, 0

            # Find the cluster containing the pronoun
            pronoun_offset = row['Pronoun-offset']
            pronoun_length = len(row['Pronoun'])
            pronoun_cluster = None

            # Calculate the character offsets for each token
            char_offsets = []
            offset = 0
            token_idx = 0
            
            tokens_dict = {}
            for i, char in enumerate(text):
                if char == ' ':
                    continue
                if text[i:i+len(tokens[token_idx])] == tokens[token_idx]:
                    start_offset = i
                    end_offset = i + len(tokens[token_idx]) - 1
                    char_offsets.append((start_offset, end_offset))
                    tokens_dict[ (start_offset, end_offset)] =tokens[token_idx]

                    i = end_offset
                    token_idx += 1
                    if token_idx >= len(tokens):
                        break
                    
            # print(tokens_dict)
            # print('clusters',clusters)
            # clusters = sorted(clusters, key=len) #Sort base on how cluster size, smaller is more important!
            
            # print('sorted clusters',clusters)
            pronoun_clusters = []
            for cluster in clusters:
                for start, end in cluster:
                    # print(char_offsets[start])
                    try:
                        start_offset, end_offset = char_offsets[start]
                        if start_offset <= pronoun_offset and end_offset >= (pronoun_offset + pronoun_length - 1):
                            pronoun_cluster = cluster
                            # print("Pronoun cluster:",cluster)
                            pronoun_clusters.append(pronoun_cluster)
                        # break
                    except Exception as e:
                        print("Warning - Error with selecting clusters, gonna ignore it and continue but be ware")
                        print(e)
                        continue
                # if pronoun_cluster:
                #     break
            # print(extract_cluster_words(tokens,[pronoun_cluster]))
            # Check if 'A' or 'B' is in the same cluster as the pronoun
            for pronoun_cluster in pronoun_clusters:
                a_start, a_end = row['A-offset'], row['A-offset'] + len(row['A']) - 1
                b_start, b_end = row['B-offset'], row['B-offset'] + len(row['B']) - 1
                # print("A goal:",a_start,a_end)
                # print("B goal:",b_start,b_end)
                for start, end in pronoun_cluster:
                    # print("start,end",start,end)
                    start_offset, _ = char_offsets[start]
                    _, end_offset =char_offsets[end]
                    # print("start,end char:", start_offset, end_offset)
                    if start_offset <= a_start and end_offset >= a_end and b_coref == 0:
                        a_coref = 1
                        break
                    if start_offset <= b_start and end_offset >= b_end and a_coref == 0:
                        b_coref = 1
                        break

            predictions.append({
                'ID': row['ID'],
                'A-coref': a_coref,
                'B-coref': b_coref
            })
            # print(predictions)
            # break


Processing batches: 100%|██████████| 29/29 [00:29<00:00,  1.02s/it]


## Create Scores
Here we compute the f1 scores using gap_scorer.py 

In [19]:
# Predictions have to be in shape: \\
#        ID  A-coref  B-coref \\
# 0  test-1        0        1 \\
# 1  test-2        1        0 \\

In [20]:
import pandas as pd
golden_path = gap_file #this should match with the file we used to generate predictions
# golden_path = 'gap-development.tsv'
# golden_path = 'gap-validation.tsv'
# golden_path = 'gap-test.tsv' 
predictions_path = golden_path.replace('gap','predictions')
# predictions_path = 'predictions-test.tsv'


In [21]:
# Convert the predictions dictionary to a DataFrame
predictions_df = pd.DataFrame(predictions)
print(predictions_df[:5])
# Convert 1 to True and 0 to False in the A-coref and B-coref columns
predictions_df['A-coref'] = predictions_df['A-coref'].astype(bool)
predictions_df['B-coref'] = predictions_df['B-coref'].astype(bool)
# Save the DataFrame to a TSV file
predictions_df.to_csv(predictions_path, sep='\t', index=False)
# Gold_annotations is a list of ground truth annotations from GAP
# Run the scorer

             ID  A-coref  B-coref
0  validation-1        1        0
1  validation-2        0        1
2  validation-3        0        1
3  validation-4        1        0
4  validation-5        0        1


In [22]:
from gap_scorer import run_scorer  
scores = run_scorer(golden_path, predictions_path)

print(scores)

with open("scores_"+predictions_path.replace('.tsv',".txt"), "w") as f:
    # json.dump(scores, f, indent=4)
    f.writelines(scores)


Unexpected label! A-coref
Unexpected label! B-coref
Overall recall: 82.4 precision: 86.4 f1: 84.3
		tp 323	fp 51
		fn 69	tn 465
Masculine recall: 78.7 precision: 81.3 f1: 80.0
		tp 148	fp 34
		fn 40	tn 232
Feminine recall: 85.8 precision: 91.1 f1: 88.4
		tp 175	fp 17
		fn 29	tn 233
Bias (F/M): 1.10



## Debugging stuff

In [23]:
clu = []
tks = result['document']
total_clusters = []
for cluster in result['clusters']:
    print('New Cluster')
    
    txt_cluster = []
    for start, end in cluster:
        print(start,end)
        print(tks[start:end+1])
        txt_cluster.append(" ".join(tks[start:end+1]) )
    total_clusters.append(txt_cluster)
print(total_clusters)

    

New Cluster
2 3
['Vassey', "'s"]
8 8
['him']
33 33
['Vassey']
36 36
['his']
39 39
['Vassey']
45 45
['Vassey']
53 53
['him']
63 64
['Vassey', "'s"]
New Cluster
18 18
['Denton']
26 26
['his']
31 31
['he']
49 49
['Denton']
New Cluster
12 12
['Coakley']
26 29
['his', 'ex', '-', 'wife']
[["Vassey 's", 'him', 'Vassey', 'his', 'Vassey', 'Vassey', 'him', "Vassey 's"], ['Denton', 'his', 'he', 'Denton'], ['Coakley', 'his ex - wife']]


In [24]:
# Initialize variables
char_offsets = []
offset = 0
token_idx = 0

# Iterate over the original text to calculate character-based offsets
for i, char in enumerate(text):
    # Skip spaces
    if char == ' ':
        continue
    
    # Check if the current character matches the start of the next token
    if text[i:i+len(result['document'][token_idx])] == result['document'][token_idx]:
        start_offset = i
        end_offset = i + len(result['document'][token_idx]) - 1
        char_offsets.append((start_offset, end_offset))
        
        # Move the pointer i to the end of the current token
        i = end_offset
        
        # Move to the next token
        token_idx += 1
        
        # Exit the loop if we've found all tokens
        if token_idx >= len(result['document']):
            break

# Debugging: Print each token next to its offset
count = 0
for (start, end), token in zip(char_offsets, result['document']):
    print(f"Token: {token}, Start Offset: {start}, End Offset: {end}, Count {count}")
    count +=1
    


Token: Pleasant, Start Offset: 0, End Offset: 7, Count 0
Token: explains, Start Offset: 9, End Offset: 16, Count 1
Token: Vassey, Start Offset: 18, End Offset: 23, Count 2
Token: 's, Start Offset: 24, End Offset: 25, Count 3
Token: guilty, Start Offset: 27, End Offset: 32, Count 4
Token: conscience, Start Offset: 34, End Offset: 43, Count 5
Token: may, Start Offset: 45, End Offset: 47, Count 6
Token: lead, Start Offset: 49, End Offset: 52, Count 7
Token: him, Start Offset: 54, End Offset: 56, Count 8
Token: to, Start Offset: 58, End Offset: 59, Count 9
Token: confess, Start Offset: 61, End Offset: 67, Count 10
Token: to, Start Offset: 69, End Offset: 70, Count 11
Token: Coakley, Start Offset: 72, End Offset: 78, Count 12
Token: ., Start Offset: 79, End Offset: 79, Count 13
Token: Pleasant, Start Offset: 81, End Offset: 88, Count 14
Token: promises, Start Offset: 90, End Offset: 97, Count 15
Token: to, Start Offset: 99, End Offset: 100, Count 16
Token: help, Start Offset: 102, End Offse

In [25]:
gold_annotations = []

for index, row in gap_df.iterrows():
    gold_annotation = {
        'ID': row['ID'],
        'A-coref': True if row['A-coref'] == True else False,
        'B-coref': True if row['B-coref'] == True else False,
        # Add other fields as needed
    }
    gold_annotations.append(gold_annotation)
gold_annotations[:5]

[{'ID': 'validation-1', 'A-coref': False, 'B-coref': False},
 {'ID': 'validation-2', 'A-coref': False, 'B-coref': True},
 {'ID': 'validation-3', 'A-coref': False, 'B-coref': True},
 {'ID': 'validation-4', 'A-coref': True, 'B-coref': False},
 {'ID': 'validation-5', 'A-coref': False, 'B-coref': True}]