In [14]:
import pandas as pd
import json
files = ['gap-test.tsv', 'gap-validation.tsv', 'gap-development.tsv']

In [12]:
def convert_to_jsonl(file):    
    # Load the GAP dataset into a DataFrame
    gap_data = pd.read_csv(file, sep='\t')
    with open(file.replace('.tsv', '.jsonl'), 'w') as jsonl_file:
        for index, row in gap_data.iterrows():
            
            # Extract relevant information from the DataFrame
            text = row['Text']
            pronoun = row['Pronoun']
            pronoun_offset = row['Pronoun-offset']
            a = row['A']
            a_offset = row['A-offset']
            b = row['B']
            b_offset = row['B-offset']
            # Initialize clusters (AllenNLP expects clusters even if empty)
            clusters = []
            # Assuming your GAP dataset labels the true antecedent with either 'A' or 'B'            
            if row['A']:            
                clusters.append([[a_offset, a_offset + len(a) - 1], [pronoun_offset, pronoun_offset + len(pronoun) - 1]])
            elif row['B']:
                clusters.append([[b_offset, b_offset + len(b) - 1], [pronoun_offset, pronoun_offset + len(pronoun) - 1]])
            # Create a JSON object for the instance
            instance = {
                "text": text,
                "clusters": clusters
            }
            
            # Write the JSON object to the JSONL file
            jsonl_file.write(json.dumps(instance) + '\n')




In [15]:
for file in files:
    convert_to_jsonl(file)

### Check that the file has the right offsets

In [11]:
#check it worked
import json

# Open the JSONL file and read lines
with open('gap-development.jsonl', 'r') as jsonl_file:
    lines = jsonl_file.readlines()

# Loop through each line (JSON object)
for line in lines:
    instance = json.loads(line.strip())
    
    # Extract text and clusters
    text = instance['text']
    clusters = instance['clusters']
    
    # Check each cluster
    for cluster in clusters:
        antecedent_span = cluster[0]
        pronoun_span = cluster[1]
        
        # Extract the substring using the offsets
        antecedent_text = text[antecedent_span[0]:antecedent_span[1] + 1]
        pronoun_text = text[pronoun_span[0]:pronoun_span[1] + 1]
        
        print(f"Antecedent span: {antecedent_span}, Text: {antecedent_text} - Pronoun span: {pronoun_span}, Text: {pronoun_text}")
        # print()
        
        # Optionally, you can add assertions to make sure the text matches your expectations
        # assert antecedent_text == 'Expected_Antecedent_Text'
        # assert pronoun_text == 'Expected_Pronoun_Text'


Antecedent span: [191, 204], Text: Cheryl Cassidy - Pronoun span: [274, 276], Text: her
Antecedent span: [228, 236], Text: MacKenzie - Pronoun span: [284, 286], Text: His
Antecedent span: [173, 179], Text: Angeloz - Pronoun span: [265, 267], Text: his
Antecedent span: [174, 177], Text: Hell - Pronoun span: [321, 323], Text: his
Antecedent span: [219, 235], Text: Kitty Oppenheimer - Pronoun span: [437, 439], Text: She
Antecedent span: [236, 242], Text: Collins - Pronoun span: [411, 413], Text: She
Antecedent span: [152, 160], Text: Reb Asher - Pronoun span: [273, 275], Text: his
Antecedent span: [173, 180], Text: Greg Kot - Pronoun span: [337, 339], Text: his
Antecedent span: [255, 264], Text: Mary Paine - Pronoun span: [246, 248], Text: her
Antecedent span: [196, 213], Text: Christina Jennings - Pronoun span: [329, 331], Text: she
Antecedent span: [168, 183], Text: Peter Mansbridge - Pronoun span: [300, 302], Text: his
Antecedent span: [217, 223], Text: Eleanor - Pronoun span: [304, 30