In [1]:
############################################
## Coreference resolution
############################################

# Step - 1: Installation instructions.
######################################
# 1. Download the code from https://github.com/mandarjoshi90/coref and unzip it
#    (the next cell expects the unzipped folder to be named `coref-master`).
# 2. Download the SpanBERT-large checkpoint (`spanbert_large`) from the same link.

In [1]:
# Step - 2: Necessary installations
######################################

import os

# Enter the unzipped repository only when we are not already inside it, so that
# re-running this cell does not fail looking for coref-master/coref-master.
if os.path.basename(os.getcwd()) != "coref-master":
    os.chdir("coref-master")
print(os.getcwd())

# %pip (rather than `! pip`) installs into the environment of the running kernel.
%pip install pyhocon
%pip install torch

# Remove the -D_GLIBCXX_USE_CXX11_ABI=0 flag from setup_all.sh, then run the script.
# The sed substitution is a no-op when the flag has already been removed.
! sed -i 's/-D_GLIBCXX_USE_CXX11_ABI=0//' setup_all.sh
! ./setup_all.sh

In [3]:
# Step - 3: Setup env variables
######################################

import os

genre = "nw"
model_name = "spanbert_large"

# Exported so that shell commands launched from later cells can read them.
os.environ['data_dir'] = "."
os.environ['CHOSEN_MODEL'] = model_name

# Determine Max Segment
# Scan experiments.conf for the section header of `model_name`, then take the
# first `max_segment_len` entry that follows it.
# NOTE: if the model's own section does not define max_segment_len, the scan
# continues into the following sections and picks up the next value it meets.
max_segment = None
in_model_section = False
with open('experiments.conf') as conf_file:
    for line in conf_file:
        if line.startswith(model_name):
            # Require the section name to end here, so that e.g. "foo" does not
            # match a section called "foo_bar".
            rest = line[len(model_name):].lstrip()
            if rest == "" or rest[0] in "={:":
                in_model_section = True
        elif in_model_section and line.strip().startswith("max_segment_len"):
            max_segment = int(line.strip().split()[-1])
            break

# Fail here with a clear message instead of leaving a non-integer placeholder
# that would only misbehave later, when segments are built.
if max_segment is None:
    raise ValueError(
        "Could not find max_segment_len for model %r in experiments.conf" % (model_name,)
    )

In [2]:
# Step 4 - Convert raw input text to SpanBERT format
####################################################

from bert import tokenization
import json

example_text = [
"Acknowledging the Hurricane Katrina failures, the Bush administration advocated giving federal agencies from the Pentagon to the Department of Justice a greater role in the nation's disaster response playbook.",
"If adopted through both legislation and executive order, the recommendations would reverse some of the steps taken after the Sept. 11 terrorist attacks to centralize responsibility for responding to natural disasters or terrorist attacks at the newly created Department of Homeland Security.",
"And the plan could require the White House to play a larger coordinating role in future disasters.",
"Frances Fragos Townsend, President Bush's domestic security adviser, said that enlisting help from federal agencies made sense.",
"But some critics worry that diffusing responsibilities among agencies could leave no one clearly in charge and not produce results.",
"''This may simply be rearranging the deck chairs on the Titanic,'' said Michael Greenberger, a law professor at the University of Maryland.",
"The Homeland Security Department and its Federal Emergency Management Agency will continue to be the lead federal player in disaster response efforts, according to the blueprint proposed by Ms. Townsend.",
]

# Documents to convert: one dict per document, with an "id" and a "text" list
# holding one string per sentence. Add more entries to convert several documents.
documents = [
    {"id": "sample_0", "text": example_text},
]


def convert_document(doc_id, sentences, tokenizer, max_segment_len, doc_key="nw"):
    """Convert one document (a list of sentence strings) into a SpanBERT input record.

    Args:
        doc_id: Identifier stored under 'doc_id' in the record.
        sentences: Non-empty list of strings, one per sentence. Blank or
            whitespace-only sentences are replaced by ".".
        tokenizer: Object with a ``tokenize(str) -> list of str`` method that
            marks word-continuation pieces with a leading "##".
        max_segment_len: Segment size limit; a new segment is started before a
            sentence that would not fit together with the closing "[SEP]".
        doc_key: Value stored under 'doc_key' in the record.

    Returns:
        A dict with 'sentences' and 'speakers' (one list per segment) and the
        flat 'sentence_map' / 'subtoken_map' lists, which hold one entry per
        subtoken including the "[CLS]" / "[SEP]" markers.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    if not sentences:
        raise ValueError("Document %r has no sentences to convert." % (doc_id,))

    data = {
        'doc_id': doc_id,
        'doc_key': doc_key,
        'sentences': [["[CLS]"]],
        'speakers': [["[SPL]"]],
        'clusters': [],
        'sentence_map': [0],
        'subtoken_map': [0],
    }

    subtoken_num = 0  # index of the whitespace-delimited word currently being emitted
    for sent_num, line in enumerate(sentences):
        # Whitespace-only lines would yield no raw tokens and break the
        # alignment below, so treat them like empty lines.
        if not line.strip():
            line = "."

        raw_tokens = line.split()
        tokens = tokenizer.tokenize(line)

        # Close the current segment when this sentence would not fit, but never
        # close a segment that holds nothing except "[CLS]".
        segment_has_text = len(data['sentences'][-1]) > 1
        if segment_has_text and len(tokens) + len(data['sentences'][-1]) >= max_segment_len:
            data['sentences'][-1].append("[SEP]")
            data['sentences'].append(["[CLS]"])
            data['speakers'][-1].append("[SPL]")
            data['speakers'].append(["[SPL]"])
            # "[SEP]" is attributed to the previous sentence/word, "[CLS]" to the new one.
            data['sentence_map'].append(sent_num - 1)
            data['subtoken_map'].append(subtoken_num - 1)
            data['sentence_map'].append(sent_num)
            data['subtoken_map'].append(subtoken_num)

        # Align subtokens with whitespace-delimited words: consume the current
        # word piece by piece (by length) and move to the next word once it is
        # fully covered.
        ctoken = raw_tokens[0]
        cpos = 0
        for token in tokens:
            data['sentences'][-1].append(token)
            data['speakers'][-1].append("-")
            data['sentence_map'].append(sent_num)
            data['subtoken_map'].append(subtoken_num)

            if token.startswith("##"):
                token = token[2:]
            if len(ctoken) == len(token):
                subtoken_num += 1
                cpos += 1
                if cpos < len(raw_tokens):
                    ctoken = raw_tokens[cpos]
            else:
                ctoken = ctoken[len(token):]

    # Close the last segment. Here `sent_num` is the last sentence already
    # emitted, so the closing "[SEP]" maps to it directly; `subtoken_num` is one
    # past the last word, hence the -1.
    data['sentences'][-1].append("[SEP]")
    data['speakers'][-1].append("[SPL]")
    data['sentence_map'].append(sent_num)
    data['subtoken_map'].append(subtoken_num - 1)
    return data


# Build the tokenizer once; it does not depend on the document being converted.
tokenizer = tokenization.FullTokenizer(vocab_file="cased_config_vocab/vocab.txt", do_lower_case=False)

# `max_segment` and `genre` come from the Step 3 cell.
with open("sample.jsonl", 'w') as fout:
    for index, doc in enumerate(documents):
        data = convert_document(doc["id"], doc["text"], tokenizer, max_segment, doc_key=genre)
        json.dump(data, fout, sort_keys=True)
        fout.write('\n')

        if index % 100 == 0:
            print(index, "samples processed.")

In [3]:
# Step 5 - Run coreference prediction
# Runs predict.py with GPU=0 set in the command's environment. $CHOSEN_MODEL is
# the environment variable exported in Step 3; sample.jsonl is the file written
# in Step 4. sample_out.jsonl is presumably the path predict.py writes its
# predictions to — TODO confirm against predict.py.
!GPU=0 python3 predict.py $CHOSEN_MODEL sample.jsonl sample_out.jsonl