This code is based on the [source](https://www.kaggle.com/sattree/1-coref-visualization-jupyter-allenlp-stanford)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pickle
import ast
from unidecode import unidecode

### Choose one of the parsers (AllenNLP or Stanford)

In [None]:
from allennlp.predictors.predictor import Predictor

# Build the AllenNLP predictor from the published coreference model archive (fetched from this URL).
coref_resol = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")

In [None]:
from stanfordcorenlp import StanfordCoreNLP
from nltk.parse.corenlp import CoreNLPParser

# Directory of the unpacked Stanford CoreNLP distribution and the port used for its server.
STANFORD_CORENLP_PATH = 'stanford-corenlp-full-2018-10-05/'
PORT = 9090
try:
    # Start a CoreNLP server from the local distribution directory.
    server = StanfordCoreNLP(STANFORD_CORENLP_PATH, port=PORT, quiet=True)
except OSError:
    # Starting failed; fall back to connecting to a server that is presumably
    # already listening on the same port.
    print('The port is occupied, probably an instance is already running.')
    server = StanfordCoreNLP('http://localhost', port=PORT, quiet=True)

STANFORD_SERVER_URL = server.url

# NLTK client that talks to the server started (or found) above.
stanford_model = CoreNLPParser(url=STANFORD_SERVER_URL)

# If annotators are not preloaded, stanford model can take a while for the first call and may even timeout
# make a dummy call to the server
try:
    stanford_model.api_call(
        'This is a dummy text.',
        properties={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'},
    )
except Exception as e:
    # The warm-up is best effort, so a failure here must not abort the notebook.
    # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt still works,
    # and report the reason instead of hiding it.
    print('Warm-up call to the Stanford server failed: {!r}'.format(e))

In [None]:
# Load the wiki pages table; the 'id' and 'text' columns are read from it below.
wiki = pd.read_csv("FEVER_data/wiki_pages.csv")

In [None]:
# w:         ASCII-transliterated page id -> page text
# w_decoded: ASCII-transliterated page id -> original page id
w = {}
w_decoded = {}
# Iterate the two columns directly: iterrows() builds a Series per row, which is
# needlessly slow for a large table.
for page_id, page_text in zip(wiki['id'], wiki['text']):
    # Skip rows whose id is missing (NaN/None); they cannot be transliterated.
    if pd.notna(page_id):
        ascii_id = unidecode(page_id)
        w[ascii_id] = page_text
        w_decoded[ascii_id] = page_id
# The dataframe is no longer needed once the lookup dicts are built.
del wiki

### Choose data (dev/test)

In [None]:
# Read the dev split (swap in the commented line below to use the train split instead).
train = pd.read_csv("FEVER_data/shared_task_dev.csv")
#train = pd.read_csv("FEVER_data/train.csv")
# The evidence column holds Python literals serialized as text; parse them back into objects.
train['evidence'] = train['evidence'].apply(ast.literal_eval)
# Pull each column of interest out as a plain array.
claims, evidences, labels, ver = (
    train[column].values
    for column in ('claim', 'evidence', 'label', 'verifiable')
)

In [None]:
# Read the test split; only the claim column is extracted from it here.
train = pd.read_csv("FEVER_data/shared_task_test.csv")
claims = train['claim'].values

In [None]:
# Preview the first rows of whichever split was loaded above.
train.head()

### Create and save coreference lists

In [None]:
# Load the pickled documents collection; it is indexed by claim position and
# iterated for page titles further down.
# NOTE: pickle can execute arbitrary code on load -- only open files you created yourself.
with open('documents_dev.pickle', 'rb') as documents_file:
    documents = pickle.load(documents_file)

In [None]:
# Start with an empty result list; the extraction loop appends one dict per processed index.
coref = []

In [None]:
# Restore results saved by a previous run instead of starting from an empty list.
# NOTE: pickle can execute arbitrary code on load -- only open files you created yourself.
with open('coreferences_dev_stanford.pickle', 'rb') as coref_file:
    coref = pickle.load(coref_file)   # in case of previous runs

In [None]:
# Upper bound (exclusive) on the number of `documents` entries to process.
length = 7000
# Request properties are identical for every call, so build them once.
corenlp_properties = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'}

# Resume right after the entries already present in `coref` (e.g. reloaded from a
# previous run) so that coref[i] always corresponds to documents[i], and never
# index past the end of `documents`.
stop = min(length, len(documents))
for i in tqdm(range(len(coref), stop)):
    # One dict per index: page title -> raw CoreNLP response for that page's text.
    doc_corefs = {}
    coref.append(doc_corefs)
    for title in documents[i]:
        # Convert the title to the escaped, ASCII-transliterated form used as keys of `w`.
        title = unidecode(title.replace('(', '-LRB-').replace(')', '-RRB-').replace(':' , '-COLON-'))
        if title not in w:
            continue
        try:
            doc_corefs[title] = stanford_model.api_call(w[title], properties=corenlp_properties)
        except Exception as e:
            # Best effort: skip pages the server cannot handle, but say why.
            # Catching Exception (not a bare except) keeps the loop interruptible.
            print('Skipping {!r} (index {}): {!r}'.format(title, i, e))

In [None]:
# Persist the collected coreference results so a later run can reload them.
with open('coreferences_dev_stanford.pickle', 'wb') as coref_file:
    pickle.dump(coref, coref_file)

### Visualization

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing them
# from IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML
# Add css styles and js events to DOM, so that they are available to rendered html
for asset_path in ('gpr_pub/visualization/highlight.css',
                   'gpr_pub/visualization/highlight.js'):
    # Read through a context manager so the file handle is closed promptly.
    with open(asset_path) as asset_file:
        display(HTML(asset_file.read()))

In [None]:
from gpr_pub import visualization

In [None]:
# Sample passage, already tokenized and using -LRB-/-RRB- in place of parentheses.
text = 'Andrew Kevin Walker -LRB- born August 14 , 1964 -RRB- is an American BAFTA-nominated screenwriter . He is known for having written Seven -LRB- 1995 -RRB- , for which he earned a nomination for the BAFTA Award for Best Original Screenplay , as well as several other films , including 8mm -LRB- 1999 -RRB- , Sleepy Hollow -LRB- 1999 -RRB- and many uncredited script rewrites . '
# Annotate the passage with the same annotator pipeline used for the extraction loop.
data = stanford_model.api_call(text, properties={'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'})
# Render the server response inline in the notebook using the gpr_pub visualization helper.
visualization.render(data, stanford=True, jupyter=True)