In [1]:
import spacy
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
import neuralcoref

# Load spaCy's small English pipeline; the cells below all use this `nlp` object.
nlp = spacy.load("en_core_web_sm")


# Register NeuralCoref as a named pipeline component so that parsed docs carry
# the `doc._.coref_*` attributes printed further down.
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')


# Add rule to not split on hyphens
def custom_tokenizer(nlp):
    """Return a Tokenizer for ``nlp`` that does not split on intra-word hyphens.

    Starting from ``nlp.Defaults.infixes``, the infix rules are adjusted so that:

    * the generic "operator between digits" rule no longer matches a ``-``
      between two digits (``+``, ``*`` and ``^`` after a digit are still
      infixes, as is a ``-`` that follows a digit and precedes another ``-``);
    * every rule containing the dash alternation ``-|–|—|--|---|——|~`` is dropped.

    Prefix search, suffix search and ``token_match`` are taken from the current
    ``nlp.tokenizer``; the special-case rules come from
    ``nlp.Defaults.tokenizer_exceptions``.

    Args:
        nlp: Loaded spaCy language object.

    Returns:
        A new ``Tokenizer`` built on ``nlp.vocab``.
    """
    # Exact default rule to replace: an operator between digits, or between a digit and "-".
    numeric_op_rule = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    # Substring identifying the default "dash between letters" rule.
    dash_rule_marker = '-|–|—|--|---|——|~'

    # Filter instead of list.remove(): remove() raises ValueError when the rule
    # is not present verbatim, which would abort the cell even though the
    # replacement rules below remain valid on their own.
    infixes = [
        rule for rule in nlp.Defaults.infixes
        if rule != numeric_op_rule and dash_rule_marker not in rule
    ]
    # Re-add the numeric rule without the digit-hyphen-digit case.
    infixes += [
        r"(?<=[0-9])[+*^](?=[0-9-])",
        r"(?<=[0-9])-(?=-)",
    ]
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)


# Swap the hyphen-preserving tokenizer into the pipeline.
nlp.tokenizer = custom_tokenizer(nlp)

# Strings that must always come out as exactly one token.
for literal in ("90m", "``"):
    nlp.tokenizer.add_special_case(literal, [{ORTH: literal}])


# Parse in labels
from xml.dom import minidom

# Read the coreference annotation layer and collect its <markable> elements.
file = minidom.parse('data/WikiCoref/Annotation/Aberfoyle,_Stirling/Markables/Aberfoyle, Stirling_coref_level.xml')
models = file.getElementsByTagName('markable')

# One plain dict per markable: XML attribute name -> attribute value.
items = [
    {
        model.attributes.item(k).nodeName: model.attributes.item(k).value
        for k in range(len(model.attributes))
    }
    for model in models
]

    
# Create doc from words
# Load the base-data XML and collect its <word> elements.
# NOTE: `doc` holds the XML DOM only until it is rebound to the spaCy Doc
# produced by nlp(s) later in this cell.
doc = minidom.parse("data/WikiCoref/Annotation/Aberfoyle,_Stirling/Basedata/Aberfoyle, Stirling_words.xml")

words = doc.getElementsByTagName("word")

def getWord(word, offset=0):
    """Convert a ``<word>`` DOM element into an ``{'id': ..., 'word': ...}`` dict.

    The value of the element's first attribute is split on ``_``; the first
    piece is kept as the prefix and the second is parsed as an integer and
    shifted by ``offset``. ``'word'`` is the data of the element's first child.
    """
    id_parts = word.attributes.item(0).value.split('_')
    shifted_number = int(id_parts[1]) + offset
    return {'id': f"{id_parts[0]}_{shifted_number}", "word": word.firstChild.data}

# `offset` is added to the numeric part of every word id. It stays 0: the
# earlier experiment that split hyphenated words (other than '-LRB-' / '-RRB-')
# into word / hyphen / word entries and advanced the offset by 2 is disabled.
offset = 0
wordList = [getWord(word, offset) for word in words]

# Re-create the text as the words separated (and terminated) by single spaces.
s = ''.join(entry["word"] + ' ' for entry in wordList)

doc = nlp(s)

# Sanity check: token i-1 of the spaCy doc should be the text of word_i.
# Report the first disagreement and stop.
for entry in wordList:
    position = int(entry["id"].split("_")[1]) - 1
    # A doc with fewer tokens than words is reported as a mismatch too,
    # instead of failing with IndexError.
    token_text = str(doc[position]) if position < len(doc) else None
    if token_text != entry["word"]:
        print("spacy", token_text)
        print("mmax", entry["word"])
        break

In [2]:
# Show the first parsed markable to check the layout of the attribute dicts.
print(items[0])

{'id': 'markable_318', 'span': 'word_1..word_1', 'coref_class': 'set_47', 'topic': 'http://rdf.freebase.com/ns/m.010kl', 'coreftype': 'ident', 'mentiontype': 'ne', 'mmax_level': 'coref'}


In [3]:
# TODO reenable printing

for item in items:
    # item["span"] looks like "word_A..word_B"; A and B are 1-based and inclusive.
    word1, word2 = (int(part.split("word_")[1]) for part in item["span"].split(".."))
    # Always slice, even for a single-word markable (word1 == word2): indexing
    # with doc[word1-1] would give a Token, and NeuralCoref registers
    # `coref_scores` on Span and Doc only, so the disabled code below would
    # fail on a Token once re-enabled.
    span = doc[(word1 - 1):word2]
    # print(span)
    # if span._.coref_scores is not None:
    #     values = list(span._.coref_scores.values())
    #     keys = list(span._.coref_scores.keys())
    #     m = max(values)
    #     i = values.index(m)
    #     print(keys[i], m)
    # else:
    #     print("No coreferring entities found.")

    # print()

In [4]:
import requests
import urllib.parse
import json



# Passage to analyse in this cell; the literal begins and ends with a newline.
s = '''
Aberfoyle is a village in the region of Stirling, Scotland, northwest of Glasgow.
The town is situated on the River Forth at the base of Craigmore (420 metres high). Since 1885, when the Duke of Montrose constructed a road over the eastern shoulder of Craigmore to join the older road at the entrance of the Trossachs pass, Aberfoyle has become the alternative route to the Trossachs and Loch Katrine; this road, known as the Duke's Road or Duke's Pass, was opened to the public in 1931 when the Forestry Commission acquired the land.
Loch Ard, about two miles (3 km) west of Aberfoyle, lies 40 metres above the sea. It is three miles (5 km) long (including the narrows at the east end) and one mile (1½ km) broad. Towards the west end is Eilean Gorm (the green isle), and near the north-western shore are the falls of Ledard. Two miles northwest is Loch Chon, at 90m above the sea, 1+1/4 mi long, and about half a mile broad. It drains by the Avon Dhu to Loch Ard, which is drained in turn by the Forth.
As late as 1790, all the residents in the parish of Aberfoyle spoke Scottish Gaelic.
'''

# Parse locally with the spaCy + NeuralCoref pipeline (rebinds the global `doc`).
doc = nlp(s)
# Alias of the same string, used to build the request URL below.
text = s


# Send the same passage, percent-encoded in the query string, to the hosted
# coreference demo.
URL = "https://coref.huggingface.co/coref?text=" + urllib.parse.quote(text)

# Bound the wait: without a timeout, requests can block indefinitely when the
# remote service is unreachable, which hangs the kernel on Run All.
# NOTE(review): the recorded response for this cell is HTTP 200 with every
# field empty — confirm the endpoint's expected method/parameters.
r = requests.post(url=URL, timeout=30)
print(r.status_code)
# print(r.text)
print(json.dumps(r.json(), indent=2))

# with open('data.txt', 'w') as outfile:
#     json.dump(r.json(), outfile)


200
{
  "cleanedText": "",
  "corefResText": "",
  "coreferences": [],
  "mentions": [],
  "singleScores": {},
  "pairScores": {},
  "cleanedContext": "",
  "isResolved": false
}


In [5]:
# Coreference results attached to the current `doc`: the mention clusters, the
# text with mentions rewritten (e.g. "The town" -> "Aberfoyle" in the recorded
# output), and the scores.
print(doc._.coref_clusters)
print(doc._.coref_resolved)
print(doc._.coref_scores)



[
Aberfoyle: [
Aberfoyle, The town, Aberfoyle, Aberfoyle], the sea: [the sea, the sea], 1+1/4 mi: [1+1/4 mi, It]]

Aberfoyle is a village in the region of Stirling, Scotland, northwest of Glasgow.

Aberfoyle is situated on the River Forth at the base of Craigmore (420 metres high). Since 1885, when the Duke of Montrose constructed a road over the eastern shoulder of Craigmore to join the older road at the entrance of the Trossachs pass, 
Aberfoyle has become the alternative route to the Trossachs and Loch Katrine; this road, known as the Duke's Road or Duke's Pass, was opened to the public in 1931 when the Forestry Commission acquired the land.
Loch Ard, about two miles (3 km) west of Aberfoyle, lies 40 metres above the sea. It is three miles (5 km) long (including the narrows at the east end) and one mile (1½ km) broad. Towards the west end is Eilean Gorm (the green isle), and near the north-western shore are the falls of Ledard. Two miles northwest is Loch Chon, at 90m above the sea,

In [6]:
# import csv

# tsv_file = open("gap-development.tsv")
# read_tsv = csv.reader(tsv_file, delimiter="\t")

# count = 0
# for row in read_tsv:
#     if count > 0:
#         print(text)
#         text = row[1]
#         doc = nlp(text)
#         print(doc._.coref_clusters)
#     count += 1

# tsv_file.close()



In [8]:
# Same passage as the one assigned to `s` in the request cell; defined again
# here so that this cell can be run on its own.
text = '''
Aberfoyle is a village in the region of Stirling, Scotland, northwest of Glasgow.
The town is situated on the River Forth at the base of Craigmore (420 metres high). Since 1885, when the Duke of Montrose constructed a road over the eastern shoulder of Craigmore to join the older road at the entrance of the Trossachs pass, Aberfoyle has become the alternative route to the Trossachs and Loch Katrine; this road, known as the Duke's Road or Duke's Pass, was opened to the public in 1931 when the Forestry Commission acquired the land.
Loch Ard, about two miles (3 km) west of Aberfoyle, lies 40 metres above the sea. It is three miles (5 km) long (including the narrows at the east end) and one mile (1½ km) broad. Towards the west end is Eilean Gorm (the green isle), and near the north-western shore are the falls of Ledard. Two miles northwest is Loch Chon, at 90m above the sea, 1+1/4 mi long, and about half a mile broad. It drains by the Avon Dhu to Loch Ard, which is drained in turn by the Forth.
As late as 1790, all the residents in the parish of Aberfoyle spoke Scottish Gaelic.
'''

# Parse with the local pipeline and print the three document-level
# coreference attributes.
doc = nlp(text)
print(doc._.coref_clusters)
print(doc._.coref_resolved)
print(doc._.coref_scores)

[
Aberfoyle: [
Aberfoyle, The town, Aberfoyle, Aberfoyle], the sea: [the sea, the sea], 1+1/4 mi: [1+1/4 mi, It]]

Aberfoyle is a village in the region of Stirling, Scotland, northwest of Glasgow.

Aberfoyle is situated on the River Forth at the base of Craigmore (420 metres high). Since 1885, when the Duke of Montrose constructed a road over the eastern shoulder of Craigmore to join the older road at the entrance of the Trossachs pass, 
Aberfoyle has become the alternative route to the Trossachs and Loch Katrine; this road, known as the Duke's Road or Duke's Pass, was opened to the public in 1931 when the Forestry Commission acquired the land.
Loch Ard, about two miles (3 km) west of Aberfoyle, lies 40 metres above the sea. It is three miles (5 km) long (including the narrows at the east end) and one mile (1½ km) broad. Towards the west end is Eilean Gorm (the green isle), and near the north-western shore are the falls of Ledard. Two miles northwest is Loch Chon, at 90m above the sea,