In [47]:
from conllu import parse
import os
import shutil

def ontonotes_to_scorer(path_conllu, path_out):
    """Rewrite every file in ``path_conllu`` as a ``.v4_gold_conll`` file in ``path_out``.

    Each input file is parsed as an 18-column, tab-separated token table
    (columns are addressed by the string keys ``"0"`` .. ``"17"``).  For every
    token the columns are rearranged as follows:

    * ``0``  -> ``'paw/<document name>'``
    * ``1``  -> part number, always ``0``
    * ``2``  -> zero-based index of the token inside its sentence
    * ``3``  -> the word form (taken from the original column ``1``)
    * ``4``-``9``  -> ``'_'`` placeholders
    * ``10``-``16`` -> removed
    * ``17`` -> kept as is, except that ``'_'`` becomes ``'-'``

    The result is written between ``#begin document`` / ``#end document``
    marker lines.

    Parameters
    ----------
    path_conllu : str
        Directory containing the input files (one document per file).
    path_out : str
        Output directory.  It is DELETED and re-created on every call.

    Raises
    ------
    KeyError
        If a token row does not provide all 18 columns.
    OSError
        If the input directory cannot be read or the output cannot be written.
    """
    # Start from a clean output directory so stale results never survive a re-run.
    if os.path.exists(path_out):
        shutil.rmtree(path_out)
    os.makedirs(path_out, exist_ok=True)

    # Column names "0" .. "17" handed to the parser.
    fields = [str(col) for col in range(18)]

    # Sorted so that processing (and the printed log) is reproducible across platforms.
    for doc in sorted(os.listdir(path_conllu)):
        print(doc)

        # Document name = file name without its final extension
        # (for '*.conll' inputs this equals the former ``doc[:-6]``).
        doc_name = os.path.splitext(doc)[0]
        doc_id = 'paw/' + doc_name

        # read the file as string
        with open(os.path.join(path_conllu, doc), 'r', encoding='utf-8') as f:
            data = f.read()

        # parse all the fields in a given document
        sentences = parse(data, fields=fields)

        # The trailing chunk of a file may parse into a sentence without tokens.
        # Drop it only when it really is empty, so a genuine last sentence is
        # never discarded.
        if sentences and len(sentences[-1]) == 0:
            sentences = sentences[:-1]

        # rearrange data to make it suitable for neuralcoref
        for sentence in sentences:
            # remove unnecessary metadata
            sentence.metadata = {}
            for i, token in enumerate(sentence):
                t_form = token['1']

                # doc name and part number go to the first two columns
                token['0'] = doc_id
                token['1'] = 0
                # token index (starting at 0) and word form are shifted to the next columns
                token['2'] = i
                token['3'] = t_form

                # Blank out the remaining annotation columns.  Per the original
                # author's note, neuralcoref looks for speaker data in the 9th
                # column and the corpus does not contain this information.
                for col in range(4, 10):
                    token[str(col)] = '_'

                # remove the columns that are not needed at all
                for col in range(10, 17):
                    del token[str(col)]

                # Coreference chains live in the last column; '-' marks an empty value.
                if token['17'] == '_':
                    token['17'] = '-'

        # write to .v4_gold_conll file
        out_file = os.path.join(path_out, doc_name + '.v4_gold_conll')
        with open(out_file, 'a', encoding='utf-8') as f:
            f.write('#begin document (' + doc_id + '); part 000\n')
            for sentence in sentences:
                f.write(sentence.serialize())
            # Terminate the marker line so the file ends with a newline.
            f.write('#end document\n')

In [49]:
# Input directory with one .conll file per document, and the directory that
# receives the converted files (the output directory is wiped on every run).
# Built with os.path.join so the paths also resolve on non-Windows systems.
path_conllu = os.path.join('.', 'corpora', 'paws', 'data', 'conll')
path_out = os.path.join('.', 'corpora', 'paws', 'data', 'conll_neuralcoref')

# Call the converter that this notebook actually defines; the previously used
# name `paws_to_neuralcoref` is not defined in any visible cell and therefore
# fails after Restart Kernel -> Run All.
ontonotes_to_scorer(path_conllu, path_out)

wsj_1900.cs.conll
wsj_1900.en.conll
wsj_1900.pl.conll
wsj_1900.ru.conll
wsj_1901.cs.conll
wsj_1901.en.conll
wsj_1901.pl.conll
wsj_1901.ru.conll
wsj_1902.cs.conll
wsj_1902.en.conll
wsj_1902.pl.conll
wsj_1902.ru.conll
wsj_1903.cs.conll
wsj_1903.en.conll
wsj_1903.pl.conll
wsj_1903.ru.conll
wsj_1904.cs.conll
wsj_1904.en.conll
wsj_1904.pl.conll
wsj_1904.ru.conll
wsj_1905.cs.conll
wsj_1905.en.conll
wsj_1905.pl.conll
wsj_1905.ru.conll
wsj_1906.cs.conll
wsj_1906.en.conll
wsj_1906.pl.conll
wsj_1906.ru.conll
wsj_1907.cs.conll
wsj_1907.en.conll
wsj_1907.pl.conll
wsj_1907.ru.conll
wsj_1908.cs.conll
wsj_1908.en.conll
wsj_1908.pl.conll
wsj_1908.ru.conll
wsj_1909.cs.conll
wsj_1909.en.conll
wsj_1909.pl.conll
wsj_1909.ru.conll
wsj_1910.cs.conll
wsj_1910.en.conll
wsj_1910.pl.conll
wsj_1910.ru.conll
wsj_1911.cs.conll
wsj_1911.en.conll
wsj_1911.pl.conll
wsj_1911.ru.conll
wsj_1912.cs.conll
wsj_1912.en.conll
wsj_1912.pl.conll
wsj_1912.ru.conll
wsj_1913.cs.conll
wsj_1913.en.conll
wsj_1913.pl.conll
wsj_1913.r