In [1]:
from conllu import parse
import os
import shutil

def _coref_attr(attrs, key):
    """Return the value of the first ``key=value`` item of ``attrs`` that contains ``key``.

    Returns '_' when no item matches. When several items match, their count is
    printed (as a diagnostic) and the first one is used.
    """
    matching = [s for s in attrs if key in s]
    if not matching:
        return '_'
    if len(matching) > 1:
        print(len(matching))
    return matching[0].split('=')[1]


def _mark_coref_chains(sentences):
    """Fill token['coref'] with bracketed chain ids, one bracket pair per group_id.

    The first token seen for a group gets "(chain_id", the last one gets
    "chain_id)", a single-token group gets "(chain_id)", and tokens in between
    keep '-'.
    """
    # group_id -> (sentence index, token index) of the latest token seen for that group
    last_seen = {}
    for i, sentence in enumerate(sentences):
        for j, token in enumerate(sentence):
            if token['chain_id'] == '_':
                continue

            group_id = token['group_id']
            if group_id in last_seen:
                # The group continues: the current token becomes its new end ...
                token['coref'] = token['chain_id'] + ')'
                # ... and the previous end loses its closing bracket. If that
                # token also opened the group it keeps "(chain_id"; otherwise it
                # is an interior token and must be reset to '-', not left as a
                # bare chain id.
                i_prev, j_prev = last_seen[group_id]
                previous = sentences[i_prev][j_prev]
                if previous['coref'].startswith('('):
                    previous['coref'] = previous['coref'][:-1]
                else:
                    previous['coref'] = '-'
            else:
                # first occurrence of the group
                token['coref'] = '(' + token['chain_id'] + ')'

            last_seen[group_id] = (i, j)


def rucor_to_neuralcoref(path_conllu, path_out):
    """Convert every file in ``path_conllu`` to a ``.v4_gold_conll`` file in ``path_out``.

    ``path_out`` is deleted if it already exists and then recreated. Each input
    file is parsed as 10-column CoNLL-U whose "misc" column carries "|"-separated
    items, among them ``RuCor_group_id=...`` and ``RuCor_chain_id=...``. For every
    token the columns are rearranged (document name, 0, zero-based token index,
    word form, ...), the "misc" column is emptied, and three columns are
    appended: group_id, chain_id and a bracketed coreference column.

    Sentences containing a token whose id is not an integer are printed and
    dropped from the output.

    Parameters:
        path_conllu: directory with the input files; the last 6 characters of
            each file name (i.e. ".conll") are stripped to get the document name.
        path_out: output directory; the last 3 characters of its final path
            component are used as the prefix of the document name.
    """
    if os.path.exists(path_out):
        shutil.rmtree(path_out)
    os.makedirs(path_out, exist_ok=True)

    # prefix of the document name; it does not depend on the file being converted
    doc_prefix = path_out.split(os.path.sep)[-1][-3:]

    # coreference data is written in the "misc" col and divided by "|"
    split_func = lambda line, i: line[i].split("|")

    # iterate over files in dir
    for doc in os.listdir(path_conllu):
        print(doc)
        doc_name = doc_prefix + '/' + doc[:-6]

        # read the file as string
        with open(os.path.join(path_conllu, doc), 'r', encoding='utf-8') as f:
            data = f.read()

        # parse Rucor conllu from string
        parsed = parse(data,
                       fields=["id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"],
                       field_parsers={"misc": split_func})

        # Split the sentences into clean ones (kept) and dirty ones (dropped) in
        # a single pass, so no sentence-to-sentence comparison is needed later.
        sentences = []
        dirt = []
        for sentence in parsed:
            # remove unnecessary metadata
            sentence.metadata = {}
            for token in sentence:
                # the corpus contains non-sentence data; such sentences are dropped
                if not isinstance(token['id'], int):
                    dirt.append(sentence)
                    break

                # remember id (it should start from 0 instead of 1) and form
                t_id = token['id'] - 1
                t_form = token['form']

                # add doc name and part num to the beginning; shift id and form to the next cols
                token['id'] = doc_name
                token['form'] = 0
                token['lemma'] = t_id
                token['upos'] = t_form

                # read coref data and empty the col
                # neuralcoref looks for Speaker data in the 9th col ("misc"); the corpus does not contain this information
                attrs = token['misc']
                token['misc'] = None

                # extract group_id and chain_id from coref data and add them as cols
                token['group_id'] = _coref_attr(attrs, "RuCor_group_id")
                token['chain_id'] = _coref_attr(attrs, "RuCor_chain_id")

                # neuralcoref looks for coreference chains in the last col; it accepts '-' for empty values
                token['coref'] = '-'
            else:
                # no dirty token was found in this sentence
                sentences.append(sentence)

        # report the dropped sentences (tokens converted before the dirty one show form 0)
        for d in dirt:
            print(' '.join(str(t['form']) for t in d))

        # add coreference chains to token["coref"]
        _mark_coref_chains(sentences)

        # write to .conll file (path_out was recreated above, so appending starts from an empty file)
        with open(os.path.join(path_out, doc[:-6]+'.v4_gold_conll'), 'a', encoding='utf-8') as f:
            f.write('#begin document (' + doc_name + '); part 000\n')
            for sentence in sentences:
                f.write(sentence.serialize())
            f.write('#end document ')

In [29]:
# Corpus locations, built with os.path.join so the separators are right on any OS
# (on Windows this yields the same strings as the backslash-separated literals).
path_conllu = os.path.join('.', 'rucoref_29.10.2015', 'parsed_testset')
path_out = os.path.join('.', 'rucoref_29.10.2015', 'parsed_testset_neuralcoref')

# Every entry of path_conllu is handed to the converter as a directory of
# documents, and is converted into an entry of the same name under path_out.
for doc in os.listdir(path_conllu):
    rucor_to_neuralcoref(os.path.join(path_conllu, doc), os.path.join(path_out, doc))

102_beliajev_nad_bezdnoj.conll
107_dragunsky_volshebnaja_sila_iskusstva.conll
15_paustovsky_zhilcy_starogo_doma.conll
2_astafiev_zhizn_prozhit.conll
30_dojl_sluchaj.conll
34_kassil_solnce_svetit.conll
43_musatov_stozhary.conll
44_nagibin_siren.conll
53_beliajev_dom_s_prividenijami.conll
5_petrushevskaya_v_detstve.conll
67_zamiatin_kolumb.conll
73_ilf_schastlivy_otec.conll
andersen_motylek.conll
bazhov_travyanaja_zapadenka.conll
bunin_skazka.conll
dostojevskij_podrostok.conll
dovlatov_kompromiss_6.conll
fet_knyaginya.conll
gilyarovskij_moi_skitanija.conll
gogol_zapiski_3.conll
harms_upadanije.conll
_ сентября _
korolenko_mgnovenije.conll
strugackije_ponedelnik.conll
turgenev_veshnije_vody.conll
2013_04_11_dotless_.conll
2013_07_31_krebs_.conll
lenta.ru-news-2014-01-19-cutshort.conll
lenta.ru-news-2014-01-24-if.conll
lenta.ru-news-2014-01-30-crimea.conll
lenta.ru-news-2014-02-03-capitanic.conll
lenta.ru-news-2014-02-03-london.conll
lenta.ru-news-2014-02-03-name1.conll
lenta.ru-news-2014-

philology.ru-linguistics1-alpatov-12-out2.conll
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 _ .
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 _ , отсюда миф об их происхождении от смешения людей с собаками .
philology.ru-linguistics1-barannikov-46-out2.conll
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 _ ) , полного ( 1852-1875 ) и краткого ( 1879-1889 ) .
0 0 0 0 0 0 0 0 0 0 0 _ , принимали участие не только работники Академии , но и многочисленные ученые разных стран Европы .
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 _ по линии изучения буддизма .
