In [62]:
from conllu import parse
import os
import shutil

def paws_to_scorer(path_conllu, path_out, for_cort=False):
    """Rewrite every ``*.en.conll`` file of a directory into scorer-style ``.conll`` files.

    Each input file is parsed with ``conllu.parse`` using 18 positional columns
    named "0".."17".  Every token is then rearranged:

    * column 0 - document name (the file name without its ``.conll`` suffix),
    * column 1 - part number, always ``0``,
    * column 2 - zero-based index of the token inside its sentence,
    * column 3 - word form (taken from input column 1),
    * column 17 - kept as the last column, with ``_`` replaced by ``-``.

    One output file ``<document name>.conll`` is written per input file,
    framed by a ``#begin document`` header and a ``#end document`` footer.

    Parameters
    ----------
    path_conllu : str
        Directory with the input files; entries not ending in ``.en.conll``
        are skipped.
    path_out : str
        Output directory.  It is deleted (if present) and recreated on every
        call.
    for_cort : bool, default False
        If True, columns 4-9 are kept too (``_`` becomes ``-`` in column 4
        and ``*`` in column 5) and columns 10-16 are removed; otherwise
        columns 4-16 are removed.

    Raises
    ------
    KeyError
        If a token line has fewer than 18 columns.
    """
    # always start from a clean output directory
    if os.path.exists(path_out):
        shutil.rmtree(path_out)
    os.makedirs(path_out, exist_ok=True)

    field_names = [str(col) for col in range(18)]
    # columns first_dropped..16 are removed; column 17 is always kept
    first_dropped = 10 if for_cort else 4
    dropped_cols = [str(col) for col in range(first_dropped, 17)]

    # sorted() makes the processing (and printing) order deterministic
    for doc in sorted(os.listdir(path_conllu)):
        if not doc.endswith('.en.conll'):
            continue
        print(doc)
        # 'name.en.conll' -> 'name.en'
        doc_name = doc[:-6]

        # read the file as string
        with open(os.path.join(path_conllu, doc), 'r', encoding='utf-8') as f:
            data = f.read()

        # parse all the fields in a given conllu document
        sentences = parse(data, fields=list(field_names))
        # A trailing chunk without token lines is parsed as a sentence with no
        # tokens.  Drop it only when it really is empty, so that a file
        # without such a chunk does not lose its last real sentence.
        if len(sentences) > 0 and len(sentences[-1]) == 0:
            sentences = sentences[:-1]

        # rearrange data to make it suitable for scorer
        for sentence in sentences:
            # remove unnecessary metadata
            sentence.metadata = {}
            for token_idx, token in enumerate(sentence):
                form = token['1']

                # doc name and part number go first ...
                token['0'] = doc_name
                token['1'] = 0
                # ... followed by the token index (starting at 0) and the form
                token['2'] = token_idx
                token['3'] = form

                if token['4'] == '_':
                    token['4'] = '-'
                if token['5'] == '_':
                    token['5'] = '*'

                # remove unnecessary cols
                for col in dropped_cols:
                    del token[col]

                # the scorer accepts '-' for empty values
                if token['17'] == '_':
                    token['17'] = '-'

        # write to .conll file
        with open(os.path.join(path_out, doc_name + '.conll'), 'a', encoding='utf-8') as f:
            f.write('#begin document (' + doc_name + '); part 000\n')
            for sentence in sentences:
                f.write(sentence.serialize())
            f.write('#end document ')

In [57]:
# Convert the PAWS CoNLL directory into scorer files under ./scorer/key.
# The paths are built with os.path.join so the cell also runs outside Windows;
# on Windows the resulting strings are the same as the backslash literals.
path_conllu = os.path.join('.', 'corpora', 'paws', 'data', 'conll')
path_out = os.path.join('.', 'scorer', 'key')

paws_to_scorer(path_conllu, path_out)

wsj_1900.en.conll
wsj_1901.en.conll
wsj_1902.en.conll
wsj_1903.en.conll
wsj_1904.en.conll
wsj_1905.en.conll
wsj_1906.en.conll
wsj_1907.en.conll
wsj_1908.en.conll
wsj_1909.en.conll
wsj_1910.en.conll
wsj_1911.en.conll
wsj_1912.en.conll
wsj_1913.en.conll
wsj_1914.en.conll
wsj_1915.en.conll
wsj_1916.en.conll
wsj_1917.en.conll
wsj_1918.en.conll
wsj_1919.en.conll
wsj_1920.en.conll
wsj_1921.en.conll
wsj_1922.en.conll
wsj_1923.en.conll
wsj_1924.en.conll
wsj_1925.en.conll
wsj_1926.en.conll
wsj_1927.en.conll
wsj_1928.en.conll
wsj_1929.en.conll
wsj_1930.en.conll
wsj_1931.en.conll
wsj_1932.en.conll
wsj_1933.en.conll
wsj_1934.en.conll
wsj_1935.en.conll
wsj_1936.en.conll
wsj_1937.en.conll
wsj_1938.en.conll
wsj_1939.en.conll
wsj_1940.en.conll
wsj_1941.en.conll
wsj_1942.en.conll
wsj_1943.en.conll
wsj_1944.en.conll
wsj_1945.en.conll
wsj_1946.en.conll
wsj_1947.en.conll
wsj_1948.en.conll
wsj_1949.en.conll


In [64]:
from conllu import parse
import os
import shutil

def ontonotes_to_scorer(path_conllu, path_out, path_plain, for_cort=False):
    """Split one CoNLL file into per-document scorer files (and plain-text files).

    The input is parsed with ``conllu.parse`` using 21 positional columns
    named "0".."20".  For every token, column 0 is reduced to the part after
    its last ``/`` and the value of the token's highest-numbered column is
    copied to a fixed position; all columns after that position are removed.

    Consecutive sentences sharing the same (column 0, column 1) pair form one
    document.  Each document is appended to
    ``<path_out>/<col0>_<col1>.en.conll`` framed by ``#begin document`` /
    ``#end document`` lines.

    Parameters
    ----------
    path_conllu : str
        Path of the input CoNLL file.
    path_out : str
        Output directory for the ``.en.conll`` files.  It is deleted (if
        present) and recreated on every call.
    path_plain : str
        Output directory for one-sentence-per-line ``.en.txt`` files built
        from column 3.  Deleted and recreated as well.  Ignored when
        ``for_cort`` is True.
    for_cort : bool, default False
        If True, columns 0-9 are kept (``_`` becomes ``-`` in column 4 and
        ``*`` in column 5), the last column's value is stored in column 10
        and no plain-text files are written.  Otherwise columns 0-3 are kept
        and the last column's value is stored in column 4.
    """
    # always start from clean output directories
    if os.path.exists(path_out):
        shutil.rmtree(path_out)
    os.makedirs(path_out, exist_ok=True)

    if not for_cort:
        if os.path.exists(path_plain):
            shutil.rmtree(path_plain)
        os.makedirs(path_plain, exist_ok=True)

    with open(path_conllu, 'r', encoding='utf-8') as f:
        data = f.read()

    # parse all the fields in a given conllu document
    # NOTE(review): only 21 columns are named here; presumably longer rows do
    # not occur or are truncated by the parser - confirm against the data.
    sentences = parse(data, fields=[str(col) for col in range(21)])
    # Chunks that contain no token lines (e.g. a trailing comment-only chunk)
    # are parsed as sentences without tokens.  Drop exactly those, instead of
    # blindly dropping the last sentence, so no real sentence is lost and
    # `sentence[0]` below is always valid.
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    # NOTE(review): unlike paws_to_scorer, sentence.metadata is not cleared
    # here; whether serialize() re-emits parsed comment lines depends on the
    # conllu version - verify the generated files.

    # rearrange data to make it suitable for scorer
    first_dropped = 11 if for_cort else 5
    for sentence in sentences:
        for token in sentence:
            # modify doc name for unification purposes
            token['0'] = token['0'].split('/')[-1]

            # rows differ in length; the highest-numbered column is copied
            # to a fixed position
            last_col = str(max(token, key=int))
            if for_cort:
                token['10'] = token[last_col]
                if token['4'] == '_':
                    token['4'] = '-'
                if token['5'] == '_':
                    token['5'] = '*'
            else:
                token['4'] = token[last_col]

            # remove unnecessary cols: everything after the copied value,
            # including column 20 on full-width rows
            for col in [name for name in token if int(name) >= first_dropped]:
                del token[col]

    # group consecutive sentences that belong to the same document part
    documents = []
    for sentence in sentences:
        doc_key = (sentence[0]['0'], sentence[0]['1'])
        if not documents or documents[-1][0] != doc_key:
            documents.append((doc_key, []))
        documents[-1][1].append(sentence)

    # write to .conll (and .txt) files; append mode keeps earlier content if
    # the same document part shows up again later in the input
    for (header, part), doc_sentences in documents:
        doc_id = header + '_' + part
        with open(os.path.join(path_out, doc_id + '.en.conll'), 'a', encoding='utf-8') as f:
            f.write('#begin document (' + doc_id + '); part 000\n')
            for sentence in doc_sentences:
                f.write(sentence.serialize())
            # every document gets its footer, including the last one
            f.write('#end document\n')
        if not for_cort:
            with open(os.path.join(path_plain, doc_id + '.en.txt'), 'a', encoding='utf-8') as f:
                for sentence in doc_sentences:
                    f.write(' '.join([t['3'] for t in sentence]) + '\n')
        

In [50]:
# Convert the OntoNotes test file into per-document scorer files and
# plain-text files.  The paths are built with os.path.join so the cell also
# runs outside Windows; on Windows the resulting strings are unchanged.
path_conllu = os.path.join('.', 'OntoNotes', 'test.english.v4_gold_conll')
path_out = os.path.join('.', 'scorer', 'key_ontonotes')
path_plain = os.path.join('.', 'OntoNotes', 'plain')

ontonotes_to_scorer(path_conllu, path_out, path_plain)

In [65]:
# Produce the for_cort=True variants of both corpora under ./scorer/cort.
# The paths are built with os.path.join so the cell also runs outside Windows;
# on Windows the resulting strings are unchanged.
path_conllu = os.path.join('.', 'paws', 'data', 'conll')
path_out = os.path.join('.', 'scorer', 'cort', 'key')

paws_to_scorer(path_conllu, path_out, for_cort=True)

path_conllu = os.path.join('.', 'OntoNotes', 'test.english.v4_gold_conll')
path_out = os.path.join('.', 'scorer', 'cort', 'key_ontonotes')
# no plain-text output is written when for_cort=True, so this stays empty
path_plain = ''

ontonotes_to_scorer(path_conllu, path_out, path_plain, for_cort=True)

wsj_1900.en.conll
wsj_1901.en.conll
wsj_1902.en.conll
wsj_1903.en.conll
wsj_1904.en.conll
wsj_1905.en.conll
wsj_1906.en.conll
wsj_1907.en.conll
wsj_1908.en.conll
wsj_1909.en.conll
wsj_1910.en.conll
wsj_1911.en.conll
wsj_1912.en.conll
wsj_1913.en.conll
wsj_1914.en.conll
wsj_1915.en.conll
wsj_1916.en.conll
wsj_1917.en.conll
wsj_1918.en.conll
wsj_1919.en.conll
wsj_1920.en.conll
wsj_1921.en.conll
wsj_1922.en.conll
wsj_1923.en.conll
wsj_1924.en.conll
wsj_1925.en.conll
wsj_1926.en.conll
wsj_1927.en.conll
wsj_1928.en.conll
wsj_1929.en.conll
wsj_1930.en.conll
wsj_1931.en.conll
wsj_1932.en.conll
wsj_1933.en.conll
wsj_1934.en.conll
wsj_1935.en.conll
wsj_1936.en.conll
wsj_1937.en.conll
wsj_1938.en.conll
wsj_1939.en.conll
wsj_1940.en.conll
wsj_1941.en.conll
wsj_1942.en.conll
wsj_1943.en.conll
wsj_1944.en.conll
wsj_1945.en.conll
wsj_1946.en.conll
wsj_1947.en.conll
wsj_1948.en.conll
wsj_1949.en.conll
