In [2]:
import re
import json

import pandas as pd

from collections import Counter, defaultdict
from pprint import pprint

In [3]:
def no_corrections(alignment_dict):
    return len(alignment_dict) == 1 and len(alignment_dict['X']) == 0

In [4]:
def align_tokens(m2_block):
    unc_sentence, *edits = m2_block.splitlines()
    unc_tokens = re.split(
        r'\s+',
        unc_sentence
    )[1:]
    alignment_dict = {}
    alignment_dict['X'] = [] # For additions

    spans_outputs = []
    for edit in edits:
        coords, _, out, *_ = edit.split('|||')
        i, j = list(
            map(
                int,
                coords.split()[1:]))
        spans_outputs.append((
            (i,j),
            out))
    
    spans_outputs.sort()
    spans   = [el[0] for el in spans_outputs]
    outputs = [el[1] for el in spans_outputs]
    
    # If the edit is one-to-one, we align the beginning
    # of the edit with output_token_num.
    # We also keep track of one-word additions (zero-length input
    # spans) and deletions (zero-length output sequences).
    # Otherwise we skip the span and increase output_token_num
    # by the length of the output sequence.
    last_span_end = 0
    output_token_num = 0
    for idx in range(len(spans)):
        i, j = spans[idx]
        if i > last_span_end:
            output_token_num += i-last_span_end # Move cursor by the number of
                                                # copied tokens.
        out = outputs[idx]
#         print(spans[idx], f"inp: {' '.join(unc_tokens[i:j])}", f"out: {out}")
        out_len = len(out.split())
        if i-j == 0: # Addition
            if out_len == 1:
                alignment_dict['X'].append({
                    'idx': output_token_num,
                    'inp': ' '.join(unc_tokens[i:j]),
                    'out': out # For testing
                })
            output_token_num += out_len
        elif out == '': # Deletion
            if j-i == 1:
                alignment_dict[i] = 'X'
        elif j-i == out_len == 1: # One-to-one replacement
            alignment_dict[i] = {
                'idx': output_token_num,
                'inp': ' '.join(unc_tokens[i:j]),
                'out': out
            }
            output_token_num += 1
        else: # Something else
            output_token_num += out_len
        last_span_end = j
    return alignment_dict

In [5]:
def print_sentences(unc_tokens, cor_tokens):
    for i, tok in enumerate(unc_tokens):
        print(f'{tok}[{i}]', end=' ')
    print()
    for i, tok in enumerate(cor_tokens):
        print(f'{tok}[{i}]', end=' ')
    print('\n')
    
def test_alignment(m2_block, cor):
    cor_tokens = re.split(
        r'\s+',
        cor
    )
    cor = ' '.join(cor_tokens)
    unc_tokens = re.split(
        r'\s+',
        m2_block.splitlines()[0]
    )[1:]
    unc = ' '.join(unc_tokens)
    alignment_dict = align_tokens(m2_block)
    
    for k, v in alignment_dict.items():
        if v == 'X': # Don't test deletions for now
            continue
        if k == 'X':
            for el in v:
                i = el['idx']
                out = el['out']
                if cor_tokens[i] != out:
                    raise ValueError(f'{k}->{i}')
        else:
            i = v['idx']
            out = v['out']
            if cor_tokens[i] != out:
                raise ValueError(f'{k}->{i}')

In [6]:
def extract_features(field):
    """Returns CONLLU formatted UD features as a dictionary.
    A typical feature string:
    
    Case=Nom|Degree=Pos|Gender=Fem|Number=Sing
    
    Features and values are assumed to be one-to-one.
    """
    if field == '_':
        return {}
    return {
        feature: value for feature, value in [el.split('=') for el in field.split('|')]
    }

In [7]:
def conll2graph(record):
    """Converts sentences described using CoNLL-U format
    (http://universaldependencies.org/format.html) to graphs.
    Returns a dictionary of nodes (wordforms, POS tags, and feautures
    indexed by line numbers) together with a graph of the dependencies
    encoded as adjacency lists of (node_key, relation_label,
    direction[up or down]) tuples."""
    graph = {}
    nodes = {}
    for line in record.splitlines():
        if line.startswith("#"):
            continue
        fields = line.strip("\n").split("\t")
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc.
        # Ignore hidden additional nodes for orphan handling
        if "-" in key or "." in key:
            continue
        wordform = fields[1]
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        features = extract_features(fields[5])
        nodes[key] = {
            "wordform": wordform,
            "pos": pos,
            "relation": relation,
            "parent": parent,
            "features": features
        }
        if key not in graph:
            graph[key] = []
        if parent not in graph:
            graph[parent] = []
        graph[key].append((parent, relation, "up"))
        graph[parent].append((key, relation, "down"))
    return (nodes, graph)

In [8]:
def extract_sentence(nodes):
    tmp = []
    for v in nodes.values():
        tmp.append(v['wordform'])
    return ' '.join(tmp)

In [9]:
# Look for corrected sentences where the output as per alignment
# is not the same as per the edit annotation.
corrected = 0
errors = 0
for part in [
    'dev',
    'train',
    'test'
]:
    with open(f'm2_files/RULEC-GEC.{part}.M2', 'r') as inp:
        m2_dev_blocks = inp.read().strip().split('\n\n')
    with open(f'preprocessing/RULEC-GEC.{part}.corrected', 'r') as inp:
        m2_dev_cor_blocks = inp.readlines()
    assert(len(m2_dev_blocks) == len(m2_dev_cor_blocks))
    corrected = 0
    errors = 0
    for i in range(len(m2_dev_blocks)):
        alignment_dict = align_tokens(m2_dev_blocks[i])
        if no_corrections(alignment_dict):
            continue
        try:
            test_alignment(
                m2_dev_blocks[i],
                m2_dev_cor_blocks[i]
            )
            corrected += 1
        except ValueError:
            errors += 1
        except IndexError:
            errors += 1

In [10]:
corrected

2247

In [11]:
errors

23

In [12]:
# Align with UD trees produced by UDPipe and compute stats

aligned_graphs = []

additions_pos_stats = Counter()
additions_rel_stats = Counter()
deletions_pos_stats = Counter()
deletions_rel_stats = Counter()

rel_mismatches = []
pos_mismatches = []
# A dict of lists per feature per POS
# defaultdict needs a constructor as an argument, hence
# the anonymous function for embedded subtyping.
feature_mismatches = defaultdict(lambda: defaultdict(list))

added_prons = []

# A counter for features only found in parses of words from 
# uncorrected sentences.
disappearing_features = defaultdict(list)

errors = 0

for part in [
    'dev',
    'train',
    'test'
]:
    with open(f'm2_files/RULEC-GEC.{part}.M2', 'r') as inp:
        m2_dev_blocks = inp.read().strip().split('\n\n')
    with open(f'after_udpipe/RULEC-GEC.{part}.rus.conllu') as inp:
        ud_blocks_unc = inp.read().strip().split('\n\n')
    with open(f'after_udpipe/RULEC-GEC.{part}.corrected.rus.conllu') as inp:
        ud_blocks_cor = inp.read().strip().split('\n\n')
        
    assert(len(m2_dev_blocks) == len(ud_blocks_unc) == len(ud_blocks_cor))
    
    
    for i in range(len(m2_dev_blocks)):
        alignment_dict = align_tokens(m2_dev_blocks[i])
        unc_n, unc_g = conll2graph(ud_blocks_unc[i])
        cor_n, cor_g = conll2graph(ud_blocks_cor[i])
        aligned_graphs.append({
            'inp_n': unc_n,
            'inp_g': unc_g,
            'out_n': cor_n,
            'out_g': cor_g,
            'align': alignment_dict
        })
        # We assume that there are no wise-ass node IDs and that
        # they are simply node indices starting from 1.
        for k, v in alignment_dict.items():
            if k == 'X':
                for el in v:
                    idx = str(el['idx']+1)
                    additions_pos_stats[cor_n[idx]['pos']] += 1
                    additions_rel_stats[cor_n[idx]['relation']] += 1
                    
                    # Record added pronouns
                    if cor_n[idx]['pos'] == 'PRON':
                        added_prons.append((extract_sentence(unc_n), extract_sentence(cor_n)))
            elif v == 'X':
                idx = str(k+1)
                deletions_pos_stats[unc_n[idx]['pos']] += 1
                deletions_rel_stats[unc_n[idx]['relation']] += 1
            else:
                idx1 = str(k+1)
                idx2 = str(v['idx']+1)
                pos1 = unc_n[idx1]['pos']
                try:
                    pos2 = cor_n[idx2]['pos']
                except KeyError: # Computed output_token_num not found in the UD tree
                                 # Probably due to some weirdness.
                    errors += 1
                    continue
                if pos1 != pos2:
                    pos_mismatches.append((pos1, pos2))
                else:
                    feats1 = unc_n[idx1]['features']
                    feats2 = cor_n[idx2]['features']
                    for k in feats1:
                        try:
                            if feats1[k] != feats2[k]:
                                feature_mismatches[pos1][k].append((feats1[k], feats2[k]))
                        except KeyError:
                            disappearing_features[pos1].append(
                                f"{unc_n[idx1]['wordform']}, {k}: {feats1[k]} -> {cor_n[idx2]['wordform']}"
                            )
                            continue
                rel1 = unc_n[idx1]['relation']
                rel2 = cor_n[idx2]['relation']
                if rel1 != rel2:
                    rel_mismatches.append((rel1, rel2))

In [13]:
errors

5

In [15]:
len(added_prons)

111

In [12]:
# Makes sense
disappearing_features

defaultdict(list,
            {'VERB': ['зарождающиеся, Animacy: Inan -> зарождающееся',
              'произвело, Gender: Neut -> оказали',
              'говорит, Mood: Ind -> говорить',
              'говорит, Number: Sing -> говорить',
              'говорит, Person: 3 -> говорить',
              'говорит, Tense: Pres -> говорить',
              'появлявший, Gender: Masc -> проявлявшиеся',
              'написано, Gender: Neut -> написаны',
              'остается, Person: 3 -> остановилось',
              'начинается, Person: 3 -> началась',
              'желаемой, Gender: Fem -> желаемых',
              'учатся, Mood: Ind -> учиться',
              'учатся, Number: Plur -> учиться',
              'учатся, Person: 3 -> учиться',
              'учатся, Tense: Pres -> учиться',
              'было, Gender: Neut -> были',
              'грозил, Gender: Masc -> грозили',
              'позвонит, Mood: Ind -> позвонить',
              'позвонит, Number: Sing -> позвонить',
           

In [14]:
# Save the full report as JSON
report = {}
report['additions_pos_stats'] = additions_pos_stats
report['additions_rel_stats'] = additions_rel_stats
report['deletions_pos_stats'] = deletions_pos_stats
report['deletions_rel_stats'] = deletions_rel_stats
report['pos_mismatches'] = pos_mismatches
report['rel_mismatches'] = rel_mismatches
report['feature_mismatches'] = feature_mismatches

with open('report.json', 'w') as out:
    json.dump(report, out, indent=2, ensure_ascii=False)

In [67]:
# Save feature-mismatch matrices
f = open('feature_mismatches.txt', 'w')
for pos in feature_mismatches:
    print(pos, file=f)
    for feature in feature_mismatches[pos]:
        inp = []
        out = []
        print(f'\t{feature}', file=f)
        for pair in feature_mismatches[pos][feature]:
            a, b = pair
            inp.append(a)
            out.append(b)
        df_tmp = pd.DataFrame({
            'inp': inp,
            'out': out
        })
        cm = pd.crosstab(df_tmp['inp'], df_tmp['out'])
        print(cm.to_string(), file=f)
        print('', file=f)
f.close()

In [30]:
def query_feature_mismatch(aligned_graphs,
                           pos,
                           feature,
                           val1='any',
                           val2='any'):
    errors = 0
    for ags in aligned_graphs:
        al_dict = ags['align']
        if no_corrections(al_dict):
            continue
        for k, v in al_dict.items():
            if k == 'X' or v == 'X':
                continue
            i = str(k+1)
            j = str(v['idx']+1)
            try:
                inp_n = ags['inp_n'][i]
                inp_feats = inp_n['features']
                out_n = ags['out_n'][j]
                out_feats = out_n['features']
                f = feature
                if f in inp_feats \
                and f in out_feats \
                and inp_feats[f] != out_feats[f]:
                    if (inp_feats[f] == val1 or val1 == 'any') \
                    and (out_feats[f] == val2 or val2 == 'any'):
                        print(
                            f'{inp_n["wordform"]},{f}:{inp_feats[f]} -> {out_n["wordform"]},{f}:{out_feats[f]}'
                        )
                        keys = sorted(ags['inp_n'])
                        print(' '.join(ags['inp_n'][k]['wordform'] for k in keys))
                        keys = sorted(ags['out_n'])
                        print(' '.join(ags['out_n'][k]['wordform'] for k in keys))
                        print()
            except KeyError:
                errors += 1
                continue
    return errors

In [34]:
query_feature_mismatch(aligned_graphs,
                      'VERB',
                      'Voice',
                      'Act',
                      'Pass')

придумав,Voice:Act -> придуманный,Voice:Pass
Его " , придумав Ред Уассенич в 2000-ом году в Аустине базировали , Техасе , по той же причене . на модели лозунга " Keep Austin Weird
Он " , придуманный Ред Уассенич в 2000-ом году в Аустине основан , штат Техас по той же причине . на модели лозунга " Keep Austin Weird

загрязнают,Voice:Act -> загрязняются,Voice:Pass
Загрязняются океаны нефтом и контейнером с радиоактивными отходами . моря промышленным и радиоактивным мусором , и загрязнают
Загрязняются океаны нефтью и контейнером с радиоактивными отходами . моря промышленным и радиоактивным мусором , и загрязняются

тратившими,Voice:Act -> потраченными,Voice:Pass
И хорошие корпорации и негосударственные организации . много раз с избирательными долларами , тратившими в
И корпорации и негосударственные организации . много раз избирательными долларами , потраченными в хорошие

принимает,Voice:Act -> принимаются,Voice:Pass
Когда что принимает часто по-своему ( просвещённый абсолютизм ) , обычн

5