### Functions for processing the glove file

In [1]:
import sys

class InvalidBracketsException(Exception):
    """Raised when a token glues a square bracket to other characters."""

class InvalidNumberingException(Exception):
    """Raised when bracketed chunks are not numbered 0, 1, 2, ... in order."""

class InvalidAlignException(Exception):
    """Raised when an alignment refers to a chunk index missing from a sentence."""

def process_sen(sen):
    """Parse a bracket-annotated sentence into plain tokens and indexed chunks.

    The sentence is split on whitespace.  A chunk is written as
    ``[ tok tok ... ] N`` where ``N`` is the chunk's integer index; chunks
    must be numbered consecutively starting from 0.

    Args:
        sen: the annotated sentence as a single string.

    Returns:
        A list whose items are either a plain token (``str``) or a tuple
        ``(index, tokens)`` describing one bracketed chunk.

    Raises:
        InvalidBracketsException: a token mixes a bracket with other
            characters (raised by ``validate_brackets``), or the sentence
            ends inside a chunk or before the chunk's index was read.
        InvalidNumberingException: a chunk index is not an integer or is
            not the next expected number.
    """
    toks = []
    state = "normal"
    counter = 0  # index the next chunk is required to carry
    actual_np = []
    for tok in sen.split():
        # Every token is checked, whatever the parser state.
        validate_brackets(tok)

        if state == "normal":
            if tok == "[":
                state = "inside"
                actual_np = []
            else:
                toks.append(tok)
        elif state == "inside":
            if tok == "]":
                state = "after"
            else:
                actual_np.append(tok)
        else:
            # state == "after": the token right after "]" is the chunk index.
            try:
                index = int(tok)
            except ValueError as err:
                # Report a non-numeric index as a numbering problem so that
                # callers handling numbering errors see it too.
                raise InvalidNumberingException(
                    "chunk index is not an integer: %s" % tok) from err
            if index != counter:
                raise InvalidNumberingException(
                    "expected chunk index %d, got %d" % (counter, index))
            counter += 1
            toks.append((index, actual_np))
            state = "normal"

    if state != "normal":
        # The sentence ended inside "[ ..." or right after "]": the pending
        # chunk would otherwise be dropped without any notice.
        raise InvalidBracketsException("unterminated chunk at end of sentence")

    return toks
    
def validate_brackets(t):
    """Check that a bracket only ever appears as a token of its own.

    Args:
        t: one whitespace-separated token.

    Raises:
        InvalidBracketsException: ``t`` contains ``[`` or ``]`` and is longer
            than one character; the token is written to stderr first.
    """
    contains_bracket = "[" in t or "]" in t
    if contains_bracket and len(t) > 1:
        sys.stderr.write("token: %s\n" % t)
        raise InvalidBracketsException
    

def extract_np_tok_indices(sen):
    """Map every chunk index to the token positions the chunk occupies.

    Args:
        sen: a parsed sentence: a sequence of plain tokens and
            ``(index, tokens)`` tuples.

    Returns:
        A dict ``{chunk_index: [position, ...]}`` where positions count all
        tokens of the sentence from 0, chunk tokens included.
    """
    indices = {}
    position = 0
    for item in sen:
        if not isinstance(item, tuple):
            # A plain token takes exactly one position.
            position += 1
            continue
        chunk_index, chunk_tokens = item[0], item[1]
        width = len(chunk_tokens)
        indices[chunk_index] = list(range(position, position + width))
        position += width
    return indices

# U+2015 HORIZONTAL BAR: accepted as an alternative to "-" between the two
# sides of an alignment.
c = u'\u2015'
def process_aligns(als):
    """Split an alignment line into ``(en, hu)`` string pairs.

    Args:
        als: whitespace-separated alignments, each written as ``EN-HU`` or
            with U+2015 in place of the hyphen.

    Returns:
        A list of ``(en, hu)`` tuples holding the text before the first
        separator and the text between the first and second separator.

    Raises:
        Exception: an alignment contains neither separator; the whole line
            is written to stderr first.
    """
    aligns = []
    for align in als.split():
        separator = c if c in align else "-"
        parts = align.split(separator)
        if len(parts) < 2:
            sys.stderr.write("%s\n" % als)
            raise Exception("Malformed aligned file")
        aligns.append((parts[0], parts[1]))
    return aligns
  
def validate_aligns(sen):
    """Check that every alignment points at chunks present in both sentences.

    Args:
        sen: a record with keys ``"en_sen"`` and ``"hu_sen"`` (parsed
            sentences) and ``"aligns"`` (pairs of index strings).  Leading
            and trailing ``s``/``b`` characters of an index are ignored.

    Raises:
        InvalidAlignException: an alignment names an index that is not a
            chunk index of the corresponding sentence; the offending pair is
            written to stderr first.
    """
    en_sen = sen["en_sen"]
    hu_sen = sen["hu_sen"]
    aligns = sen["aligns"]

    en_np_indices = {item[0] for item in en_sen if isinstance(item, tuple)}
    hu_np_indices = {item[0] for item in hu_sen if isinstance(item, tuple)}

    for align in aligns:
        en_i = int(align[0].strip("sb"))
        hu_i = int(align[1].strip("sb"))
        if en_i in en_np_indices and hu_i in hu_np_indices:
            continue
        sys.stderr.write("%d-%d\n" % (en_i, hu_i))
        raise InvalidAlignException

### Reading in the file, storing the sentences in a list of dictionaries

In [2]:
# Template of one record; a fresh copy is made for every sentence.
empty_sentence = {
    'id': None,
    'en_sen': None,
    'hu_sen': None,
    'aligns': None
}
sentences = []

# The file is read with a small state machine.  A record is laid out as:
#   id / English sentence / blank / Hungarian sentence / blank /
#   alignments / blank / blank / blank
actual_sentence = dict(empty_sentence)
state = "empty"
# The corpus holds Hungarian text and the U+2015 separator, so the encoding
# is stated explicitly instead of depending on the locale.
with open("/home/adaamko/data/1500-test.txt", encoding="utf-8") as runga_input_file:
    for line in runga_input_file:
        stripped = line.strip()
        if state == "empty":
            try:
                actual_sentence["id"] = int(stripped)
            except ValueError:
                # reached end of file or malformed input
                continue
            state = "got_id"
        elif state == "got_id":
            try:
                actual_sentence["en_sen"] = process_sen(stripped)
            except InvalidBracketsException as err:
                raise InvalidBracketsException("Invalid English bracketing in sentence: %d\n" % actual_sentence["id"]) from err
            except InvalidNumberingException as err:
                raise InvalidNumberingException("Invalid English np numbering in sentence: %d\n" % actual_sentence["id"]) from err
            except Exception as err:
                # `except Exception` (not a bare except) so that Ctrl-C is
                # not turned into a parse error.
                raise Exception("Unknown error in english sentence: %d\n" % actual_sentence["id"]) from err
            state = "got_en"
        elif state == "got_en":
            if stripped != "":
                print(line)
                raise Exception("MyOwn")
            state = "wait_for_hu"
        elif state == "wait_for_hu":
            try:
                actual_sentence["hu_sen"] = process_sen(stripped)
            except InvalidBracketsException as err:
                raise InvalidBracketsException("Invalid Hungarian bracketing in sentence: %d\n" % actual_sentence["id"]) from err
            except InvalidNumberingException as err:
                raise InvalidNumberingException("Invalid Hungarian np numbering in sentence: %d\n" % actual_sentence["id"]) from err
            except Exception as err:
                # Mirrors the English branch so both report the sentence id.
                raise Exception("Unknown error in hungarian sentence: %d\n" % actual_sentence["id"]) from err
            state = "got_hu"
        elif state == "got_hu":
            # Explicit check instead of `assert`, which `python -O` removes.
            if stripped != "":
                sys.stderr.write("%d\n" % actual_sentence["id"])
                raise Exception("Malformed input file")
            state = "wait_for_aligns"
        elif state == "wait_for_aligns":
            actual_sentence["aligns"] = process_aligns(stripped)
            try:
                validate_aligns(actual_sentence)
            except InvalidAlignException as err:
                raise InvalidAlignException("Invalid np alignment in sentence: %d\n" % actual_sentence["id"]) from err
            state = "got_align"
        elif state == "got_align":
            if stripped != "":
                sys.stderr.write("%d\n" % actual_sentence["id"])
                raise Exception("Malformed input file")
            state = "wait_for_last_but_on_line"
        elif state == "wait_for_last_but_on_line":
            if stripped != "":
                sys.stderr.write("%d\n" % actual_sentence["id"])
                raise Exception("Malformed input file")
            state = "wait_for_last_line"
        elif state == "wait_for_last_line":
            # The record is complete either way: store it and start a new one.
            finished_id = actual_sentence["id"]
            sentences.append(actual_sentence)
            actual_sentence = dict(empty_sentence)
            state = "empty"
            if stripped != "":
                sys.stderr.write("Missing last empty line after sentence: %d\n" % finished_id)
                # The separator is missing, so this line already belongs to
                # the next record.  Treat it as that record's id rather than
                # discarding it, which would lose the whole next record.
                try:
                    actual_sentence["id"] = int(stripped)
                except ValueError:
                    pass
                else:
                    state = "got_id"

# A final record whose trailing blank lines were cut off is still complete
# once its alignments have been read and validated, so keep it.
if state in ("got_align", "wait_for_last_but_on_line", "wait_for_last_line"):
    sentences.append(actual_sentence)

### Reading in an English-Hungarian dictionary

In [3]:
from collections import defaultdict
# English headword -> list of Hungarian translations.  Every line of the
# dictionary file is split on "@": field 0 is used as the headword and
# field 2 as the translation.
dictionary = defaultdict(list)
count = 0
with open("/home/adaamko/data/hokoto", errors="replace") as f:
    for line in f:
        fields = line.strip().split("@")
        # Echo the first 33 entries as a quick look at the file.
        if count < 33:
            print(fields)
        count += 1
        if len(fields) < 3:
            # A blank or truncated line has no translation field; report it
            # and carry on rather than aborting the load with an IndexError.
            sys.stderr.write("Skipping malformed dictionary line %d: %r\n" % (count, line))
            continue
        dictionary[fields[0]].append(fields[2])

["'sblood", 'O', 'a_kutya_teremte1sit']
["'sdeath", 'O', 'az_ista1llo1ja1t!']
["'shun", 'O', 'attention!']
["'shun", 'O', 'vigya1zz!']
["'tis", 'N', 'it_is']
["'tween-decks", 'N', 'fede1lko2z']
['-featured', 'A', '-arcu1']
['-featured', 'A', '-vona1su1']
['-fold', 'D', '-szeres(en)']
['-fold', 'D', '-szoros(an)']
['-haired', 'A', '-haju1']
['-haired', 'A', '-szo3ru3']
['-lived', 'A', '-e1letu3']
['-necked', 'N', 'cso1kolo1dza1s']
['-necked', 'N', 'o2lelkeze1s']
['-necked', 'N', 'oszlopnyak']
['-nosed', 'A', 'orru1']
['-oared', 'A', 'evezo3s']
['-ology', 'N', '-tudoma1ny']
['-paced', 'A', '-ja1ra1su1']
['-paced', 'A', '-le1ptu3']
['-roomed', 'A', '-szoba1s']
['-seater', 'N', '-u2le1su3']
['-sided', 'A', '-oldalu1']
['-sidedness', 'N', '-oldalu1sa1g']
['-sighted', 'A', '-la1ta1su1']
['-sighted', 'A', '-la1to1']
['-skulled', 'A', 'koponya1ju1']
['-sleeved', 'A', '-ujjas']
['-sleeved', 'A', '-ujju1']
['-tongued', 'A', '-hangu1']
['-tongued', 'A', '-nyelvu3']
['-tongued', 'A', '-szavu1']


#### Replacing the ASCII accent codes with UTF-8 characters

In [4]:
# Not referenced by the cells visible here.
dictionary_filtered = defaultdict(list)
# ASCII codes used in the dictionary file, in replacement order; "_" comes last.
old_char = "a1 e1 u1 i1 o1 A1 E1 U1 I1 O1 o2 u2 O2 U2 o3 u3 O3 U3 _".split()
# Replacement for the code at the same position in old_char (the last one is a space).
new_char = list("áéúíóÁÉÚÍÓöüÖÜőűŐŰ ")

In [5]:
# Rewrite every translation in place, applying the old_char -> new_char
# replacements one after another in list order.
for translations in dictionary.values():
    for position, entry in enumerate(translations):
        for code, accented in zip(old_char, new_char):
            entry = entry.replace(code, accented)
        translations[position] = entry

### Now we can start building the baseline method

In [123]:
from collections import defaultdict
import pprint
def _translation_stems(translation, lem_hu):
    """Return the stems ``lem_hu`` yields for one dictionary translation."""
    parts = translation.split()
    if len(parts) > 1 and parts[1].startswith("("):
        # "word (gloss ...": only the head word is stemmed.
        candidates = [parts[0]]
    elif parts and parts[0].startswith("(") and ")" in translation:
        # "(gloss) word": stem what follows the first closing parenthesis.
        candidates = [translation.split(")")[1].strip()]
    else:
        # Everything else, including an entry with an unclosed "(", is
        # stemmed word by word.
        candidates = parts
    return [analysis[0] for word in candidates for analysis in lem_hu.stem(word)]


def compute_scores(sen, lem_en, lem_hu, dic):
    """Score every English chunk against every Hungarian chunk.

    An English lemma counts as matched when at least one stem of its
    dictionary translations equals a lower-cased stem of the Hungarian
    chunk.  The score of a chunk pair is the number of matched English
    lemmas divided by the length of the longer of the two chunks.

    Args:
        sen: record with ``'en_sen'`` and ``'hu_sen'``: sequences of plain
            tokens and ``(index, tokens)`` chunk tuples.
        lem_en: callable taking a word and returning a sequence whose first
            item has a ``lemma_`` attribute.
        lem_hu: object whose ``stem(word)`` returns a sequence of analyses;
            the first element of an analysis is used as the stem.
        dic: mapping from an English lemma to a list of Hungarian
            translations.  It is only read, never modified.

    Returns:
        ``scores`` with ``scores[en_index][k]`` being the score of English
        chunk ``en_index`` against the k-th Hungarian chunk in sentence
        order, or ``None`` when some Hungarian chunk token has no stem.
        English chunk indices are used as list positions, so they are
        expected to be 0..n-1.
    """
    en_nps = {}
    for item in sen['en_sen']:
        if isinstance(item, tuple):
            lemmas = []
            for word in item[1]:
                lemma = lem_en(word)[0].lemma_
                # "-PRON-" is a placeholder lemma; fall back to the
                # lower-cased surface form.
                lemmas.append(word.lower() if lemma == "-PRON-" else lemma)
            en_nps[item[0]] = lemmas

    hu_nps = {}
    for item in sen['hu_sen']:
        if isinstance(item, tuple):
            lemmas = []
            for word in item[1]:
                try:
                    lemmas.append(lem_hu.stem(word)[0][0])
                except IndexError:
                    print("indexerror")
                    return None
            hu_nps[item[0]] = lemmas

    scores = [[] for _ in en_nps]

    # Debug output: the lemmatised chunks and the dictionary entries found.
    print(en_nps)
    print(hu_nps)
    dic_elements = defaultdict(list)
    for lemmas in en_nps.values():
        for word in lemmas:
            translations = dic.get(word)
            if translations:
                dic_elements[word].extend(translations)
    pprint.PrettyPrinter(indent=4).pprint(dic_elements)

    hu_lowered = {
        hu_np: [lemma.lower() for lemma in lemmas]
        for hu_np, lemmas in hu_nps.items()
    }
    for en_np, en_lemmas in en_nps.items():
        # The stems depend only on the English word, so they are computed
        # once per English chunk rather than once per chunk pair.
        stems_per_word = [
            {
                stem
                for translation in dic.get(word, ())
                for stem in _translation_stems(translation, lem_hu)
            }
            for word in en_lemmas
        ]
        for hu_np in hu_nps:
            hu_lower = hu_lowered[hu_np]
            hu_stems = set(hu_lower)
            matched = sum(1 for stems in stems_per_word if hu_stems & stems)
            longest = max(len(hu_lower), len(en_lemmas))
            # Two empty chunks share nothing; avoid dividing by zero.
            score = matched / longest if longest else 0.0
            scores[en_np].append(score)
    return scores

In [84]:
import spacy
import emmorphpy.emmorphpy as emmorph
import itertools
from itertools import permutations, repeat

# Analysers shared by the cells below: nlp_en is passed to compute_scores as
# lem_en (English lemmatiser) and m as lem_hu (Hungarian stemmer).
nlp_en = spacy.load('en')
m = emmorph.EmMorphPy()

In [85]:
# Sentence id -> alignment returned by align_sentence (filled in a later cell).
sentences_align = defaultdict(list)

In [124]:
def align_sentence(sen):
    """Find the one-to-one chunk alignment with the highest total score.

    Every injective assignment between the shorter and the longer side is
    enumerated, so the cost grows factorially with the number of chunks.
    Ties are resolved in favour of the first assignment enumerated.

    Args:
        sen: record with ``'en_sen'`` and ``'hu_sen'`` as expected by
            ``compute_scores``.

    Returns:
        A list of ``(en_index, hu_index)`` integer pairs, one per chunk of
        the shorter side; ``[]`` when the sentence has no English chunks.

    Raises:
        ValueError: ``compute_scores`` returned ``None`` for this sentence.
    """
    scores = compute_scores(sen, nlp_en, m, dictionary)
    if scores is None:
        # Fail with a clear message rather than a TypeError from len(None).
        raise ValueError("sentence could not be scored")
    if not scores:
        # No English chunks: nothing to align (scores[0] would not exist).
        return []

    en_np_score = list(range(len(scores)))
    hu_np_score = list(range(len(scores[0])))
    print(en_np_score)
    print(hu_np_score)

    if len(hu_np_score) >= len(en_np_score):
        # Choose a distinct Hungarian chunk for each English chunk.
        # permutations(..., r) yields each assignment once, in the same
        # order as truncated full permutations would.
        candidates = (
            list(zip(en_np_score, chosen))
            for chosen in permutations(hu_np_score, len(en_np_score))
        )
    else:
        # Fewer Hungarian chunks: choose a distinct English chunk for each.
        candidates = (
            [(en_i, hu_i) for hu_i, en_i in zip(hu_np_score, chosen)]
            for chosen in permutations(en_np_score, len(hu_np_score))
        )

    # max() returns the first maximal candidate.
    return max(
        candidates,
        key=lambda pairs: sum([scores[en_i][hu_i] for en_i, hu_i in pairs]),
    )
    

In [10]:
# Show the first parsed record.
sentences[0]

{'aligns': [('1', '0'), ('2', '1'), ('3', '2')],
 'en_sen': [(0, ['It']),
  'was',
  (1, ['a', 'bright', 'cold', 'day', 'in', 'April']),
  ',',
  'and',
  (2, ['the', 'clocks']),
  'were',
  'striking',
  (3, ['thirteen']),
  '.'],
 'hu_sen': [(0, ['Derült', ',', 'hideg', 'áprilisi', 'nap']),
  'volt',
  ',',
  (1, ['az', 'órák']),
  'éppen',
  (2, ['tizenhármat']),
  'ütöttek',
  '.'],
 'id': 0}

In [108]:
import re
def align_sentence_asmax(sen, threshold=0.6):
    """Align every chunk pair whose score is above a threshold.

    Unlike ``align_sentence`` the result need not be one-to-one: a chunk may
    take part in several pairs, or in none.

    Args:
        sen: record with ``'en_sen'`` and ``'hu_sen'`` as expected by
            ``compute_scores``.
        threshold: a pair is kept when its score is strictly greater than
            this value.  Defaults to 0.6.

    Returns:
        A list of ``(en_index, hu_index)`` pairs with both indices as
        strings, or ``None`` when ``compute_scores`` returned ``None``.
    """
    scores = compute_scores(sen, nlp_en, m, dictionary)
    if scores is None:
        return None
    return [
        (str(en_i), str(hu_j))
        for en_i, row in enumerate(scores)
        for hu_j, value in enumerate(row)
        if float(value) > threshold
    ]

In [128]:
# Try the threshold aligner on one sentence and display the result.
s = align_sentence_asmax(sentences[40])
s

{0: ['he'], 1: ['some', 'childhood', 'memory'], 2: ['that'], 3: ['him'], 4: ['london'], 5: ['this']}
{0: ['maga'], 1: ['valami', 'gyermekkori', 'emlék'], 2: ['amely'], 3: ['neki'], 4: ['ilyen'], 5: ['London']}
defaultdict(<class 'list'>,
            {   'childhood': ['gyerekkor', 'gyermekkor'],
                'he': ['férfi', 'hím (állat)', 'hímnemű személy', 'ő'],
                'him': ['őt'],
                'memory': ['emlékezőtehetség', 'memória'],
                'some': [   'bizonyos',
                            'egész',
                            'egy bizonyos',
                            'egy kevés',
                            'egy kis',
                            'egyes',
                            'igazi',
                            'körülbelül',
                            'komoly',
                            'meglehetősen',
                            'néhány',
                            'némely',
                            'némi',
                            'po

[('2', '4'), ('5', '4')]

In [111]:
# Evaluate the threshold aligner against the gold alignments.
# guesses: one bool per proposed pair, True when the pair is in the gold set.
# senaligns: sentence id -> proposed alignment (None if it could not be scored).
guesses = []
senaligns = {}
for sentence in sentences:
    gold = sentence['aligns']
    gold_filtered = set()
    for goldalign in gold:
        # Raw strings: '\d' in a plain literal is an invalid escape sequence.
        en = re.findall(r'\d+', goldalign[0])
        hu = re.findall(r'\d+', goldalign[1])
        # Normalise through int so that e.g. "01" matches the "1" that the
        # aligner produces with str(index).
        gold_filtered.add((str(int(en[0])), str(int(hu[0]))))
    al = align_sentence_asmax(sentence)
    senaligns[sentence['id']] = al
    if al is not None:
        guesses.extend(pair in gold_filtered for pair in al)

indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror
indexerror


In [112]:
# Share of proposed pairs that are in the gold alignments; defined as 0.0
# when nothing was proposed, to avoid dividing by zero.
score = guesses.count(True) / len(guesses) if guesses else 0.0
# Total number of gold alignments over all sentences.
# NOTE(review): np_len is not used in the cells shown here - presumably
# meant as the denominator of a recall figure; confirm.
np_len = sum(len(sen['aligns']) for sen in sentences)
score

0.6263473053892216

In [113]:
# Number of pairs proposed over all sentences.
len(guesses)

835

In [129]:
# Alignment recorded for the sentence whose id is 40.
senaligns[40]

[]

In [42]:

# Run the exhaustive aligner on at most the first 50 sentences.  A sentence
# that cannot be aligned is reported on stderr and skipped.
for sentence in sentences[:50]:
    try:
        sent = align_sentence(sentence)
    except Exception as err:
        # `except Exception` (not a bare except) so that Ctrl-C can still
        # stop this potentially very long loop.
        sys.stderr.write("Skipping sentence %s: %r\n" % (sentence['id'], err))
        continue
    sentences_align[sentence['id']] = sent

































In [39]:
# Show the alignments collected per sentence id.
sentences_align

defaultdict(list,
            {0: [(1, 0), (0, 1), (3, 2)],
             1: [(0, 0), (1, 1), (4, 2), (3, 3), (2, 4), (5, 5)],
             2: [(0, 0), (1, 1)],
             3: [(0, 0), (1, 1), (2, 2)],
             4: [(0, 1), (1, 0), (2, 4)],
             5: [(0, 0), (1, 1)],
             6: [(0, 1), (1, 0)],
             7: [(0, 0), (1, 1), (4, 2)],
             8: [(0, 0), (1, 1), (2, 4)],
             9: [(1, 0),
              (0, 1),
              (2, 2),
              (3, 3),
              (7, 4),
              (5, 5),
              (6, 6),
              (4, 7)],
             10: [(0, 1), (1, 2), (2, 3), (3, 4)],
             11: [(0, 1), (1, 2), (2, 3), (3, 0)],
             12: [(0, 0), (1, 1), (2, 2)],
             13: [(0, 0), (1, 1), (2, 4), (3, 3), (4, 5), (5, 2)],
             14: [(0, 0), (1, 1), (2, 2), (3, 4)],
             15: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 5)],
             16: [(0, 0), (1, 1)],
             17: [(0, 0), (1, 5), (2, 2), (3, 1), (4, 6)],
        

In [37]:
# Show the alignment most recently assigned to `sent`.
sent

[(1, 0), (0, 1), (3, 2)]