In [2]:
import requests
import json
import matplotlib
import re
import pydot
from graphviz import Digraph
from graphviz import Source
from baseline_utils import process_baseline
from nltk.corpus import stopwords
import pprint
from Levenshtein import distance

In [3]:
import networkx as nx
from networkx.readwrite import json_graph
from collections import defaultdict

In [4]:
sentences = process_baseline("/home/adaamko/data/1984.sen-aligned.np-aligned.gold")
len(sentences)

6567

In [5]:
import spacy
import hu_core_ud_lg

In [6]:
import emmorphpy.emmorphpy as emmorph

In [7]:
lem_hu = emmorph.EmMorphPy()

In [8]:
nlp_hu = hu_core_ud_lg.load()
nlp_en = spacy.load("en_core_web_sm")

In [9]:
import io
import numpy as np
import json
import request

In [10]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [11]:
src_path = '/home/adaamko/data/DMR/wiki.multi.en.vec'
tgt_path = '/home/adaamko/data/DMR/wiki.multi.hu.vec'
nmax = 250000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [12]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))
    words = []
    for i, idx in enumerate(k_best):
        words.append(tgt_id2word[idx])
    return words

In [13]:
src_word2id = {v: k for k, v in src_id2word.items()}
tgt_word2id = {v: k for k, v in tgt_id2word.items()}

def get_distance(src_word, tgt_word, src_emb, tgt_emb):
    src_word_emb = src_emb[src_word2id[src_word]]
    tgt_word_emb = tgt_emb[tgt_word2id[tgt_word]]
    score = (tgt_word_emb / np.linalg.norm(tgt_word_emb)).dot(src_word_emb / np.linalg.norm(src_word_emb))
    return score
    

In [14]:
src_word = 'Winston'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

KeyError: 'Winston'

In [19]:
src_word = "thirteen"
tgt_word = "tizenhárom"

get_distance(src_word, tgt_word, src_embeddings, tgt_embeddings)

0.7415513806514814

In [20]:
def filter_nps(nps, language):
    lemmas = []
    if language == "hu":
        doc = nlp_hu(nps.lower())
        for ent in doc:
            if not ent.is_stop:               
                try:
                    lemmas.append(lem_hu.stem(ent.lemma_)[0][0].lower())
                    lemmas.append(ent.lower_)
                except IndexError:
                    lemmas.append(ent.lower_)
        
    elif language == "en":
        doc = nlp_en(nps.lower())
        
        words = []

        for ent in doc:
            if not ent.is_stop:
                if ent.lemma_ == "-PRON-":
                    words.append(ent.lower_)
                else:
                    words.append(ent.lemma_.lower())
                    words.append(ent.lower_)

        if not words:
            for ent in doc:
                words.append(ent.lower_)

        lemmas += words
        
    return list(set(lemmas))

In [21]:
def return_morph(word):
    s = lem_hu.analyze(word)
    ret_list = []
    for i in s:
        i = i.split("=")
        morpheme = i[1]
        morpheme = morpheme.split("+")
        for m in morpheme:
            mor = m.split("[")[0].strip()
            if len(mor) > 2:
                ret_list.append(m.split("[")[0].strip())
    return ret_list

In [22]:
from collections import Counter

def count_vowels(word):
    c = {v:word.count(v) for v in 'aeuioáéúüűíóöő'}
    count = sum(c.values())
    return count

In [30]:
def compute_min_distance_scores(sen):
    en_nps = {}
    hu_nps = {}
    for s in sen['en_sen']:
        if type(s) == tuple:
            np_to_filter = s[1]
            if not np_to_filter:
                for np in sen['en_sen']:
                    if type(np) == tuple:
                        if np[0] == s[0]:
                            np_to_filter = np[1]
            lemmas = filter_nps(' '.join(np_to_filter), "en")
            en_nps[s[0]] = lemmas
            
    for s in sen['hu_sen']:
        if type(s) == tuple:
            np_to_filter = s[1]
            if not np_to_filter:
                for np in sen['hu_sen']:
                    if type(np) == tuple:
                        if np[0] == s[0]:
                            np_to_filter = np[1]
            lemmas = filter_nps(' '.join(np_to_filter), "hu")
            hu_nps[s[0]] = lemmas
    
    scores = [[] for i in range(len(en_nps))]
    
    for en_np in en_nps:
        for hu_np in hu_nps:
            hu_lower = [s.lower() for s in hu_nps[hu_np]]
            add_morphs = []
            
            #for low in hu_lower:
            #    if count_vowels(low) >= 3:
             #       m = return_morph(low)
                    #add_morphs += m
                    
            #for addit in add_morphs:
             #   if addit not in hu_lower:
                    #hu_lower.append(addit)
                    
            max_score = 0
            for word in en_nps[en_np]:
                w = word.strip("-").lower()
                for hu_word in hu_lower:
                    try:
                        distance = get_distance(w, hu_word, src_embeddings, tgt_embeddings)
                    except KeyError:
                        distance = 0
                    if distance > max_score:
                        max_score = distance
            scores[en_np].append(max_score)
    return scores

In [24]:
def compute_KNN_scores(sen):
    en_nps = {}
    hu_nps = {}
    for s in sen['en_sen']:
        if type(s) == tuple:
            np_to_filter = s[1]
            if not np_to_filter:
                for np in sen['en_sen']:
                    if type(np) == tuple:
                        if np[0] == s[0]:
                            np_to_filter = np[1]
            lemmas = filter_nps(' '.join(np_to_filter), "en")
            en_nps[s[0]] = lemmas
            
    for s in sen['hu_sen']:
        if type(s) == tuple:
            np_to_filter = s[1]
            if not np_to_filter:
                for np in sen['hu_sen']:
                    if type(np) == tuple:
                        if np[0] == s[0]:
                            np_to_filter = np[1]
            lemmas = filter_nps(' '.join(np_to_filter), "hu")
            hu_nps[s[0]] = lemmas
    
            
    scores = [[] for i in range(len(en_nps))]
    
    for en_np in en_nps:
        word_to_embed = defaultdict(list)
        for word in en_nps[en_np]:
            dic_elements = []
            w = word.strip("-").lower()
            try:
                elements = get_nn(w, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)
            except KeyError:
                elements = [w]

            for el in elements:
                 #for i in el.split():
                doc = nlp_hu(el)
                lem = doc[0].lemma_
                dic_elements.append(el)
                dic_elements.append(lem)


            add_morphs_en = []
            for el in dic_elements:
                if count_vowels(el) > 3:
                    m = return_morph(el)
                    add_morphs_en += m
            for addit in add_morphs_en:
                if addit not in dic_elements:
                    dic_elements.append(addit)
            word_to_embed[word] += dic_elements

        for hu_np in hu_nps:
            l = []
            hu_lower = [s.lower() for s in hu_nps[hu_np]]
            add_morphs = []
            
            for low in hu_lower:
                if count_vowels(low) >= 3:
                    m = return_morph(low)
                    add_morphs += m
                    
            for addit in add_morphs:
                if addit not in hu_lower:
                    hu_lower.append(addit)
                                                
            for word in word_to_embed:                       
                inter = []
                for en_word in word_to_embed[word]:
                    for hu_word in hu_lower:
                        dis = distance(en_word, hu_word)
                        
                        if(len(en_word) > 5 or len(hu_word) > 5):
                            if dis < 3:
                                inter.append(True)
                        else:
                            if dis < 1:
                                inter.append(True)
                if len(inter) > 0:
                    l.append(True)
                   
            listmax = max([hu_lower, en_nps[en_np]], key=len)
            if len(l) == 0:
                score = 0
            else:
                score = float(l.count(True)/len(l))
            scores[en_np].append(score)
            
    return scores

In [25]:
def process(sen):
    scores = compute_min_distance_scores(sen)
    if scores is None:
        return None
    aligns = []
    for i in range(len(scores)):
        for j,k in enumerate(scores[i]):
            if float(k) > 0.5:
                aligns.append((str(i), str(j)))
    return aligns

In [26]:
process(sentences[1])

[('0', '0'), ('3', '3'), ('4', '2'), ('5', '5')]

In [None]:
guesses = []
senaligns = {}
for i,sentence in enumerate(sentences):
    print(sentence['id'])
    gold = sentence['aligns']
    gold_filtered = []
    for goldalign in gold:
        en = re.findall('\d+', goldalign[0] )
        hu = re.findall('\d+', goldalign[1] )
        gold_filtered.append((str(en[0]), str(hu[0])))
    al = process(sentence)
    senaligns[sentence['id']] = al
    if al is not None:        
        for i in al:
            if i in gold_filtered:
                guesses.append(True)
            else:
                guesses.append(False)

In [32]:
score = float(guesses.count(True)/len(guesses))
np_len = 0
for sen in sentences:
    np_len += len(sen['aligns'])
print(score)
print(np_len)
print(len(guesses))
recall = (score * len(guesses)) / np_len
f1_score = (2*recall*score)/(recall+score)
print(recall)
print(f1_score)

0.7603278688524591
18848
12200
0.4921477079796265
0.5975264107188868
