In [1]:
import re
import networkx as nx
from networkx.readwrite import json_graph
from fourlang.text_to_4lang import TextTo4lang
from graphviz import Source
from scripts.parse_data import read
from scripts.similarity import Similarity
from tqdm import tqdm

In [2]:
# Load the evaluation pairs and create the similarity scorer shared by every cell below.
# presumably read("en", "de", graded=False) loads the English-German pairs with
# binary (ungraded) labels -- TODO confirm against scripts.parse_data.read.
# NOTE(review): the recorded progress bars further down report 446, 418 and 316 rows,
# and the cross-lingual cells parse "de_"/"it_" prefixes, so `data_frame` must have been
# reloaded with other arguments between those runs; this cell alone does not reproduce them.
data_frame = read("en", "de", graded=False)
similarity = Similarity()

# CROSS-LINGUAL

In [3]:
# Prepare the German-Italian cross-lingual run: initialise the cross-lingual embeddings
# on the shared `similarity` object and create one TextTo4lang instance per language.
similarity.init_cross_lingual_embeddings("de", "it")
text_to_4lang_de = TextTo4lang(lang="de")
# The Italian instance is given an explicit port (5006); the German one relies on the
# constructor default. presumably each instance talks to its own backend service -- TODO confirm.
text_to_4lang_it = TextTo4lang(lang="it", port=5006)

#### Simple cross-lingual baseline using MUSE embeddings

In [9]:
# Cross-lingual baseline: score each premise/hypothesis pair from the dictionary
# definitions of the two words and binarise the score into `preds`.

# Scores strictly above this value are recorded as 1, everything else as 0.
MUSE_THRESHOLD = 0.2

preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # Items are written as "<lang>_<word>". True when the premise is the German item;
    # the flag is forwarded to the scorer unchanged.
    premise_src = premise.startswith("de_")

    # maxsplit=1 strips only the leading language tag, so a word that itself
    # contains "de_" / "it_" is kept whole instead of being cut at the second match.
    if premise_src:
        prem_word = premise.split("de_", 1)[1]
        premise_definition = text_to_4lang_de.get_definition(prem_word)
        hyp_word = hypothesis.split("it_", 1)[1]
        hypothesis_definition = text_to_4lang_it.get_definition(hyp_word)
    else:
        prem_word = premise.split("it_", 1)[1]
        premise_definition = text_to_4lang_it.get_definition(prem_word)
        hyp_word = hypothesis.split("de_", 1)[1]
        hypothesis_definition = text_to_4lang_de.get_definition(hyp_word)

    # When get_definition returns None, fall back to the bare word.
    if premise_definition is None:
        premise_definition = prem_word
    if hypothesis_definition is None:
        hypothesis_definition = hyp_word

    pred = similarity.compute_min_distance_scores(premise_definition, hypothesis_definition, premise_src)
    preds.append(1 if pred > MUSE_THRESHOLD else 0)

100%|██████████| 446/446 [00:18<00:00, 24.49it/s]


#### Simple 4lang-based cross-lingual baseline using MUSE

In [11]:
# Cross-lingual 4lang variant: build a graph for each word with TextTo4lang.process_text,
# score the two graphs and binarise the score into `preds`.

# Scores strictly above this value are recorded as 1, everything else as 0.
MUSE_THRESHOLD = 0.2

preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # Items are written as "<lang>_<word>". True when the premise is the German item;
    # the flag is forwarded to the scorer unchanged.
    premise_src = premise.startswith("de_")

    # maxsplit=1 strips only the leading language tag, so a word that itself
    # contains "de_" / "it_" is kept whole instead of being cut at the second match.
    # NOTE(review): the second positional argument of process_text is presumably
    # `expand` (another cell calls it as expand=True) -- confirm.
    if premise_src:
        prem_word = premise.split("de_", 1)[1]
        graph_premise = text_to_4lang_de.process_text(prem_word, True)
        hyp_word = hypothesis.split("it_", 1)[1]
        graph_hypothesis = text_to_4lang_it.process_text(hyp_word, True)
    else:
        prem_word = premise.split("it_", 1)[1]
        graph_premise = text_to_4lang_it.process_text(prem_word, True)
        hyp_word = hypothesis.split("de_", 1)[1]
        graph_hypothesis = text_to_4lang_de.process_text(hyp_word, True)

    pred = similarity.muse_min_distance_4lang(graph_premise, graph_hypothesis, premise_src)
    preds.append(1 if pred > MUSE_THRESHOLD else 0)

100%|██████████| 446/446 [07:39<00:00,  1.21s/it]


## DICTIONARY-BASED

In [3]:
# Prepare the English-German dictionary-based run: initialise the dictionaries on the
# shared `similarity` object and create one TextTo4lang instance per language.
similarity.init_dictionaries("en", "de")
text_to_4lang_en = TextTo4lang(lang="en")
# NOTE(review): this rebinds `text_to_4lang_de`, which the cross-lingual section created
# without a port; here German is placed on port 5006 and English uses the default.
# Re-running the cross-lingual cells after this one will therefore use this instance.
text_to_4lang_de = TextTo4lang(lang="de", port=5006)

In [4]:
# Dictionary-based cross-lingual baseline: score each premise/hypothesis pair from the
# dictionary definitions of the two words and binarise the score into `preds`.
preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # Items are written as "<lang>_<word>". True when the premise is the English item;
    # the flag is forwarded to the scorer unchanged.
    premise_src = premise.startswith("en_")

    # maxsplit=1 strips only the leading language tag, so a word that itself
    # contains "en_" / "de_" is kept whole instead of being cut at the second match.
    if premise_src:
        prem_word = premise.split("en_", 1)[1]
        premise_definition = text_to_4lang_en.get_definition(prem_word)
        hyp_word = hypothesis.split("de_", 1)[1]
        hypothesis_definition = text_to_4lang_de.get_definition(hyp_word)
    else:
        prem_word = premise.split("de_", 1)[1]
        premise_definition = text_to_4lang_de.get_definition(prem_word)
        hyp_word = hypothesis.split("en_", 1)[1]
        hypothesis_definition = text_to_4lang_en.get_definition(hyp_word)

    # When get_definition returns None, fall back to the bare word.
    if premise_definition is None:
        premise_definition = prem_word
    if hypothesis_definition is None:
        hypothesis_definition = hyp_word

    pred = similarity.cross_lingual_dictionary_bag(premise_definition, hypothesis_definition, premise_src)
    # Any strictly positive score is recorded as 1.
    preds.append(1 if pred > 0.0 else 0)

100%|██████████| 418/418 [00:17<00:00, 23.39it/s]


# MONOLINGUAL

### Simple monolingual dictionary-based baseline using bag-of-words

In [5]:
# Monolingual bag-of-words baseline: score each premise/hypothesis pair from the
# dictionary definitions of the two items and binarise the score into `preds`.
# NOTE(review): `text_to_4lang` is not created in any visible cell (only the
# language-suffixed instances are), so this cell relies on leftover kernel state.
preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # Open question carried over from the original author: should this use expand?
    premise_definition = text_to_4lang.get_definition(premise)
    hypothesis_definition = text_to_4lang.get_definition(hypothesis)

    # When get_definition returns None, fall back to the raw item text.
    if premise_definition is None:
        premise_definition = premise
    if hypothesis_definition is None:
        hypothesis_definition = hypothesis

    pred = similarity.asim_jac_words(premise_definition, hypothesis_definition)
    # Any strictly positive score is recorded as 1.
    preds.append(1 if pred > 0 else 0)

100%|██████████| 316/316 [00:13<00:00, 23.04it/s]


### Simple monolingual dictionary-based baseline using 4lang expand

In [6]:
# Monolingual 4lang baseline: build a graph for each item with TextTo4lang.process_text,
# score the two graphs with asim_jac_nodes and binarise the score into `preds`.
# NOTE(review): `text_to_4lang` is not created in any visible cell (only the
# language-suffixed instances are), so this cell relies on leftover kernel state.
preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # Open question carried over from the original author: should this use expand?
    # The second positional argument is presumably `expand` -- confirm.
    graph_premise = text_to_4lang.process_text(premise, True)
    graph_hypothesis = text_to_4lang.process_text(hypothesis, True)

    pred = similarity.asim_jac_nodes(graph_premise, graph_hypothesis)
    # Any strictly positive score is recorded as 1.
    preds.append(1 if pred > 0 else 0)

100%|██████████| 316/316 [06:55<00:00,  1.34s/it]


In [5]:
# Write one "<premise> <hypothesis> <prediction>" line per entry of `preds`.
# NOTE(review): `preds` is whatever the most recently executed prediction cell produced,
# and rows are looked up in the current `data_frame` by position; make sure both come
# from the same run before writing.
# The file is only written, so plain "w" is enough; UTF-8 is set explicitly so that
# non-ASCII words are stored identically on every platform.
with open("cross_result_binary_bag", "w", encoding="utf-8") as f:
    for i, pred in enumerate(preds):
        premise = data_frame.premise[i]
        hypothesis = data_frame.hypothesis[i]
        f.write("{} {} {}\n".format(premise, hypothesis, pred))

In [27]:
# Spot check: process the single word "husband" with expand=True and keep the result in `g`.
# NOTE(review): `text_to_4lang` is not created in any visible cell -- relies on kernel state.
g = text_to_4lang.process_text("husband", expand=True)

In [28]:
# Display the nodes of the graph currently bound to `g`.
g.get_nodes()

['husband', 'master', 'house', 'householder', 'head', 'family']

In [22]:
# Spot check: process the single word "illness" with expand=True and display its nodes.
# NOTE(review): `text_to_4lang` is not created in any visible cell -- relies on kernel state.
g = text_to_4lang.process_text("illness", expand=True)
g.get_nodes()

['illness', 'instance', 'disease', 'health', 'poor']

In [5]:
# Spot check: run asim_jac_words on the definitions of "gang" and "group".
# In the recorded run the call printed two token sets and returned 0.2.
# NOTE(review): `text_to_4lang` is not created in any visible cell -- relies on kernel state.
similarity.asim_jac_words(text_to_4lang.get_definition("gang"), text_to_4lang.get_definition("group"))

{'person', '.', 'thing', 'number', 'relation'}
{'.', 'walk', 'proceed', ';'}


0.2