In [None]:
import json
import re

import networkx as nx
from fourlang.text_to_4lang import TextTo4lang
from graphviz import Source
from networkx.readwrite import json_graph
from tqdm import tqdm

from scripts.parse_data import read
from scripts.similarity import Similarity

In [None]:
# Load the dataset through the project's `read` helper ("en", graded=True) and
# create the Similarity object whose scoring methods the cells below call.
# NOTE(review): the cross-lingual cells below branch on "de_"/"it_"/"en_"
# prefixes in premise/hypothesis — confirm that read("en", ...) yields such rows.
data_frame = read("en", graded=True)
similarity = Similarity()

# CROSS LINGUAL

In [None]:
# Set up the German/Italian pair: initialise the cross-lingual embeddings on the
# shared Similarity object and build one TextTo4lang instance per language.
similarity.init_cross_lingual_embeddings("de", "it")
text_to_4lang_de = TextTo4lang(lang="de")
# The Italian instance is given an explicit port; presumably each language talks
# to its own parser service — TODO confirm what `port` refers to.
text_to_4lang_it = TextTo4lang(lang="it", port=5006)

#### Simple cross-lingual baseline using MUSE embeddings

In [None]:
# Baseline: fetch a dictionary definition for each word of a de/it pair and
# score the pair with `similarity.compute_min_distance_scores`. A score above
# 0.2 is recorded as a positive (1) prediction, anything else as 0.
preds = []

for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]

    # `premise_src` is True when the premise starts with the German tag; the
    # hypothesis is then taken to be the Italian word, and vice versa.
    premise_src = premise.startswith("de")
    if premise_src:
        # maxsplit=1 removes only the leading language tag, so a word that
        # itself contains "de_" / "it_" is no longer truncated.
        prem_word = premise.split("de_", 1)[1]
        premise_definition = text_to_4lang_de.get_definition(prem_word)
        hyp_word = hypothesis.split("it_", 1)[1]
        hypothesis_definition = text_to_4lang_it.get_definition(hyp_word)
    else:
        prem_word = premise.split("it_", 1)[1]
        premise_definition = text_to_4lang_it.get_definition(prem_word)
        hyp_word = hypothesis.split("de_", 1)[1]
        hypothesis_definition = text_to_4lang_de.get_definition(hyp_word)

    # Fall back to the bare word when no definition is available.
    if premise_definition is None:
        premise_definition = prem_word
    if hypothesis_definition is None:
        hypothesis_definition = hyp_word

    pred = similarity.compute_min_distance_scores(premise_definition, hypothesis_definition, premise_src)
    preds.append(1 if pred > 0.2 else 0)

#### Simple 4lang-based cross-lingual method using MUSE embeddings

In [None]:
# Graph variant: build a 4lang graph for each word of a de/it pair and score
# the pair with `similarity.muse_min_distance_4lang`. A score above 0.2 is
# recorded as a positive (1) prediction, anything else as 0.
preds = []

for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]

    # `premise_src` is True when the premise starts with the German tag; the
    # hypothesis is then taken to be the Italian word, and vice versa.
    premise_src = premise.startswith("de")
    if premise_src:
        # maxsplit=1 removes only the leading language tag, so a word that
        # itself contains "de_" / "it_" is no longer truncated.
        prem_word = premise.split("de_", 1)[1]
        graph_premise = text_to_4lang_de.process_text(prem_word, True)
        hyp_word = hypothesis.split("it_", 1)[1]
        graph_hypothesis = text_to_4lang_it.process_text(hyp_word, True)
    else:
        prem_word = premise.split("it_", 1)[1]
        graph_premise = text_to_4lang_it.process_text(prem_word, True)
        hyp_word = hypothesis.split("de_", 1)[1]
        graph_hypothesis = text_to_4lang_de.process_text(hyp_word, True)

    # The per-iteration to_dot() calls were dropped: their results were never
    # used inside the loop. Render a graph in a separate cell when needed.
    pred = similarity.muse_min_distance_4lang(graph_premise, graph_hypothesis, premise_src)
    preds.append(1 if pred > 0.2 else 0)

## DICTIONARY-BASED

In [None]:
# English/German pair for the dictionary-based experiment.
# Note that this rebinds `text_to_4lang_de`: the cross-lingual section created it
# without an explicit port, whereas here it is created with port=5006.
text_to_4lang_en = TextTo4lang(lang="en")
text_to_4lang_de = TextTo4lang(lang="de", port=5006)

In [None]:
# Dictionary-based scoring of en/de pairs: fetch a definition for each word and
# score the pair with `similarity.cross_lingual_dictionary_bag`. Any score above
# 0.0 is recorded as a positive (1) prediction, anything else as 0.
preds = []

for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]

    # `premise_src` is True when the premise starts with the English tag; the
    # hypothesis is then taken to be the German word, and vice versa.
    premise_src = premise.startswith("en")
    if premise_src:
        # maxsplit=1 removes only the leading language tag, so a word that
        # itself contains "en_" / "de_" is no longer truncated.
        prem_word = premise.split("en_", 1)[1]
        premise_definition = text_to_4lang_en.get_definition(prem_word)
        hyp_word = hypothesis.split("de_", 1)[1]
        hypothesis_definition = text_to_4lang_de.get_definition(hyp_word)
    else:
        prem_word = premise.split("de_", 1)[1]
        premise_definition = text_to_4lang_de.get_definition(prem_word)
        hyp_word = hypothesis.split("en_", 1)[1]
        hypothesis_definition = text_to_4lang_en.get_definition(hyp_word)

    # Fall back to the bare word when no definition is available.
    if premise_definition is None:
        premise_definition = prem_word
    if hypothesis_definition is None:
        hypothesis_definition = hyp_word

    pred = similarity.cross_lingual_dictionary_bag(premise_definition, hypothesis_definition, premise_src)
    preds.append(1 if pred > 0.0 else 0)

# MONOLINGUAL

### Simple monolingual dictionary-based baseline using bag-of-words

In [None]:
# Monolingual baseline: compare the definitions of premise and hypothesis with
# `similarity.asim_jac_words`; any score above 0 counts as a positive (1).
# Uses `text_to_4lang_en` — the bare name `text_to_4lang` is never defined in
# this notebook and raised NameError on a fresh kernel.
preds = []

for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]

    # TODO: decide whether the definitions should be expanded here.
    premise_definition = text_to_4lang_en.get_definition(premise)
    hypothesis_definition = text_to_4lang_en.get_definition(hypothesis)

    # Fall back to the bare word when no definition is available.
    if premise_definition is None:
        premise_definition = premise
    if hypothesis_definition is None:
        hypothesis_definition = hypothesis

    pred = similarity.asim_jac_words(premise_definition, hypothesis_definition)
    preds.append(1 if pred > 0 else 0)

### Simple monolingual dictionary-based baseline using 4lang expansion

In [None]:
def asim_jac_nodes(graph_premise, graph_hypothesis):
    """Return the share of hypothesis nodes that also occur in the premise.

    Both arguments must expose ``get_nodes()``; the returned node collections
    are compared as sets. The measure is asymmetric: the overlap is divided by
    the number of distinct hypothesis nodes only.

    Returns:
        ``0`` when the graphs share no node (including the case of an empty
        hypothesis), otherwise a float in ``(0, 1]``.
    """
    premise_nodes = set(graph_premise.get_nodes())
    hypothesis_nodes = set(graph_hypothesis.get_nodes())
    shared = premise_nodes.intersection(hypothesis_nodes)
    if shared and hypothesis_nodes:
        return len(shared) / float(len(hypothesis_nodes))
    return 0

In [None]:
# Build example graphs for "shelf" (premise) and "closet" (hypothesis) that the
# following cells inspect by hand.
# NOTE(review): the positional arguments are presumably `expand` and an
# expansion depth (2 for the premise, 1 for the hypothesis) — confirm against
# TextTo4lang.process_text.
text_to_4lang_en = TextTo4lang(lang="en")
premise = text_to_4lang_en.process_text("shelf", True, 2)
hyp = text_to_4lang_en.process_text("closet",True, 1)

In [None]:
# Render the premise graph: the to_dot() output is wrapped in graphviz's Source,
# which is the cell's last expression and therefore its displayed result.
dot_graph_premise = premise.to_dot()
Source(dot_graph_premise)

In [None]:
# Displays the same premise graph again (same expression as the previous cell).
Source(dot_graph_premise)

In [None]:
# Share of the "closet" graph's nodes that also appear in the "shelf" graph.
asim_jac_nodes(premise, hyp)

In [None]:
# Apply filter_graph("part") to the premise graph and score the pair again.
# NOTE(review): the return value is discarded, so filter_graph presumably
# modifies `premise` in place — if so, re-running this cell is not idempotent.
premise.filter_graph("part")
asim_jac_nodes(premise, hyp)

In [None]:
# Spot-check: show what get_definition returns for "bread".
text_to_4lang_en.get_definition("bread")

In [None]:
# Calls load_from_dict() on the parser wrapper; presumably this restores
# previously saved parses — TODO confirm what is loaded and from where.
text_to_4lang_en.parser_wrapper.load_from_dict()

In [None]:
# Trivial baseline: a positive (1) prediction for every row of the data frame.
preds = [1 for _ in tqdm(range(len(data_frame)))]

In [None]:
# Graph-overlap predictions: build graphs for premise and hypothesis, filter the
# premise graph, then score the pair with `asim_jac_nodes`.
#   preds_graded – the raw overlap score per row
#   preds        – 1.00 only when the score is exactly 1.0, otherwise 0.00
preds = []
preds_graded = []
for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]

    # TODO: decide whether expansion should be used here.
    graph_premise = text_to_4lang_en.process_text(premise, True, 3, ["in", "of", "on"])
    graph_hypothesis = text_to_4lang_en.process_text(hypothesis, True, 1, ["in", "of", "on"])

    # Same three filter_graph calls as before, in the same order.
    for filter_name in ("part", "on", "use"):
        graph_premise.filter_graph(filter_name)

    pred = asim_jac_nodes(graph_premise, graph_hypothesis)
    preds_graded.append(pred)
    preds.append(1.00 if pred == 1.0 else 0.00)

In [None]:
# Write one "<premise> <hypothesis> <prediction>" line per row of `preds`.
# Plain write mode is enough (nothing is read back), and the encoding is pinned
# so non-ASCII words are written the same way regardless of the platform locale.
with open("result_graded_en", "w", encoding="utf-8") as f:
    for i, pred in enumerate(preds):
        premise = data_frame.premise[i]
        hypothesis = data_frame.hypothesis[i]
        f.write(f"{premise} {hypothesis} {pred}\n")

In [None]:
# Expanded graph for "husband", kept in `g` for interactive inspection.
# Uses `text_to_4lang_en`: the bare name `text_to_4lang` is never defined in this
# notebook and raised NameError on a fresh kernel.
g = text_to_4lang_en.process_text("husband", expand=True)

In [None]:
# Calls save_dict() on the parser wrapper; presumably this writes the cached
# parses to disk — TODO confirm the target location.
# (The stray `import json` was removed: this cell does not use json.)
text_to_4lang_en.parser_wrapper.save_dict()

In [None]:
# Expanded graph for "illness"; the cell displays its node list.
# Uses `text_to_4lang_en`: the bare name `text_to_4lang` is never defined in this
# notebook and raised NameError on a fresh kernel.
g = text_to_4lang_en.process_text("illness", expand=True)
g.get_nodes()

In [None]:
# Error analysis: print every pair whose gold score is 0 but whose entry in
# `preds` is 1, i.e. the false positives of the most recent prediction run.
for row in tqdm(range(len(data_frame))):
    prem_text = data_frame.premise[row]
    hyp_text = data_frame.hypothesis[row]
    gold = data_frame.score[row]
    if gold != 0:
        continue
    if preds[row] == 1:
        print("premise: " + prem_text + " " + "hyp: " + hyp_text)

In [None]:
# Dump `text_to_4lang_en.lexicon.expanded` to exp.json.
# `json` is imported in the notebook's top import cell rather than here.
with open('exp.json', 'w') as fp:
    json.dump(text_to_4lang_en.lexicon.expanded, fp)