In [None]:
from fourlang.text_to_4lang import TextTo4lang
from fourlang.lexicon import Lexicon
from graphviz import Source
from scripts.parse_data import read, read_sherliic, build_graph
from scripts.similarity import Similarity

from tqdm import tqdm
# Prediction accumulator; starts empty and is not filled by any cell visible here.
preds = []
# Text-to-graph processor configured for English; reused by the cells below.
text_to_4lang = TextTo4lang(lang="en")
# Load data/dev.csv together with data/relation_index.tsv, keeping context
# (presumably the SherLIiC dev split, going by the reader's name — TODO confirm).
sherlic_data = read_sherliic("data/dev.csv", ud_path="data/relation_index.tsv", keep_context=True)
# Result of build_graph over the loaded data; later cells index it with .iloc
# and read its .premise / .hypothesis fields.
sherlic_data_frame = build_graph(sherlic_data)

# Lexicon configured for English (not referenced by the other cells visible here).
lexicon = Lexicon(lang="en")

# Similarity helper constructed with with_embedding=False.
similarity = Similarity(with_embedding=False)

In [None]:
# Load the "en" data with graded=False through the project's read() helper
# (presumably the SemEval dataset, going by the variable name — TODO confirm).
semeval_data_frame = read("en", graded=False)

In [None]:
# Display the loaded data for inspection.
# NOTE(review): this renders the whole object; a .head() preview may be enough.
sherlic_data

In [None]:
import re
def clear_node(node):
    """
    Normalise a node name: lower-case it and drop every ``_<digits>`` id part.
    :param node: the node text to normalise
    :return: the lower-cased text with all underscore-plus-digits runs removed
    """
    lowered = node.lower()
    # An underscore followed by one or more digits is treated as a 4lang id.
    without_ids = re.sub(r'_[0-9][0-9]*', '', lowered)
    return without_ids

In [None]:
import math

def asim_jac_edges(graph_premise, graph_hypothesis):
    """
    Asymmetric Jaccard similarity between the edges of the definition graphs.

    Every edge is reduced to a ``(source, target, color)`` triplet whose node
    names are normalised with ``clear_node``. Hypothesis edges whose source or
    target is the node "a" or "b" are left out of both the overlap and the
    denominator (presumably these are argument placeholders — TODO confirm).

    :param graph_premise: the definition graph of the premise
    :param graph_hypothesis: the definition graph of the hypothesis
    :return: the ratio of overlapping edges per the number of retained
             hypothesis edges, as a float in [0, 1]; 0.0 when the hypothesis
             has no retained edges or nothing overlaps
    """
    excluded_nodes = {"a", "b"}

    def edge_triplets(graph):
        # One (source, target, color) triplet per edge, with ids stripped.
        return {(clear_node(s), clear_node(r), e['color'])
                for (s, r, e) in graph.G.edges(data=True)}

    prem = edge_triplets(graph_premise)
    hyp = {triplet for triplet in edge_triplets(graph_hypothesis)
           if triplet[0] not in excluded_nodes and triplet[1] not in excluded_nodes}

    # An empty hypothesis would divide by zero; always answer with a float so
    # callers never see a mix of int and float scores.
    if not hyp:
        return 0.0
    return float(len(hyp & prem)) / len(hyp)

In [None]:
# Build the graph for the single word "saxophone" with method="expand", depth=3,
# passing "in", "of" and "on" as the blacklist.
premise = text_to_4lang.process_text("saxophone", method="expand", depth=3, blacklist=["in", "of", "on"])
# NOTE(review): filter_graph("part") presumably prunes the graph with respect to
# the "part" node — confirm its exact semantics in fourlang.
premise.filter_graph("part")
# Convert to DOT and render inline with graphviz.Source.
dot_graph_premise = premise.to_dot()
Source(dot_graph_premise)

In [None]:
# Look up the definition of "mole" for manual inspection.
text_to_4lang.get_definition("mole")

In [None]:
# Inspect the entry at position 105 of the loaded data.
sherlic_data.iloc[105]

In [None]:
# Row positions noted for manual inspection (67 and 105 are used with .iloc in
# neighbouring cells); as code this is just a tuple expression that gets displayed.
67,68,72,75,76,95,105

In [None]:
# Build depth-1 expanded graphs for the premise and hypothesis of row 67.
# NOTE(review): only the premise call passes a blacklist ("in", "on", "of");
# the hypothesis call does not — confirm that this asymmetry is intended.
graph_premise = text_to_4lang.process_deps(sherlic_data_frame.iloc[67].premise, method="expand", depth=1, blacklist=["in", "on", "of"])
graph_hypothesis = text_to_4lang.process_deps(sherlic_data_frame.iloc[67].hypothesis, method="expand", depth=1)

In [None]:
# Render the premise graph inline via its DOT representation.
dot_graph_premise = graph_premise.to_dot()
Source(dot_graph_premise)

In [None]:
# Render the hypothesis graph inline via its DOT representation.
# NOTE(review): the DOT source of the *hypothesis* is stored in
# `dot_graph_premise`, overwriting the premise DOT; the name is misleading.
dot_graph_premise = graph_hypothesis.to_dot()
Source(dot_graph_premise)

In [None]:
# Build and render the depth-1 expanded graph for the single word "educate".
# This rebinds both `premise` and `dot_graph_premise` from the earlier cells.
premise = text_to_4lang.process_text("educate", method="expand", depth=1)
dot_graph_premise = premise.to_dot()
Source(dot_graph_premise)

In [None]:
import stanfordnlp

In [None]:
# Run a sample definition sentence through stanfordnlp and print the
# dependencies of its first sentence.
nlp = stanfordnlp.Pipeline() # This sets up a default neural pipeline in English
doc = nlp("A material that may be used as food.")
doc.sentences[0].print_dependencies()

In [None]:
# Display the parsed document object produced by the pipeline above.
doc