In [None]:
from fourlang.text_to_4lang import TextTo4lang
from fourlang.lexicon import Lexicon
from graphviz import Source
from scripts.parse_data import read_sherliic, build_graph
from scripts.similarity import Similarity

from tqdm import tqdm
# Container for binary predictions; later cells re-create this list before filling it.
preds = []
# 4lang pipeline object used below to build definition graphs from text / dependency parses.
text_to_4lang = TextTo4lang(lang="en")
# Load the dev CSV together with the relation index; keep_context=True is passed to the reader.
# NOTE(review): the exact columns returned are defined in scripts.parse_data — confirm there.
data = read_sherliic("data/dev.csv", ud_path="data/relation_index.tsv", keep_context=True)
# Frame derived from `data`; the cells below read its "premise", "hypothesis" and "score" columns.
data_frame = build_graph(data)

# NOTE(review): `lexicon` is not referenced anywhere else in the visible part of the notebook.
lexicon = Lexicon(lang="en")

# NOTE(review): `similarity` is not referenced anywhere else in the visible part of the notebook.
similarity = Similarity(with_embedding=False)

In [None]:
# Join the left argument, the `premise` column and the right argument into one
# space-separated string per row and store it as a new `premise_text` column.
data['premise_text'] = data["prem_argleft"] + " " + data["premise"] + " " + data["prem_argright"]

In [None]:
# Join the left argument, the `hypothesis` column and the right argument into one
# space-separated string per row and store it as a new `hyp_text` column.
data['hyp_text'] = data["hypo_argleft"] + " " + data["hypothesis"] + " " + data["hypo_argright"]

In [None]:
# Display the frame produced by build_graph (rendered as the cell output).
data_frame

In [None]:
# Demo: build a 4lang graph directly from a raw sentence (method="expand", depth=0)
# and render its DOT source with graphviz.
premise = text_to_4lang.process_text("An attempt to cause damage, injury to, or death of opponent or enemy.", method="expand", depth=0)
dot_graph_premise = premise.to_dot()
# Last expression of the cell, so the rendered graph is shown as the cell output.
Source(dot_graph_premise)

In [None]:
import math

def asim_jac_edges(graph_premise, graph_hypothesis):
    """
    Asymmetric Jaccard similarity over the edges of two definition graphs.

    Each edge is reduced to a ``(source, target, color)`` triple whose node
    names have been passed through ``clear_node``. Hypothesis edges touching a
    node named "A" or "B" are left out of the comparison.

    :param graph_premise: the definition graph of the premise (its ``.G`` is read)
    :param graph_hypothesis: the definition graph of the hypothesis (its ``.G`` is read)
    :return: ``0`` when no retained hypothesis edge also occurs in the premise,
        otherwise the number of shared edges divided by the number of retained
        hypothesis edges
    """
    def edge_triples(graph):
        # Normalise every edge to a hashable (source, target, color) triple.
        return {
            (clear_node(source), clear_node(target), attrs['color'])
            for (source, target, attrs) in graph.G.edges(data=True)
        }

    placeholders = ("A", "B")
    premise_edges = edge_triples(graph_premise)
    hypothesis_edges = {
        edge
        for edge in edge_triples(graph_hypothesis)
        if edge[0] not in placeholders and edge[1] not in placeholders
    }

    shared = hypothesis_edges & premise_edges
    if len(hypothesis_edges) == 0 or not shared:
        return 0

    # Normalisations tried earlier and left disabled by the author:
    # len(shared) / sqrt(len(hypothesis_edges)) and the raw count len(shared).
    return float(len(shared)) / len(hypothesis_edges)

In [None]:
def asim_jac_nodes(graph_premise, graph_hypothesis):
    """
    Asymmetric Jaccard similarity between the nodes of the definition graphs

    Node names are normalised with ``clear_node`` first. Hypothesis nodes named
    "a" or "b" are excluded from the comparison.

    :param graph_premise: the definition graph of the premise
    :param graph_hypothesis: the definition graph of the hypothesis
    :return: the ratio of overlapping nodes per the length of the hypothesis
        definition; ``0`` when there is no overlap (including the case where no
        hypothesis node is left after filtering)
    """
    # NOTE(review): the edge variant filters upper-case "A"/"B" while this one
    # filters lower-case "a"/"b" — confirm which spelling the graphs really use.
    placeholders = ("a", "b")

    premise_nodes = {clear_node(node) for node in graph_premise.G.nodes}
    hypothesis_nodes = {
        name
        for name in (clear_node(node) for node in graph_hypothesis.G.nodes)
        if name not in placeholders
    }

    shared = hypothesis_nodes & premise_nodes
    # An empty hypothesis set always yields an empty intersection, so this
    # single check also protects the division below.
    if not shared:
        return 0
    return float(len(shared)) / len(hypothesis_nodes)

In [None]:
import re
def clear_node(node):
    """
    Strip the 4lang id parts from a node name.

    Every underscore that is immediately followed by one or more ASCII digits
    is removed together with those digits, wherever it occurs in the text.

    :param node: the text to clear
    :return: the cleared text
    """
    id_pattern = re.compile(r'_[0-9][0-9]*')
    return id_pattern.sub('', node)

In [None]:
# Score every premise/hypothesis pair with the asymmetric edge similarity.
#   guesses - raw similarity value per row
#   preds   - binary decision per row (1 when the similarity reaches 0.1)
preds = []
guesses = []
for i in tqdm(range(len(data_frame))):
    premise = data_frame["premise"][i]
    hypothesis = data_frame["hypothesis"][i]
    # The premise graph is expanded one level deeper (depth=2) than the hypothesis graph (depth=1).
    graph_premise = text_to_4lang.process_deps(premise, method="expand", depth=2, blacklist=["in", "on", "of"], filt=False, black_or_white="")
    graph_hypothesis = text_to_4lang.process_deps(hypothesis, method="expand", depth=1, blacklist=["in", "on", "of"], filt=False, black_or_white="")
    pred = asim_jac_edges(graph_premise, graph_hypothesis)
    guesses.append(pred)
    # 0.1 is the decision threshold on the edge-overlap ratio.
    preds.append(1 if pred >= 0.1 else 0)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as pr
# Per-class precision / recall / F-score of `preds` against the gold `score` column.
# Index 1 picks the second label in sorted order (label 1 when the labels are 0/1).
bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), preds)

print("Precision: ", bPrecis[1], sep="")
print("Recall: ", bRecall[1], sep="")
print("Fscore: ", bFscore[1], sep="")

In [None]:
# Look up the definition the 4lang pipeline holds for "overtake" (shown as the cell output).
text_to_4lang.get_definition("overtake")

In [None]:
# `guesses` is a plain Python list, which has no .mean() method, so the
# average is computed explicitly (0.0 when the list is empty).
sum(guesses) / len(guesses) if guesses else 0.0

In [None]:
gold = data_frame.score.tolist()

# Print the row position of every false positive: predicted 1 while the gold label is 0.
for row_pos, gold_label in enumerate(gold):
    if not (preds[row_pos] == 1 and gold_label == 0):
        continue
    print(row_pos)

In [None]:
# Inspect the raw record at position 74 of `data` (shown as the cell output).
data.iloc[74]

In [None]:
# Rebuild both graphs for row 74 for manual inspection.
# NOTE(review): the settings differ from the batch scoring loop — the premise uses depth=3
# (the loop uses depth=2) and the hypothesis passes black_or_white="black" without a blacklist;
# neither call passes filt=False. Confirm this is intentional.
graph_premise = text_to_4lang.process_deps(data_frame.iloc[74].premise, method="expand", depth=3, blacklist=["in", "on", "of"])
graph_hypothesis = text_to_4lang.process_deps(data_frame.iloc[74].hypothesis, method="expand", depth=1, black_or_white="black")

In [None]:
# Render the premise graph: convert it to DOT source and let graphviz display it.
dot_graph_premise = graph_premise.to_dot()
Source(dot_graph_premise)

In [None]:
# Render the hypothesis graph. It is stored under its own name so the premise
# DOT source held in `dot_graph_premise` is not overwritten.
dot_graph_hypothesis = graph_hypothesis.to_dot()
Source(dot_graph_hypothesis)

In [None]:
# Edge similarity for the two graphs currently held in graph_premise / graph_hypothesis.
asim_jac_edges(graph_premise, graph_hypothesis)

In [None]:
# Look up the definition the 4lang pipeline holds for "score" (shown as the cell output).
text_to_4lang.get_definition("score")

In [None]:
# NOTE(review): bare tuple literal with no effect beyond being echoed as the cell output;
# presumably row positions noted during error analysis — confirm or remove.
67,68,72,75,76,95

In [None]:
import numpy
print("Printing float range with numpy.arange()")

print("Example one")
# numpy.arange excludes its stop value, so 1.0 is appended by hand afterwards.
# NOTE(review): a later cell rebuilds `thresholds` over 0..10; on a top-to-bottom
# run that later list is the one the sweep uses.
thresholds = list(numpy.arange(0, 1, 0.05))
thresholds.append(1.0)
thresholds

In [None]:
import numpy
print("Printing float range with numpy.arange()")

print("Example one")
# Thresholds from 0 up to (but not including) 10 in steps of 0.5.
# NOTE(review): this replaces any `thresholds` list built by an earlier cell.
thresholds = list(numpy.arange(0, 10, 0.5))

In [None]:
# Sweep the decision threshold over `thresholds` and record, for the positive class,
# precision, recall and F-score, plus the number of positive predictions (yield).
precisions = []
recals = []
f1_scores = []
yields = []

# The gold labels do not change between thresholds, so build the list once.
gold_labels = data_frame.score.tolist()

for thresh in thresholds:
    # Use a sweep-local list instead of the global `preds`, so the predictions
    # stored there by the scoring cell are not overwritten by the last threshold.
    thresh_preds = [1 if float(score) >= thresh else 0 for score in guesses]
    p = pr(gold_labels, thresh_preds)
    # p is (precision, recall, fscore, support); index 1 selects the second label.
    precisions.append(p[0][1])
    recals.append(p[1][1])
    f1_scores.append(p[2][1])
    yields.append(thresh_preds.count(1))


In [None]:
from matplotlib import pyplot as plt
import matplotlib

# rcParams are read when text artists are created, so the font size has to be
# set before the figure is built; setting it afterwards only affects later figures.
matplotlib.rcParams.update({'font.size': 18})

# Plot precision, recall and F-score of the positive class against the threshold.
fig = plt.figure(figsize=(10, 5), dpi=100)
plt.plot(thresholds,precisions,label='precision',linewidth=3)
plt.plot(thresholds,recals,label='recal',linewidth=3)
plt.plot(thresholds,f1_scores,label='f1_score',linewidth=3)

# NOTE(review): these ticks assume thresholds in [0, 1]; confirm they match the
# `thresholds` list that is actually in scope when this cell runs.
plt.xticks([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
plt.yticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
#plt.xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

plt.legend()

plt.grid()
plt.xlabel('Threshold')
plt.ylabel('Prediction')
plt.title("2-1 expand n/N nodes")
# Author's observations on the resulting figure:
# - pretty good values were achieved, with ~0.62 f1_score and accuracy;
# - where there is some edge similarity, raising the threshold does not change the result.
plt.show()

In [None]:
from matplotlib import pyplot as plt
import matplotlib

# rcParams are read when text artists are created, so the font size has to be
# set before the figure is built; setting it afterwards only affects later figures.
matplotlib.rcParams.update({'font.size': 18})

# Plot the number of positive predictions (yield) against the threshold.
fig = plt.figure(figsize=(10, 5), dpi=100)
plt.plot(thresholds,yields,label='yield',linewidth=3)

#plt.xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
#plt.xticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
plt.yticks([50, 300, 1000])
plt.xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

plt.legend()

plt.grid()
plt.xlabel('Threshold')
plt.ylabel('Prediction')
plt.title("4-2 expand n/N yields")
# NOTE(review): the remarks originally left here (~0,62 f1_score and accuracy; raising the
# threshold not changing the result once there is some edge similarity) are word-for-word
# the same as in the precision/recall plot cell and do not describe a yield curve —
# confirm whether they still apply.
plt.show()

In [None]:
# Number of rows whose gold `score` equals 1 (shown as the cell output).
data_frame.score.tolist().count(1)

In [None]:
# Read binary predictions from the tab-separated file "state":
# the third column is mapped to 1 when it is exactly "True", otherwise to 0.
dist_scores = []
# The file is only read, so plain "r" is enough ("r+" would also demand write access).
with open("state", "r") as f:
    # Skip the first line (presumably a header — confirm); the default avoids
    # StopIteration on an empty file.
    next(f, None)
    for line in f:
        row_text = line.strip("\n")
        if not row_text:
            # An empty line (e.g. a trailing newline at end of file) has no third column.
            continue
        line = row_text.split("\t")
        pred = line[2]
        dist_scores.append(1 if pred=="True" else 0)

In [None]:
# Per-class precision / recall / F-score of the predictions loaded into `dist_scores`.
# Index 1 picks the second label in sorted order (label 1 when the labels are 0/1).
bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), dist_scores)

print("Precision: ", bPrecis[1], sep="")
print("Recall: ", bRecall[1], sep="")
print("Fscore: ", bFscore[1], sep="")

In [None]:
# Combine the two predictors with a logical OR: a row is positive when either
# `preds` or `dist_scores` marks it as 1.
# NOTE(review): `preds` is whatever the most recently executed cell left under that
# name — verify it holds the intended predictions. `guesses` is also reused here for
# binary labels, replacing the raw similarity scores it held before.
guesses = []
for i in tqdm(range(len(data_frame))):
    guesses.append(1 if preds[i] == 1 or dist_scores[i] == 1 else 0)

In [None]:
# Per-class precision / recall / F-score of the combined labels held in `guesses`.
# Index 1 picks the second label in sorted order (label 1 when the labels are 0/1).
bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), guesses)

print("Precision: ", bPrecis[1], sep="")
print("Recall: ", bRecall[1], sep="")
print("Fscore: ", bFscore[1], sep="")

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
# Print the word attached to every "root" entry in the first element of row 2's premise.
# NOTE(review): entries look like (relation, head, (word, ...)) dependency triples — confirm.
first_premise_parse = data_frame.iloc[2].premise[0]
for dep in first_premise_parse:
    if dep[0] != "root":
        continue
    print(dep[2][0])

In [None]:
def _root_word(dependencies):
    """Return ``entry[2][0]`` of the last entry whose first field is "root", or None if there is none."""
    word = None
    for dep in dependencies:
        if dep[0] == "root":
            word = dep[2][0]
    return word


def _synset_lemma_names(synsets):
    """Collect the lemma names of every synset in ``synsets`` into one set."""
    return {lemma.name() for synset in synsets for lemma in synset.lemmas()}


# WordNet baseline: predict 1 when any lemma of any synset of the hypothesis root
# is also a lemma of some (transitive) hypernym of any synset of the premise root.
preds = []
for j in tqdm(range(len(data_frame))):
    row = data_frame.iloc[j]

    # Resolve the root words afresh for every row so that a row without a
    # "root" entry cannot silently reuse the previous row's word.
    premise = _root_word(row.premise[0])
    hypothesis = _root_word(row.hypothesis[0])

    if premise is None or hypothesis is None:
        # No root word to look up, so there is no evidence for entailment.
        preds.append(0)
        continue

    # Transitive hypernyms of every premise synset.
    hypernym_synsets = set()
    for premise_syn in wn.synsets(premise):
        hypernym_synsets.update(premise_syn.closure(lambda s: s.hypernyms()))

    hyper_premise_names = _synset_lemma_names(hypernym_synsets)
    hyp_syn_names = _synset_lemma_names(wn.synsets(hypothesis))

    preds.append(1 if hyp_syn_names & hyper_premise_names else 0)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as pr
# Per-class precision / recall / F-score of the current `preds` against the gold `score` column.
# Index 1 picks the second label in sorted order (label 1 when the labels are 0/1).
bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), preds)

print("Precision: ", bPrecis[1], sep="")
print("Recall: ", bRecall[1], sep="")
print("Fscore: ", bFscore[1], sep="")

In [None]:
from nltk.corpus import stopwords as nltk_stopwords
