In [1]:
import re
import networkx as nx
from networkx.readwrite import json_graph
from fourlang.text_to_4lang import TextTo4lang
from graphviz import Source
from scripts.parse_data import read
from scripts.similarity import Similarity
from tqdm import tqdm

In [2]:
# Load the dataset through the project's `read` helper ("en" presumably
# selects the English data; `graded=False` is passed through unchanged).
data_frame = read("en", graded=False)
# Similarity scorer shared by the baseline cells that follow.
similarity = Similarity()

### Simple monolingual dictionary-based baseline using bag-of-words

In [12]:
# Bag-of-words baseline: look up a definition for the premise and for the
# hypothesis, score the pair with `similarity.asim_jac_words`, and predict 1
# whenever that score is positive (0 otherwise).
# NOTE: `text_to_4lang` is created in a later cell of this notebook (In [3]);
# that cell has to be executed before this one on a fresh kernel.
preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # TODO: decide whether the definitions should be expanded.
    premise_definition = text_to_4lang.get_definition(premise)
    hypothesis_definition = text_to_4lang.get_definition(hypothesis)
    # No definition available: fall back to the original text itself.
    if premise_definition is None:
        premise_definition = premise
    if hypothesis_definition is None:
        hypothesis_definition = hypothesis

    pred = similarity.asim_jac_words(premise_definition, hypothesis_definition)
    # Binarise the score: any positive value counts as a positive prediction.
    preds.append(1 if pred > 0 else 0)


  0%|          | 0/316 [00:00<?, ?it/s][A
  1%|          | 2/316 [00:00<00:16, 19.58it/s][A
  1%|▏         | 4/316 [00:00<00:16, 19.48it/s][A
  2%|▏         | 7/316 [00:00<00:14, 21.35it/s][A
  3%|▎         | 10/316 [00:00<00:13, 23.00it/s][A
  4%|▍         | 13/316 [00:00<00:13, 23.02it/s][A
  5%|▌         | 16/316 [00:00<00:12, 23.50it/s][A
  6%|▌         | 19/316 [00:00<00:12, 24.47it/s][A
  7%|▋         | 23/316 [00:00<00:11, 26.14it/s][A
  8%|▊         | 26/316 [00:01<00:10, 26.92it/s][A
  9%|▉         | 29/316 [00:01<00:12, 23.41it/s][A
 10%|█         | 32/316 [00:01<00:11, 24.55it/s][A
 11%|█         | 35/316 [00:01<00:11, 25.18it/s][A
 12%|█▏        | 38/316 [00:01<00:10, 25.96it/s][A
 13%|█▎        | 41/316 [00:01<00:10, 26.71it/s][A
 14%|█▍        | 44/316 [00:01<00:10, 26.10it/s][A
 15%|█▍        | 47/316 [00:01<00:10, 26.45it/s][A
 16%|█▌        | 50/316 [00:02<00:11, 23.62it/s][A
 17%|█▋        | 53/316 [00:02<00:11, 22.27it/s][A
 18%|█▊        | 56/316

### Simple monolingual dictionary-based baseline using 4lang expand

In [3]:
# TextTo4lang instance (lang="en") used by the other cells for definition
# lookup (`get_definition`) and text processing (`process_text`).
# NOTE(review): the bag-of-words cell earlier in the notebook also calls
# `text_to_4lang`, so this cell must run first on a fresh kernel; consider
# moving it next to the other setup code.
text_to_4lang = TextTo4lang(lang="en")

In [6]:
# 4lang graph baseline: process premise and hypothesis with
# `text_to_4lang.process_text`, score the two results with
# `similarity.asim_jac_nodes`, and predict 1 whenever that score is positive
# (0 otherwise).
preds = []

for i in tqdm(range(len(data_frame))):
    premise = data_frame.premise[i]
    hypothesis = data_frame.hypothesis[i]

    # The second positional argument is presumably the `expand` flag —
    # TODO confirm against TextTo4lang.process_text and decide whether
    # expansion is wanted here.
    graph_premise = text_to_4lang.process_text(premise, True)
    graph_hypothesis = text_to_4lang.process_text(hypothesis, True)

    pred = similarity.asim_jac_nodes(graph_premise, graph_hypothesis)
    # Binarise the score: any positive value counts as a positive prediction.
    preds.append(1 if pred > 0 else 0)

100%|██████████| 316/316 [06:55<00:00,  1.34s/it]


In [7]:
# Write one line per example to "result_binary" in the form
# "<premise> <hypothesis> <prediction>".
# The file is only written, never read back, so plain "w" mode is enough;
# the explicit encoding keeps the output independent of the platform locale.
with open("result_binary", "w", encoding="utf-8") as f:
    for i, pred in enumerate(preds):
        premise = data_frame.premise[i]
        hypothesis = data_frame.hypothesis[i]
        f.write(f"{premise} {hypothesis} {pred}\n")

In [37]:
# Spot check: process the single word "enemy" with expand=True and keep the
# result for inspection.
g = text_to_4lang.process_text("enemy", expand=True)

In [38]:
# Show the nodes of the result built for "enemy".
g.get_nodes()

['enemy', 'relate', 'of']

In [8]:
# Spot check: display the definition returned for "dog".
text_to_4lang.get_definition("dog")

'A mammal, Canis lupus familiaris, that has been domesticated for thousands of years, of highly variable appearance due to human breeding.'