In [1]:
import sys

sys.path.insert(0,'../../src')

from hu_nmt.data_augmentator.dependency_parsers.spacy_dependency_parser import SpacyDependencyParser
from hu_nmt.data_augmentator.dependency_parsers.stanza_dependency_parser import StanzaDependencyParser

from hu_nmt.data_augmentator.wrapper.dependency_graph_wrapper import DependencyGraphWrapper
from hu_nmt.data_augmentator.augmentators.subject_object_augmentator import SubjectObjectAugmentator

from hu_nmt.data_augmentator.graph_mappers.ged import GED
from hu_nmt.data_augmentator.graph_mappers.edge_mapper import EdgeMapper

import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from networkx import optimize_graph_edit_distance, graph_edit_distance,optimize_edit_paths
import matplotlib.pyplot as plt

In [2]:
# Dependency parsers used below through `sentence_to_graph_wrapper`:
# a spaCy-based parser for Hungarian ('hu') and a Stanza-based parser for
# English ('en'). Constructing the Stanza parser loads its English models
# (see the log output of this cell).
hun_dep_parser = SpacyDependencyParser(lang='hu')
eng_dep_parser = StanzaDependencyParser('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-06-02 12:02:12 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-06-02 12:02:12 INFO: Use device: cpu
2022-06-02 12:02:12 INFO: Loading: tokenize
2022-06-02 12:02:12 INFO: Loading: pos
2022-06-02 12:02:12 INFO: Loading: lemma
2022-06-02 12:02:12 INFO: Loading: depparse
2022-06-02 12:02:12 INFO: Done loading processors!


In [3]:
# Graph-comparison helpers, both constructed for the 'hu'/'en' language pair.
# Further down `ged` is used for `get_normalized_distance` and `edge_mapper`
# for `get_jaccard_index` between two dependency graphs.
ged = GED('hu', 'en')
edge_mapper = EdgeMapper('hu', 'en')

In [2]:
hu_sent_file = 'data/cands/cands.hu'
en_sent_file = 'data/cands/cands.en'


def _read_sentences(path):
    """Return the lines of `path` as a list, with trailing whitespace removed."""
    # Decode explicitly instead of relying on the platform default encoding, so
    # accented Hungarian characters load the same way on every machine.
    # NOTE(review): assumes the candidate files are UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip() for line in f]


hu_sents = _read_sentences(hu_sent_file)
en_sents = _read_sentences(en_sent_file)

# The two files are consumed as a parallel corpus: sentence i of one file is
# paired with sentence i of the other. A length mismatch would silently pair
# the wrong sentences (or fail later with an IndexError), so fail early.
if len(hu_sents) != len(en_sents):
    raise ValueError(
        f'Parallel files differ in length: {hu_sent_file} has {len(hu_sents)} lines, '
        f'{en_sent_file} has {len(en_sents)} lines'
    )

In [65]:
def check_if_augmentable(hu_graph1, en_graph1, hu_graph2, en_graph2):
    """Pick the dependency relation on which both sentence pairs can be augmented.

    The relations are tried in the order 'nsubj', then 'obj'. A relation is
    accepted only when `SubjectObjectAugmentator.is_eligible_for_augmentation`
    is truthy for the first (hu, en) graph pair and for the second one.

    Returns:
        'nsubj' or 'obj' for the first accepted relation, otherwise None.
    """
    sentence_pairs = ((hu_graph1, en_graph1), (hu_graph2, en_graph2))
    for relation in ('nsubj', 'obj'):
        # `all` over a generator stops at the first ineligible pair, so the
        # second pair is only examined when the first one qualifies.
        eligible_for_both = all(
            SubjectObjectAugmentator.is_eligible_for_augmentation(hu_graph, en_graph, relation)
            for hu_graph, en_graph in sentence_pairs
        )
        if eligible_for_both:
            return relation
    return None

def try_augmentation(max_attempts=None):
    """Augment one randomly drawn, eligible pair of parallel sentences and print a report.

    Two distinct indices are drawn with `np.random.choice`; the Hungarian and
    English sentences at those indices are parsed, and the draw is repeated
    until `check_if_augmentable` returns a dependency relation for the pair.
    The subtrees for that relation are then swapped between the two sentences
    (separately for Hungarian and English), the swapped sentences are parsed
    again, and each one is compared with the sentence it was derived from.

    Reads the notebook globals `hu_sents`, `en_sents`, `hun_dep_parser`,
    `eng_dep_parser`, `ged` and `edge_mapper`.

    Args:
        max_attempts: optional upper bound on the number of random draws.
            None (the default) keeps searching until an eligible pair is found.

    Raises:
        RuntimeError: if `max_attempts` draws produced no eligible pair.

    Returns:
        None. The report is written with `print`.
    """

    def _similarity(wrapper_a, wrapper_b):
        """Return (normalized GED, Jaccard index) for two graph wrappers."""
        return (
            ged.get_normalized_distance(wrapper_a.graph, wrapper_b.graph),
            edge_mapper.get_jaccard_index(wrapper_a.graph, wrapper_b.graph),
        )

    attempts = 0
    while True:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f'No augmentable sentence pair found in {max_attempts} attempts'
            )
        attempts += 1

        idx1, idx2 = np.random.choice(len(hu_sents), 2, replace=False)

        # Graphs
        hu_sent1, en_sent1 = hu_sents[idx1], en_sents[idx1]
        hu_sent2, en_sent2 = hu_sents[idx2], en_sents[idx2]

        hu_graph1 = hun_dep_parser.sentence_to_graph_wrapper(hu_sent1)
        en_graph1 = eng_dep_parser.sentence_to_graph_wrapper(en_sent1)

        hu_graph2 = hun_dep_parser.sentence_to_graph_wrapper(hu_sent2)
        en_graph2 = eng_dep_parser.sentence_to_graph_wrapper(en_sent2)

        dep = check_if_augmentable(hu_graph1, en_graph1, hu_graph2, en_graph2)
        if dep is not None:
            break

    # Similarity between the two original sentences, per language
    hu_dist, hu_jaccard = _similarity(hu_graph1, hu_graph2)
    en_dist, en_jaccard = _similarity(en_graph1, en_graph2)

    # Augmentation
    # Only `swap_subtrees` is used from this instance; the constructor
    # arguments are placeholders.
    augmentator = SubjectObjectAugmentator(None, None, 0, 0, [], '', '')

    new_hu_sents = augmentator.swap_subtrees(hu_graph1, hu_graph2, dep)
    new_en_sents = augmentator.swap_subtrees(en_graph1, en_graph2, dep)

    # Check augmentation: re-parse every swapped sentence and compare it with
    # the original sentence at the same position.
    aug_hu_graph1 = hun_dep_parser.sentence_to_graph_wrapper(new_hu_sents[0])
    aug_en_graph1 = eng_dep_parser.sentence_to_graph_wrapper(new_en_sents[0])

    aug_hu_graph2 = hun_dep_parser.sentence_to_graph_wrapper(new_hu_sents[1])
    aug_en_graph2 = eng_dep_parser.sentence_to_graph_wrapper(new_en_sents[1])

    aug_hu1_dist, aug_hu1_jaccard = _similarity(hu_graph1, aug_hu_graph1)
    aug_en1_dist, aug_en1_jaccard = _similarity(en_graph1, aug_en_graph1)
    aug_hu2_dist, aug_hu2_jaccard = _similarity(hu_graph2, aug_hu_graph2)
    aug_en2_dist, aug_en2_jaccard = _similarity(en_graph2, aug_en_graph2)

    # Printing
    print('\n-----------Original-----------\n')
    # The English labels previously lacked the ': ' separator ("En-1The ...").
    print(f'Hu-1: {hu_sent1}\nEn-1: {en_sent1}\n')
    print(f'Hu-2: {hu_sent2}\nEn-2: {en_sent2}')

    print('\n-----------Similarity-----------\n')
    print(f'Hu norm ged: {hu_dist}')
    print(f'En norm ged: {en_dist}\n')

    print(f'Hu jaccard: {hu_jaccard}')
    print(f'En jaccard: {en_jaccard}\n')

    print('\n-----------Augmentation-----------\n')
    print(f'{new_hu_sents[0]}\n{new_en_sents[0]}\n')
    print(f'{new_hu_sents[1]}\n{new_en_sents[1]}')

    print('\n-----------Check Augmentation-----------\n')
    print(f'Hu-1\n\tnorm ged: {aug_hu1_dist}\n\tjaccard: {aug_hu1_jaccard}')
    print(f'Hu-2\n\tnorm ged: {aug_hu2_dist}\n\tjaccard: {aug_hu2_jaccard}')
    print(f'En-1\n\tnorm ged: {aug_en1_dist}\n\tjaccard: {aug_en1_jaccard}')
    print(f'En-2\n\tnorm ged: {aug_en2_dist}\n\tjaccard: {aug_en2_jaccard}')
    

In [68]:
# Run a single trial and print its report (original pair, similarity scores,
# swapped sentences, re-parse check). The pair is drawn with np.random and no
# seed is set in the visible cells, so the reported pair varies between runs.
try_augmentation()

AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAA

-----------Original-----------

Hu-1: Az a haderő, amelynek nincsen külső célpontja, mindig a saját népe ellen fordul.
En-1The military force which is denied an external target always turns against its own people.

Hu-2: Az ifjú elnyújtózott a padkán egy vadalmafa alatt.
En-2The youth lay down on the bench, under a wild apple-tree.

-----------Similarity-----------

Hu norm ged: 0.6
En norm ged: 0.6129032258064516

Hu jaccard: 0.47058823529411764
En jaccard: 0.34782608695652173


-----------Augmentation-----------

Az a haderő , amelynek nincsen Az ifjú , mindig a saját népe ellen fordul .
the youth always turns against its own people .

külső célpontja elnyújtózott a padkán egy vadalmafa alatt .
the military force which is denied an external target lay down on the bench , under a wild apple - tree .

-----------Check Augmentation-----------

Hu-1
	norm 