<a href="https://colab.research.google.com/github/carinnech/pydata_bcn_NetworkX/blob/master/MT_biEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install py-babelnet -q
!pip install nltk -q
!pip install datasets -q
!pip install translate -q
!pip install tensorflow tensorflow_hub tensorflow_text -q
!pip install sentencepiece -q
!pip install transformers -q
!pip install sacremoses -q
!pip install bidi -q
!pip install deep_translator -q
!pip install sentence_transformers -q
!pip install unidecode -q

[31mERROR: Could not find a version that satisfies the requirement bidi (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bidi[0m[31m
[0m

In [None]:
# Imports
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
import pickle 
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import torch
from google.colab import drive
import random

# BabelNet
import py_babelnet as pb
from py_babelnet.calls import BabelnetAPI

# Corpus imports
from nltk.corpus import wordnet as wn
from datasets import load_dataset_builder, get_dataset_config_names, load_dataset

# Model imports 
from transformers import MarianTokenizer, MarianMTModel, MBartForConditionalGeneration, MBart50TokenizerFast, GenerationConfig
from sentence_transformers import SentenceTransformer, util

# Access to drive
# Mounts Google Drive at /content/gdrive; the pickle caches used by the classes
# below are read from and written to paths under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')
# NOTE(review): this opens an empty 20x20 inch figure at 500 dpi (the cell output
# shows "<Figure size 10000x10000 with 0 Axes>"); nothing is drawn on it in this
# cell — confirm whether it is needed at all.
plt.figure(figsize=(20, 20), dpi=500)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


<Figure size 10000x10000 with 0 Axes>

<Figure size 10000x10000 with 0 Axes>

In [None]:
# Pick the compute device once. The CPU fallback guarantees `device` is always
# bound; previously it was only assigned when CUDA was available, so any later
# use on a CPU-only runtime raised NameError.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Babelnet Class
class BabelNet:
  """Caching client for the BabelNet HTTP API (via ``py_babelnet``).

  Responses are memoised in ``self.cache`` and persisted as a pickle on the
  mounted Google Drive, so a repeated lookup does not hit the API again.

  Attributes:
    bn: the underlying ``BabelnetAPI`` instance.
    searchLang: language code sent as ``searchLang`` on lemma lookups.
    targetLang: language code sent as ``targetLang``; also used to filter lemmas.
    reqNum: number of API requests issued through the cached accessors.
    cache: dict mapping ``(kind, searchLang, targetLang, key)`` to a response.
      Entries written by earlier versions under ``(__name__, key)`` are ignored:
      that key was shared by senses, synset ids and synset info, so its content
      is ambiguous.
  """

  # Location of the pickled response cache on the mounted Google Drive.
  CACHE_PATH = "/content/gdrive/MyDrive/Carinne-Thesis/babel.pkl"

  # FIXME(security): this key is committed to the notebook and should be rotated
  # and removed. It is kept only as a last-resort fallback so that existing
  # `BabelNet(searchLang, targetLang)` calls keep working.
  _DEFAULT_API_KEY = '57865ce0-e623-49b7-9b5a-8dbd5596095a'

  def __init__(self, searchLang, targetLang, api_key=None):
    """Create the API client and load the persisted cache.

    Args:
      searchLang: language of the lemmas that will be looked up (e.g. 'ES').
      targetLang: language requested for the returned senses (e.g. 'ES').
      api_key: optional BabelNet key. When omitted, the ``BABELNET_API_KEY``
        environment variable is used, then the legacy built-in key.
    """
    import os  # local import: keeps this class self-contained in the notebook

    key = api_key or os.environ.get("BABELNET_API_KEY") or self._DEFAULT_API_KEY

    # Load Babelnet API
    self.bn = BabelnetAPI(key)
    self.searchLang = searchLang
    self.targetLang = targetLang
    self.reqNum = 0

    # Load Cache dictionary
    self.cache = self.__load_cache()

  # ------------------------------------ Cache Methods ------------------------------------ #

  # Load cache from babel.pkl file in drive
  def __load_cache(self) -> dict:
    """Return the persisted cache, creating an empty cache file if none exists.

    NOTE(security): ``pickle.load`` executes code embedded in the file; only
    load a cache file that you wrote yourself.
    """
    try:
      with open(self.CACHE_PATH, "rb") as file:
        return pickle.load(file)
    except FileNotFoundError:
      print("No file babel.pkl. Created file")
      cache = dict()
      self.__save_cache(cache)
      return cache
    except (EOFError, pickle.UnpicklingError) as exp:
      # A truncated/corrupt file must not be silently overwritten at load time
      # (the old bare `except` did that); start empty and leave the file alone
      # until `save()` is called explicitly.
      print("Could not read babel.pkl (", exp, "). Starting with an empty cache")
      return dict()

  # Save updated cache to babel.pkl file in drive
  def __save_cache(self, cache: dict):
    """Pickle ``cache`` to ``CACHE_PATH``, replacing any previous content."""
    with open(self.CACHE_PATH, "wb") as file:
      pickle.dump(cache, file)

  # Finished with babelnet class, save cache to file
  def save(self):
    """Print the number of API calls made and persist the cache to Drive."""
    print("Number of calls: ", self.reqNum)
    self.__save_cache(self.cache)

  @staticmethod
  def _is_list(response):
    """True when ``response`` is a list (the shape the list endpoints are read as)."""
    return isinstance(response, list)

  @staticmethod
  def _is_synset(response):
    """True when ``response`` is a dict carrying the "senses" key read below."""
    return isinstance(response, dict) and "senses" in response

  def _cached(self, kind, key, fetch, is_valid):
    """Return the cached response for ``(kind, key)`` or fetch and remember it.

    The cache key includes the method kind and both languages, so different
    endpoints or language pairs can never return each other's data. A response
    that fails ``is_valid`` is returned to the caller but not stored.
    NOTE(review): presumably the API answers with a message dict on quota or
    key errors — confirm; not storing such answers keeps the cache clean.
    """
    cache_key = (kind, self.searchLang, self.targetLang, key)
    if cache_key in self.cache:
      return self.cache[cache_key]

    response = fetch()

    # Add number of calls and add to cache dictionary
    self.reqNum += 1
    if is_valid(response):
      self.cache[cache_key] = response

    return response

  # ---------------------------------------- Word Senses Methods ---------------------------------------- #

  # Get senses for word
  def get_senses_of_word(self, word):
    """Return a new list with the senses BabelNet reports for ``word``."""
    senses = self._cached(
        "senses", word,
        lambda: self.bn.get_senses(lemma = word, searchLang = self.searchLang, targetLang = self.targetLang),
        self._is_list)
    return list(senses)

  # Get senses for word excluding position of word
  def get_senses_of_word_without_pos(self, word, pos):
    """Return the senses of ``word`` whose part of speech differs from ``pos``.

    Senses are indexed as dicts, matching ``get_bn_lemmas_of_synset``
    (assumes the API returns decoded JSON — TODO confirm).
    """
    senses = self.get_senses_of_word(word)
    return [sense for sense in senses if sense["properties"]["pos"] != pos]

  # Get number of senses for word
  def get_senses_num(self, word):
    """Return how many senses ``word`` has."""
    return len(self.get_senses_of_word(word))

  # ---------------------------------------- Synsets Methods ---------------------------------------- #

  # Get synsets ids of word
  def get_synset_ids_word(self, word):
    """Return the synset-id records (read below via "id"/"pos") for ``word``."""
    return self._cached(
        "synset_ids", word,
        lambda: self.bn.get_synset_ids(lemma = word, searchLang = self.searchLang, targetLang = self.targetLang),
        self._is_list)

  # Get number of babelnet Ids (synsets) for word
  def get_synsets_num(self, word):
    """Return how many synsets ``word`` belongs to."""
    return len(self.get_synset_ids_word(word))

  # Get word synsets Ids and positions of words
  def get_word_bns(self, word):
    """Return ``(ids, id_to_pos)`` for the synsets of ``word``.

    ``ids`` is the list of synset ids in API order; ``id_to_pos`` maps each id
    to its part of speech.
    """
    synsetids = self.get_synset_ids_word(word)
    babelnet_ids = [synset["id"] for synset in synsetids]
    bnid2pos = {synset["id"]: synset["pos"] for synset in synsetids}

    return babelnet_ids, bnid2pos

  # Get specific synset information
  def get_synset_info(self, synset_id):
    """Return the full synset record for ``synset_id`` in ``targetLang``."""
    return self._cached(
        "synset", synset_id,
        lambda: self.bn.get_synset(id = synset_id, targetLang = self.targetLang),
        self._is_synset)

  # Get simple lemmas of (senses) in synset, only verb-noun. Input - synsetId
  def get_bn_lemmas_of_synset(self, synsetid, word):
    """Return the simple lemmas of the VERB/NOUN senses of a synset.

    Only senses in ``self.targetLang`` are kept, and lemmas that contain the
    capitalised ``word`` are dropped. Returns ``[]`` (after printing a notice)
    when the synset record has no usable "senses" list.
    """
    synset = self.get_synset_info(synsetid)

    try:
      word_capital = word[:1].upper() + word[1:] # TODO Should it be removed?
      lemmas = []
      for sense in synset["senses"]:
        props = sense["properties"]
        if (props["pos"] in ('VERB', 'NOUN')
            and word_capital not in props["simpleLemma"]
            # Uses this instance's language, not a global `bn` object.
            and props["language"] == self.targetLang):
          lemmas.append(props["simpleLemma"])
    except (KeyError, TypeError):
      print("Synset ID: ",synsetid , "has no senses for word '", word, "'.")
      lemmas = []

    return lemmas

  # Get all words from all synsets of word (not unique)
  def get_all_synsets_words(self, word):
    """Return the concatenated (non-unique) lemmas of every synset of ``word``."""
    babelnet_ids, _ = self.get_word_bns(word)
    all_words = []
    for synset_id in babelnet_ids:
      all_words += self.get_bn_lemmas_of_synset(synset_id, word)

    return all_words

  # ------------------------------------ EXTRA not in use for now ------------------------------------ #

  def _outgoing_edges(self, synsetid):
    """Return the outgoing edges of a synset, fetched once and then cached.

    The relation getters below all read the same edge list, so sharing one
    cached response avoids a separate API request per relation group.
    """
    return self._cached(
        "edges", synsetid,
        lambda: self.bn.get_outgoing_edges(id=synsetid),
        self._is_list)

  def _related_synsets(self, synsetid, lang, relation_group):
    """Return ``(target, shortName)`` for edges of one relation group.

    An edge is kept when its language is ``lang`` or "MUL". The comparison is
    case-insensitive so that 'es' and 'ES' select the same edges (assumes the
    API reports upper-case codes — TODO confirm).
    """
    wanted = {str(lang).upper(), "MUL"}
    return [(edge["target"], edge["pointer"]["shortName"])
            for edge in self._outgoing_edges(synsetid)
            if str(edge["language"]).upper() in wanted
            and edge["pointer"]["relationGroup"] == relation_group]

  # Get all hypernyms of synset
  # Input - synsetId, lang
  # Output - list
  def get_hypernyms(self, synsetid, lang):
    """Return ``(target, shortName)`` tuples of the HYPERNYM edges of a synset."""
    return self._related_synsets(synsetid, lang, "HYPERNYM")

  # Get all hyponyms of synset
  # Input - synsetId, lang
  # Output - list
  def get_hyponyms(self, synsetid, lang):
    """Return ``(target, shortName)`` tuples of the HYPONYM edges of a synset."""
    return self._related_synsets(synsetid, lang, "HYPONYM")

  # Get all antonym of synset
  # Input - synsetId, lang
  # Output - list
  def get_antonym(self, synsetid, lang):
    """Return ``(target, shortName)`` tuples of the ANTONYM edges of a synset."""
    return self._related_synsets(synsetid, lang, "ANTONYM")

  # Get all other relations of synset
  # Input - synsetId, lang
  # Output - list
  def get_other_relations(self, synsetid, lang):
    """Return ``(target, shortName)`` tuples of the OTHER edges of a synset."""
    return self._related_synsets(synsetid, lang, "OTHER")

  # Get edges of synset
  # Input - synsetId, lang
  # Output - list
  def get_edges_synset(self, synsetid, lang):
    """Return the target ids of all edges whose language is ``lang``.

    Edges are indexed as dicts like in the other relation getters; the former
    attribute access (``edge.target``) does not work on dict records.
    """
    wanted = str(lang).upper()
    return [edge["target"] for edge in self._outgoing_edges(synsetid)
            if str(edge["language"]).upper() == wanted]


In [None]:
class TransModel:
  """Holds a Helsinki-NLP MarianMT model and tokenizer for ``src`` -> ``trg``.

  Attributes:
    model: the ``MarianMTModel`` (unpickled from Drive, or freshly downloaded).
    tokenizer: the matching ``MarianTokenizer`` (always fetched by name).
  """

  def __init__(self, src, trg):
    """Load the translation model/tokenizer pair for the given language codes."""
    self.model, self.tokenizer = self.__load_cache(src, trg)

  def __load_cache(self, src, trg):
    """Return ``(model, tokenizer)`` for ``Helsinki-NLP/opus-mt-{src}-{trg}``.

    The model is unpickled from Google Drive when possible. If the file is
    missing, empty or cannot be unpickled, the model is downloaded and the
    pickle is (re)written. Previously an empty or corrupt file left no model
    loaded and the method failed with IndexError.

    NOTE(security): ``pickle.load`` executes code embedded in the file; only
    load model files that this notebook wrote.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
    model_file_name = f"/content/gdrive/MyDrive/Carinne-Thesis/Helsinki-NLP/opus-mt-{src}-{trg}.pkl"

    try:
      with open(model_file_name, "rb") as openfile:
        print("Found Model")
        # Only the first pickled object is used, as before.
        model = pickle.load(openfile)
    except Exception as exp:
      # Broad on purpose: unpickling can fail in many ways, and every failure
      # has the same recovery path (download again and refresh the file).
      print(exp)
      print(f"Model is downloading")
      model = MarianMTModel.from_pretrained(model_name, output_attentions = True)

      with open(model_file_name, "wb") as file:
        pickle.dump(model, file)

    # The tokenizer is not pickled; it is always resolved through from_pretrained.
    print(f"Tokenizer is downloading")
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    return model, tokenizer


In [None]:
# Set languages of research
# Lower-case codes: TransModel interpolates them into the
# "Helsinki-NLP/opus-mt-{src}-{trg}" model name.
src = "es" #bn.searchLang
trg = "he" #bn.targetLang

# Forward (src -> trg) and backward (trg -> src) translators used for the
# round-trip evaluation below.
marianFront = TransModel(src, trg)
marianBack = TransModel(trg, src)
# Sentence-embedding model used to score the source against its back-translation.
similarity = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

Found Model
Tokenizer is downloading
Found Model
Tokenizer is downloading


In [None]:
# Download configuration from huggingface.co and cache.
# The model names are derived from `src`/`trg` so they always match the loaded
# translators instead of repeating the language pair as literals.
gcFront = GenerationConfig.from_pretrained(f"Helsinki-NLP/opus-mt-{src}-{trg}")
gcBack = GenerationConfig.from_pretrained(f"Helsinki-NLP/opus-mt-{trg}-{src}")

# Documents to evaluate; each document is a list of source-language sentences.
texts = [['Gloria a Ucrania, un saludo patriótico.',
          'Pero con el tiempo y la exposición, Ginsburg dijo que ha desarrollado su tolerancia a la hierba.',
          'La desventaja: en muchas áreas, sigue siendo un mercado de vendedores.',
          'Pie de foto, Homenaje a Flora Tristán en Francia.',
          'De esos casos, 27 la boquilla terminó en la boca de los niños.',
          'Tu inscripción ha sido exitosa.',
          'Guardar tus artículos favoritos.',
          'Powerball histórico: un californiano ganó el premio más grande las loterías en EE.UU.']]


def translate_sentence(translator, sentence, generation_config):
  """Translate one sentence with a ``TransModel`` and return the decoded text.

  Args:
    translator: object exposing ``.tokenizer`` and ``.model`` (a ``TransModel``).
    sentence: the text to translate.
    generation_config: ``GenerationConfig`` passed to ``model.generate``.

  Returns:
    The first decoded sequence, with special tokens stripped.
  """
  batch = translator.tokenizer.encode(sentence, return_tensors = "pt")
  generated_ids = translator.model.generate(batch, output_attentions=True, generation_config=generation_config, return_dict_in_generate=True, max_new_tokens = 512)
  return translator.tokenizer.batch_decode(generated_ids.sequences, skip_special_tokens = True)[0]


# `enumerate` supplies the document index; the former manual counter was never
# incremented, so every document would have been printed as 0.
for i, doc in enumerate(texts):
  print(i)
  if doc is None:
    continue

  for t in doc:

    print("Source: ", t)

    # Forward translate
    heb_sentence = translate_sentence(marianFront, t, gcFront)

    print("Hebrew translation: ", heb_sentence)

    # Backward Translate
    spa_sentence = translate_sentence(marianBack, heb_sentence, gcBack)

    print("Spanish translation: ", spa_sentence)

    # How far is the result from the source using similarity model?
    embeddings1 = similarity.encode(t, convert_to_tensor=True)
    embeddings2 = similarity.encode(spa_sentence, convert_to_tensor=True)

    # Compute cosine-similarities
    score = util.cos_sim(embeddings1, embeddings2)

    print("Score: ", score.item(), "\n")

0
Source:  Gloria a Ucrania, un saludo patriótico.
Hebrew translation:  גלוריה לאוקראינה, ברכה פטריוטית.
Spanish translation:  Gloria a Ucrania, una bendición patriótica.
Score:  0.9529401659965515 

Source:  Pero con el tiempo y la exposición, Ginsburg dijo que ha desarrollado su tolerancia a la hierba.
Hebrew translation:  אבל עם הזמן והחשיפה, גינסבורג אמרה שהיא פיתחה סובלנות לגראס.
Spanish translation:  Pero con el tiempo y la exposición, Ginsburg dijo que había desarrollado tolerancia a la hierba.
Score:  0.9928410649299622 

Source:  La desventaja: en muchas áreas, sigue siendo un mercado de vendedores.
Hebrew translation:  החיסרון: באזורים רבים, זה עדיין שוק של אנשי מכירות.
Spanish translation:  La desventaja es que en muchas zonas, sigue siendo un mercado de vendedores.
Score:  0.8015527129173279 

Source:  Pie de foto, Homenaje a Flora Tristán en Francia.
Hebrew translation:  תמונת כף הרגל, הוקרה לפלורה טריסטן בצרפת.
Spanish translation:  La foto de los pies, un homenaje a Flor

In [None]:
# Shared BabelNet client used by the graph cells below; 'ES' is passed as both
# the search language and the target language.
bn = BabelNet('ES', 'ES')

In [None]:
# Candidate word pairs to connect through BabelNet; only the last assignment is active.
# words = ['inscripción', 'dedicación']
# words = ['áreas','zonas']
words = ['saludo','bendición']

In [None]:
from nltk.stem.snowball import SnowballStemmer

# NOTE(review): the stemmer is configured for English while `words` holds
# Spanish lemmas, and the stemmed strings are later sent to BabelNet as lemmas
# by graph_en_to_es — confirm whether 'spanish', or no stemming at all, was intended.
stemmer = SnowballStemmer(language='english')

# Rebinds `words` to the stemmed forms; the unstemmed list is no longer available afterwards.
words = [stemmer.stem(w) for w in words]

In [None]:
def new_layer(nodes, graph):
  """Attach the BabelNet neighbours of each synset in ``nodes`` to ``graph``.

  For every synset id, its antonym, hyponym, hypernym and "other" relations are
  looked up through the global ``bn`` client. Each related synset is added as a
  coloured node and linked with an edge keyed by relation type
  ('anto', 'hypo', 'hyper', 'other'). Hypernym edges are added as
  (hypernym, synset); all others as (synset, related).

  Args:
    nodes: iterable of synset ids to expand.
    graph: graph object providing ``add_nodes_from`` and ``add_edge``
      (a ``nx.MultiGraph`` in this notebook); it is modified in place.
  """
  # Edge languages are compared against upper-case codes elsewhere in this
  # notebook ('ES', 'MUL'), while `src` is lower case ("es").
  # NOTE(review): presumably BabelNet reports 'ES' — confirm.
  lang = src.upper()

  # (lookup, node colour, edge colour, edge key, edge weight, related node comes first)
  relations = (
    # '#AAC8Ff2' had seven hex digits, which is not a valid colour; presumably
    # '#AAC8FF' was meant.
    (bn.get_antonym, '#AAC8FF', 'blue', 'anto', 3, False),
    (bn.get_hyponyms, '#BEF2AA', 'green', 'hypo', 2, False),
    (bn.get_hypernyms, '#F1F2AA', 'yellow', 'hyper', 1, True),
    (bn.get_other_relations, '#FAC2BE', 'red', 'other', 0.5, False),
  )

  for synset_id in nodes:
    for lookup, node_color, edge_color, edge_key, weight, related_first in relations:
      for target, short_name in lookup(synsetid=synset_id, lang=lang):
        # Skip self references. The old test compared the whole
        # (target, shortName) tuple with the id and so never filtered anything.
        if target == synset_id:
          continue

        # No `data=True` here: add_nodes_from stores extra keyword arguments as
        # node attributes, so it only added a stray `data` attribute.
        graph.add_nodes_from([(target, {'lang': bn.targetLang, 'color': node_color})])

        u, v = (target, synset_id) if related_first else (synset_id, target)
        graph.add_edge(u, v, color=edge_color, key=edge_key, connectionstyle='arc3, rad = 0', weight=weight, rad=0.1, desc=short_name)

In [None]:
# Define graph of connections between words and their synsets
def graph_en_to_es(words) -> nx.MultiGraph:
  """Build the word -> synset graph for ``words`` and expand it by one layer.

  Each word (lower-cased) becomes a grey root node linked by a 'babel' edge to
  every synset id BabelNet returns for it; ``new_layer`` then adds the direct
  relations of each synset. Uses the global ``bn`` client.

  Args:
    words: iterable of lemmas to look up.

  Returns:
    The populated ``nx.MultiGraph``.

  Raises:
    RuntimeError: if the API response for a word is not a list of records.
  """
  graph = nx.MultiGraph()

  # Get translations for each word in list (from babel) and add to graph
  for w in words:
    w = w.lower()
    print(w)
    graph.add_nodes_from([(w, {'lang': bn.searchLang, 'color':'#D3D3D3'})])

    # Use the client's own languages instead of repeating 'ES' literals.
    response = bn.bn.get_synset_ids(lemma = w, searchLang = bn.searchLang, targetLang = bn.targetLang)
    if not isinstance(response, list):
      # Fail with the actual payload rather than an opaque AttributeError when
      # the rows turn out not to be records.
      raise RuntimeError(f"BabelNet returned no synset list for '{w}': {response!r}")

    # Order-preserving de-duplication: iterating a set of strings can change
    # order between kernel sessions, which would reshuffle the graph's nodes.
    synset_ids = list(dict.fromkeys(row.get('id') for row in response))

    for synset_id in synset_ids:
      # No `data=True`: add_nodes_from would store it as a node attribute.
      graph.add_nodes_from([(synset_id, {'lang': bn.targetLang, 'color':'#CBC3E3'})])
      graph.add_edge(w, synset_id, color='gray', key='babel', connectionstyle='arc3, rad = 0', weight = 4, rad=0.1, desc="root")

      new_layer([synset_id], graph)

  return graph

In [None]:
# Build the word/synset graph for the current `words`; this calls the BabelNet API.
main_graph = graph_en_to_es(words)
# In the cells shown here this is only referenced by the commented-out
# layer-expansion loop further down.
main_graph_depth = 2

from networkx.drawing.layout import bipartite_layout
from networkx import bipartite
plt.rcParams["figure.figsize"] = (20,40)

# Print graph
def print_graph(G, save, file):
  """Draw ``G`` with its stored node/edge colours and edge weights.

  The global ``words`` are placed on one side of a bipartite layout and every
  other node on the other side.

  Args:
    G: graph whose nodes carry a 'color' attribute and whose edges carry
      'color' and 'weight' attributes.
    save: when truthy, the figure is also written to ``file`` as PNG.
    file: output path used when ``save`` is truthy.
  """
  # The layout was previously computed but never handed to nx.draw.
  pos = nx.bipartite_layout(G, words, scale=2)

  # Materialise the attribute views as lists for the drawing call.
  colors_edge = list(nx.get_edge_attributes(G,'color').values())
  colors_node = list(nx.get_node_attributes(G,'color').values())
  edge_weights = list(nx.get_edge_attributes(G,'weight').values())

  nx.draw(G, pos = pos, node_color = colors_node, edge_color = colors_edge, node_size=200, width = edge_weights, with_labels = True)

  if save:
    plt.savefig(file, format="PNG")
  plt.show()

# Draw the graph and also save it as main.png.
print_graph(main_graph, True, "main.png")

saludo


AttributeError: ignored

In [None]:
# while not nx.has_path(main_graph, words[0], words[1]):
#   # Get leaves of graph, last layer
#   nodes = set(node for node, distance in nx.shortest_path_length(main_graph, words[0]).items() if distance == main_graph_depth)
#   new_layer(nodes = nodes, graph = main_graph)
#   main_graph_depth += 1
#   pass

In [None]:
# print_graph(main_graph, True, "main.png")

In [None]:
# nx.shortest_path_length(main_graph, words[0]).items()

In [None]:
# Number of edges of the given arc type that touch a node
def num_same_arcs(node, arc_type_r):
  """Count the edges at ``node`` whose first edge key equals ``arc_type_r``.

  Reads the global ``main_graph``. For each edge reported by
  ``main_graph.edges(node)``, only the first key of the edge-data mapping
  between its two endpoints is inspected.
  """
  matches = 0
  for endpoints in main_graph.edges(node):
    keyed_data = main_graph.get_edge_data(endpoints[0], endpoints[1])
    first_key = next(iter(keyed_data.keys()))
    if first_key == arc_type_r:
      matches += 1
  return matches

# Node depth
def calc_node_depth(node):
  """Return the smallest shortest-path length from ``node`` to any root word.

  Reads the globals ``main_graph`` and ``words``; raises ``ValueError`` from
  ``min`` when ``words`` is empty.
  """
  lengths_to_roots = [
    nx.shortest_path_length(main_graph, source = node, target = root)
    for root in words
  ]
  return min(lengths_to_roots)

# Edge weight
def calc_edge_weight(node_start, node_end):
  """Return the weight of the edge between two adjacent nodes of ``main_graph``.

  The edge type is the first key of the edge data:
    * "anto"  -> 2.5
    * "babel" -> 0
    * others  -> mean over both endpoints of ``maxR - (maxR - minR) / k``,
      where ``k`` is that endpoint's number of edges of the same type
      (``maxR = 2``, ``minR = 1``).
  """
  edge_typeR = next(iter(main_graph.get_edge_data(node_start, node_end).keys()))

  # Fixed weights for the two special edge types.
  if edge_typeR == "anto":
    return 2.5
  if edge_typeR == "babel":
    return 0

  maxR, minR = 2, 1

  # How many edges of this type leave each endpoint.
  arcs_at_start = num_same_arcs(node_start, edge_typeR)
  arcs_at_end = num_same_arcs(node_end, edge_typeR)

  # Per-endpoint weight computed from its count of same-type edges.
  weight_start = maxR - ((maxR - minR)/arcs_at_start)
  weight_end = maxR - ((maxR - minR)/arcs_at_end)

  return (weight_start + weight_end) /2.0

# Path weight
def distance(node_start, node_end):
  """Return the summed edge weights along the shortest path between two nodes.

  The path comes from ``nx.shortest_path`` on the global ``main_graph``; each
  consecutive pair of nodes on it is weighted with ``calc_edge_weight``. A path
  with a single node yields 0.
  """
  hops = nx.shortest_path(main_graph,node_start,node_end)

  accumulated = 0
  for hop_from, hop_to in zip(hops, hops[1:]):
    accumulated += calc_edge_weight(hop_from, hop_to)

  return accumulated

In [None]:
# Weighted path distance between the two words under study.
distance(words[0],words[1])

In [None]:
# Lists the nodes whose in-degree is 0.
# NOTE(review): `G` is only a parameter name of print_graph in the cells shown
# here, and the graph built above is an undirected nx.MultiGraph — confirm
# which graph object, and which degree, was meant.
[n for n,d in G.in_degree() if d==0]

In [None]:
# Get shortest path and calculate weight

In [None]:
# graph_add_relation_edges(graph) 00085163v  00088223v
# Scratch: shortest path between two literal node names, printed hop by hop.
# NOTE(review): 'pero' and 'gato' are not among the `words` set earlier in this
# notebook — confirm these nodes exist in main_graph before running.
path = nx.shortest_path(main_graph,'pero','gato')

# Print each consecutive pair of nodes on the path.
for i in range(len(path)-1):
  print(path[i], path[i+1])

In [None]:
# Scratch: number of hops between two specific synset ids in main_graph.
nx.shortest_path_length(main_graph, source='bn:00076248n', target='bn:01686524n')

In [None]:
# Scratch: display the node sequence computed by the shortest-path cell above.
path

In [None]:
# Scratch: edge data between a synset and itself, i.e. whether a self-loop was recorded.
main_graph.get_edge_data('bn:00041739n', 'bn:00041739n')

In [None]:
# bn.bn.get_synset_ids(lemma = 'saludo', searchLang = 'ES', targetLang = 'ES')

In [None]:
# Scratch: short name of the first "OTHER"-group relation of this synset
# (get_other_relations returns (target, shortName) tuples); fails with
# IndexError when the list is empty.
bn.get_other_relations(synsetid='bn:00041739n', lang='ES')[0][1]

In [None]:
# Scratch: raw outgoing-edge response for one synset, called on the underlying
# API client directly (bypasses the BabelNet wrapper's cache and call counter).
bn.bn.get_outgoing_edges(id='bn:00041739n')

In [None]:
# Scratch: raw synset record for bn:00067050n with Spanish as target language (direct API call).
bn.bn.get_synset(id = "bn:00067050n", targetLang = "ES")

In [None]:
# Scratch: print every outgoing edge record of synset bn:00088223v (direct API call).
for edge in bn.bn.get_outgoing_edges(id="bn:00088223v"):
  print(edge)
  

In [None]:
# Scratch: print the relation group of each outgoing edge of synset bn:00041739n.
for row in bn.bn.get_outgoing_edges(id='bn:00041739n'):
  print(row["pointer"]["relationGroup"])

In [None]:
# Scratch: raw synset record for bn:00041739n with Spanish as target language (direct API call).
bn.bn.get_synset(id = 'bn:00041739n', targetLang = 'ES')

In [None]:
# Scratch: print the synset ids returned for the lemma 'salute'.
# NOTE(review): the lemma is searched with searchLang 'ES' — confirm 'salute'
# is meant as a Spanish lemma here.
for row in bn.bn.get_synset_ids(lemma = 'salute', searchLang = 'ES', targetLang = 'ES'):
  print(row.get('id'))