In [0]:
import multiprocessing
import random
import time

from lxml.etree import iterparse, tostring

## Download datasets

In [0]:
# Mount google drive
# (Colab-only import; the '/content/gdrive/My Drive/...' paths used by the
# other cells live under this mount point.)
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Download datasets
# !wget 'http://lcl.uniroma1.it/eurosense/data/eurosense.v1.0.high-precision.tar.gz'
# !wget 'http://lcl.uniroma1.it/sew/data/sew_conservative.tar.gz'
# !wget 'http://trainomatic.org/data/train-o-matic-data.zip'

# !cp 'eurosense.v1.0.high-precision.tar.gz' '/content/gdrive/My Drive/eurosense.v1.0.high-precision.tar.gz' 
# !cp 'sew_conservative.tar.gz' '/content/gdrive/My Drive/sew_conservative.tar.gz' 
# !cp 'train-o-matic-data.zip' '/content/gdrive/My Drive/train-o-matic-data.zip'

In [0]:
# Copy file from google drive to local
# Only the SEW archive is active here; the EuroSense and Train-O-Matic lines
# are commented out.
# NOTE(review): the parsing cells read 'EuroSense/...' and
# 'TRAIN-O-MATIC-DATA/...' — re-enable the matching lines before running them,
# unless those directories already exist locally.
# !cp '/content/gdrive/My Drive/eurosense.v1.0.high-precision.tar.gz' 'eurosense.v1.0.high-precision.tar.gz'
!cp '/content/gdrive/My Drive/sew_conservative.tar.gz' 'sew_conservative.tar.gz'
# !cp '/content/gdrive/My Drive/train-o-matic-data.zip' 'train-o-matic-data.zip'

# Unzip datasets
# !tar -xvf '/content/gdrive/My Drive/eurosense.v1.0.high-precision.tar.gz'
!tar -xf 'sew_conservative.tar.gz'
# !unzip -q 'train-o-matic-data.zip'

## Parse Datasets

In [0]:
def get_bn2wn_mapping(path):
  """
  Returns a dictionary with a mapping between
    BabelNet synsets and WordNet synsets

  Each line of the file at `path` is split on whitespace: the first
    field is the BabelNet id, the second the WordNet id. Any further
    field on the line is ignored. Blank lines and lines with fewer
    than two fields are skipped. When a BabelNet id appears on several
    lines, the last line wins.
  """
  bn2wn = dict()

  with open(path) as f:
    for line in f:
      fields = line.split()

      # skip blank / malformed lines instead of failing on unpacking
      if len(fields) < 2:
        continue

      # TODO: check the line with 3 entries
      #   (for now only the first WordNet synset of the line is kept)
      bn, wn = fields[:2]
      bn2wn[bn] = wn

  return bn2wn

# Global BabelNet -> WordNet mapping; parse_eurosense and parse_sew look
# annotations up in it and ignore those whose BabelNet id is missing.
bn2wn = get_bn2wn_mapping('/content/gdrive/My Drive/bn2wn_mapping.txt')

In [0]:
import string
import re

def process_text(s):
  """
  Normalises text: drops ASCII punctuation, squeezes runs of
    spaces into a single space and lowercases the result
  """
  # map every punctuation code point to None so translate() deletes it
  punctuation_table = dict.fromkeys(map(ord, string.punctuation))
  without_punctuation = s.translate(punctuation_table)

  # only literal spaces are squeezed; tabs and newlines are left alone
  squeezed = re.sub(' +', ' ', without_punctuation)

  return squeezed.lower()

### Parse Eurosense

In [0]:
def get_longest_lemma_from_anchor(lemm_anchor, lemmas):
  """
  Picks, among `lemmas`, the longest one that has `lemm_anchor`
    as a substring (high precision specification of Eurosense).
    Ties go to the candidate met first; `max` raises ValueError
    when no lemma contains `lemm_anchor`.
  """
  candidates = [lemma for lemma in lemmas if lemm_anchor in lemma]
  return max(candidates, key=len)

In [0]:
def _annotate_eurosense_sentence(element):
  """
  Turns one fully parsed <sentence> element into a list of lowercase
    tokens in which annotated English anchors are replaced by
    `lemma_synset`. Returns None when the sentence has no English text.

  Reads the global `bn2wn` mapping; annotations whose BabelNet id is
    not in it are ignored.
  """
  # get english text
  eng = element.xpath('text[@lang="en"]')
  if not eng or not eng[0].text:
    return None

  sentence = process_text(eng[0].text)

  # get english annotations
  annotations = element.xpath('annotations/annotation[@lang="en"]')
  anchor2lemma, lemma2synset = dict(), dict()

  # extract lemma_synset pairs
  for child in annotations:
    bn = child.text
    raw_anchor, raw_lemma = child.get('anchor'), child.get('lemma')

    # an annotation lacking either attribute cannot be applied
    if raw_anchor is None or raw_lemma is None:
      continue

    anchor = process_text(raw_anchor.lower())

    if bn in bn2wn and anchor:
      lemma = '_'.join(raw_lemma.split()).lower()
      anchor2lemma[anchor] = lemma
      lemma2synset[lemma] = bn2wn[bn]

  # replace annotated anchors with lemma_synset pair, longest anchors first
  sorted_anchors = sorted(anchor2lemma.keys(), key=len, reverse=True)
  for i, anchor in enumerate(sorted_anchors):
    # check if current anchor was contained in a bigger anchor before
    if anchor not in ' '.join(sorted_anchors[:i]):
      lemm_anchor = anchor2lemma[anchor]
      longest_lemma = get_longest_lemma_from_anchor(lemm_anchor, anchor2lemma.values())
      synset = lemma2synset[longest_lemma]

      old = r'\b{}\b'.format(re.escape(anchor))
      new = '{}_{}'.format(longest_lemma, synset)
      # callable replacement: `new` is inserted literally, so a backslash
      # in a lemma is not interpreted as a regex escape
      sentence = re.sub(old, lambda _match, new=new: new, sentence)

  return sentence.lower().split()


def parse_eurosense(xml_path):
  """
  Parses the EuroSense XML file at `xml_path` and returns a list of
    tokenised English sentences (lists of lowercase tokens) in which
    annotated anchors are replaced by `lemma_synset` tokens.

  Only <sentence> elements carrying an `id` attribute and a non-empty
    English text are used. Relies on the globals `bn2wn`, `process_text`
    and `get_longest_lemma_from_anchor`.
  """
  sentences = []

  # Listen to 'end' events only: text and children of an element are
  # guaranteed to be present only once its end tag has been parsed.
  for _, element in iterparse(xml_path, events=('end',)):
    if element.tag != 'sentence':
      # children must stay intact until their sentence is processed
      continue

    # if valid sentence
    if 'id' in element.attrib:
      tokens = _annotate_eurosense_sentence(element)
      if tokens is not None:
        sentences.append(tokens)

    # release the processed sentence and the siblings that precede it
    element.clear()
    while element.getprevious() is not None:
      del element.getparent()[0]

  return sentences

# Parse the EuroSense corpus and report the elapsed wall-clock time in minutes
xml_path = 'EuroSense/eurosense.v1.0.high-precision.xml'
tik = time.time()
eurosense_sents = parse_eurosense(xml_path)
tok = time.time()
print('Parsing eurosense: {} minutes'.format((tok - tik) / 60))

In [0]:
# Cache the parsed sentences: one sentence per line, tokens joined by spaces
with open('eurosense_parsed_sentences.txt', 'w') as f:
  for sent in eurosense_sents:
    str_sent = ' '.join(sent)
    f.write(str_sent + '\n')
    
# Keep a copy of the cache on Google Drive
!cp 'eurosense_parsed_sentences.txt' '/content/gdrive/My Drive/eurosense_parsed_sentences.txt'

In [0]:
# Fetch the cached parsed sentences from Google Drive
!cp '/content/gdrive/My Drive/eurosense_parsed_sentences.txt' 'eurosense_parsed_sentences.txt'

# Rebuild the list of token lists: each line is split on whitespace
eurosense_sents = []
with open('eurosense_parsed_sentences.txt') as f:
  for line in f:
    eurosense_sents.append(line.strip().split())

### Parse SEW

In [0]:
# Copy a sample XML file from Google Drive
# NOTE(review): 'sample.xml' is not read by any cell visible in this notebook
!cp '/content/gdrive/My Drive/sample.xml' 'sample.xml'

In [0]:
from glob import iglob

def _annotate_sew_article(element):
  """
  Returns the processed text of one fully parsed wikiArticle element with
    the first occurrence of each annotated mention replaced by
    `mention_synset`, or None when the article has no text.

  Reads the global `bn2wn` mapping; annotations whose BabelNet id is
    not in it are ignored.
  """
  texts = element.xpath('text')
  if not texts or not texts[0].text:
    return None

  # article text
  article_text = process_text(texts[0].text)

  # all annotations
  for child in element.xpath('annotations/annotation'):
    bn = child.xpath('babelNetID')
    mention = child.xpath('mention')
    if not bn or not mention or not mention[0].text:
      continue

    bn = bn[0].text
    if bn in bn2wn:
      anchor = process_text(mention[0].text)
      new_anchor = '_'.join(anchor.split())

      # a mention made only of punctuation/spaces leaves nothing to tag;
      # an empty anchor would turn the pattern into '\b\b' and match at
      # the first word boundary of the article
      if not new_anchor:
        continue

      # this replacement technique works 90% of the time
      old = r'\b{}\b'.format(re.escape(anchor))
      new = '{}_{}'.format(new_anchor, bn2wn[bn])
      article_text = re.sub(old, lambda _match, new=new: new,
                            article_text, count=1)

  return article_text


def parse_sew(pattern='sew_conservative/*/*.xml', max_sentences=4_000_000,
              sample_ratio=0.2, seed=None):
  """
  Parses the SEW XML files matching `pattern` and returns a list of
    tokenised sentences in which annotated mentions are replaced by
    `mention_synset` tokens.

  Args:
    pattern: glob pattern of the article files (one article per file
      is used).
    max_sentences: parsing stops at the first file reached with more
      than this many sentences already collected; the index of that
      file is printed.
    sample_ratio: fraction of the lines of each article that is
      randomly kept.
    seed: optional seed making the sampling reproducible; with None
      the module-level `random` state is used.

  Relies on the globals `bn2wn` and `process_text`.
  """
  rng = random.Random(seed) if seed is not None else random
  sentences = []

  for i, xml in enumerate(iglob(pattern)):
    # extract first `max_sentences` sentences
    if len(sentences) > max_sentences:
      print(i)
      break

    # Listen to 'end' events only: text and children of an element are
    # guaranteed to be present only once its end tag has been parsed.
    for _, element in iterparse(xml, events=('end',)):
      if element.tag.lower() != 'wikiarticle':
        continue

      article_text = _annotate_sew_article(element)
      if article_text is not None:
        # randomly pick `sample_ratio` of article sentences
        article_sents = article_text.split('\n')
        n_selected = int(sample_ratio * len(article_sents))

        for s in rng.sample(article_sents, n_selected):
          sentences.append(s.split())

      # only need one article element
      element.clear()
      break

  return sentences

# Parse the SEW corpus and report the elapsed wall-clock time in minutes
tik = time.time()
sew_sents = parse_sew()
tok = time.time()
print('Parsing SEW: {} minutes'.format((tok - tik) / 60))

In [0]:
with open('sew_parsed_sentences', 'w') as f:
  for sent in sew_sents:
    str_sent = ' '.join(sent)
    f.write(str_sent + '\n')
    
!cp 'sew_parsed_sentences.txt' '/content/gdrive/My Drive/sew_parsed_sentences.txt'

In [0]:
# Fetch the cached parsed sentences from Google Drive
!cp '/content/gdrive/My Drive/sew_parsed_sentences.txt' 'sew_parsed_sentences.txt'

# Rebuild the list of token lists: each line is split on whitespace
sew_sents = []
with open('sew_parsed_sentences.txt') as f:
  for line in f:
    sew_sents.append(line.strip().split())

### Parse Train-O-Matic

In [0]:
from glob import iglob

def _trainomatic_corpus_sentences(corpus):
  """
  Returns the tokenised, lowercased sentences of the instances found under
    the first <lexelt> of a fully parsed <corpus> element, with the target
    lemma replaced by `lemma_synset`. Returns [] when there is no <lexelt>.
  """
  child = corpus.xpath('lexelt')
  if not child:
    return []

  # the `item` attribute is split on '.'; the part before the first dot
  # is used as the lemma
  lemma = child[0].get('item').split('.')[0]

  corpus_sentences = []
  for ins in child[0].xpath('instance'):
    answer, instance_context = ins.xpath('answer'), ins.xpath('context')

    if answer and instance_context:
      # the sense id is split on ':'; its second part is used as synset
      wn = ins.xpath('answer/@senseId')[0].split(':')[1]
      pair = '{}_{}'.format(lemma, wn)

      sentence = tostring(instance_context[0], method='text', encoding=str).lower()
      # NOTE(review): str.replace rewrites every substring occurrence of
      # the lemma, including inside longer words — confirm this is intended
      sentence = sentence.replace(lemma, pair)

      corpus_sentences.append(sentence.split())

  return corpus_sentences


def parse_trainomatic(path):
  """
  Parses the Train-O-Matic XML files matching the glob pattern `path`
    and returns a list of tokenised sentences in which the target lemma
    of each file is replaced by `lemma_synset`.

  One <corpus> element is read per file.
  """
  sentences = []

  for xml in iglob(path):
    # Listen to 'end' events only: text and children of an element are
    # guaranteed to be present only once its end tag has been parsed.
    for _, element in iterparse(xml, events=('end',)):
      if element.tag.lower() != 'corpus':
        continue

      sentences.extend(_trainomatic_corpus_sentences(element))

      # only one corpus
      element.clear()
      break

  return sentences

      
# Parse every Train-O-Matic XML file matching the glob pattern
trainomatic_sents = parse_trainomatic('TRAIN-O-MATIC-DATA/EN/EN.500-2.0/*.xml')

In [0]:
# Cache the parsed sentences: one sentence per line, tokens joined by spaces
with open('trainomatic_parsed_sentences.txt', 'w') as f:
  for sent in trainomatic_sents:
    str_sent = ' '.join(sent)
    f.write(str_sent + '\n')
    
# Keep a copy of the cache on Google Drive
!cp 'trainomatic_parsed_sentences.txt' '/content/gdrive/My Drive/trainomatic_parsed_sentences.txt'

In [0]:
# Fetch the cached parsed sentences from Google Drive
!cp '/content/gdrive/My Drive/trainomatic_parsed_sentences.txt' 'trainomatic_parsed_sentences.txt'

# Rebuild the list of token lists: each line is split on whitespace
trainomatic_sents = []
with open('trainomatic_parsed_sentences.txt') as f:
  for line in f:
    trainomatic_sents.append(line.strip().split())

## Word2Vec Model

In [0]:
def filter_sense_embeddings(path):
  """
  Removes word embeddings from a word2vec
    formatted embeddings file

  The file at `path` is rewritten in place so that it only holds the
    entries whose key contains an underscore (the `lemma_synset` keys);
    the header line is regenerated from the number of kept entries and
    the number of values on the first kept line.

  Raises ValueError, leaving the file untouched, when no entry with an
    underscore in its key is found.
  """
  senses = []
  with open(path, 'r', encoding='utf-8') as f:
      for line in f:
          key = line.split(' ', 1)[0]
          # plain words have no underscore in their key
          if '_' in key:
              # drop the line terminator; it is added back when writing
              senses.append(line.rstrip('\n'))

  # fail before opening the file for writing, which would truncate it
  if not senses:
      raise ValueError('no sense embeddings found in {}'.format(path))

  # every kept line is "<key> <v1> ... <vn>"
  dimensions = len(senses[0].split()) - 1

  with open(path, 'w', encoding='utf-8') as f:
      f.write("{} {}\n".format(len(senses), dimensions))
      for sense in senses:
          f.write(sense + '\n')

In [0]:
from gensim.models import Word2Vec

# Train on the concatenation of the three parsed corpora
train_sents = eurosense_sents + sew_sents + trainomatic_sents

# NOTE(review): `size=` and `iter=` look like the gensim 3.x keyword names —
# confirm against the installed gensim version.
# hs=1 with negative=0 presumably selects hierarchical softmax with negative
# sampling disabled — see the gensim Word2Vec documentation.
model = Word2Vec(size=400, window=10, sample=10e-5, 
         workers=multiprocessing.cpu_count(), hs=1, negative=0, 
         iter=15, compute_loss=True)
model.build_vocab(train_sents)

model.train(train_sents, total_examples=model.corpus_count,
        epochs=model.iter, compute_loss=model.compute_loss)
# Export in text word2vec format, then rewrite the file with
# filter_sense_embeddings
model.wv.save_word2vec_format('embeddings.vec', binary=False)
filter_sense_embeddings('embeddings.vec')

# From here on `model` names the keyed vectors (`model.wv`), not the
# Word2Vec trainer
model = model.wv

# Keep a copy of the embeddings on Google Drive
!cp 'embeddings.vec' '/content/gdrive/My Drive/embeddings.vec'

In [0]:
from gensim.models import KeyedVectors

# Fetch stored embeddings from Google Drive. The local copy is named
# 'embeddings.vec', so it overwrites any local file of that name.
!cp '/content/gdrive/My Drive/embedding_0.22_eurosense_0.2sew.vec' 'embeddings.vec'

# `model` is rebound to the keyed vectors loaded from the text-format file
model = KeyedVectors.load_word2vec_format('embeddings.vec', binary=False)

## Word Similarity

### Download dataset

In [0]:
# Download the WordSim-353 archive
!wget 'http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/wordsim353.zip'

# Extract it quietly into the 'wordsim353' directory
!unzip -q 'wordsim353.zip' -d 'wordsim353'

### Computation

In [0]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

In [0]:
def similarity_measure(s1, S1, s2, S2):
  """
  Similarity used to score a pair of sense keys. Currently the plain
    cosine similarity of `s1` and `s2`; `S1` and `S2` are accepted
    but not used by this variant.
  """
  # alternative measure: weighted_cosine_similarity(s1, S1, s2, S2)
  score = cosine_similarity(s1, s2)
  return score
  
def cosine_similarity(s1, s2):
  """
  Cosine similarity between the vectors the global `model`
    returns for the keys `s1` and `s2`
  """
  vectors = (model.get_vector(s1), model.get_vector(s2))
  # scipy's `cosine` is a distance, hence the complement to 1
  distance = cosine(*vectors)
  return 1 - distance

def d(s, S):
  """
  Ratio between the vocabulary count of key `s` and the summed
    vocabulary counts of the keys in `S` (global `model`)
  """
  own_count = model.vocab[s].count
  total_count = sum(model.vocab[key].count for key in S)
  return own_count / total_count

def weighted_cosine_similarity(s1, S1, s2, S2):
  """
  Cosine similarity of `s1` and `s2` raised to the 8th power and
    scaled by d(s1, S1) and d(s2, S2)
  """
  weight_1 = d(s1, S1)
  weight_2 = d(s2, S2)
  sharpened = cosine_similarity(s1, s2) ** 8
  return weight_1 * weight_2 * sharpened

In [0]:
def get_associated_sense_embeddings(w):
  """
  Lists the vocabulary keys of the global `model` that contain an
    underscore and whose part before the last underscore, with the
    remaining underscores turned into spaces, equals `w`
  """
  return [
      key for key in model.vocab
      if '_' in key and key.rsplit('_', 1)[0].replace('_', ' ') == w
  ]

In [0]:
path = 'wordsim353/combined.tab'

gold_scores, my_scores = [], []

with open(path) as f:
  # the first line of the file is a header
  next(f)

  for line in f:
    w1, w2, sim = line.lower().strip().split('\t')

    S1 = get_associated_sense_embeddings(w1)
    S2 = get_associated_sense_embeddings(w2)

    # best similarity over all sense pairs; the -1.0 floor is the score
    # of a pair for which either word has no associated key
    pair_scores = [similarity_measure(s1, S1, s2, S2)
                   for s1 in S1 for s2 in S2]
    score = max([-1.0] + pair_scores)

    my_scores.append(score)
    gold_scores.append(float(sim))


# rank correlation between the gold scores and the computed ones
r = spearmanr(gold_scores, my_scores)

## Visualize Embeddings

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### PCA

In [0]:
# Keys associated with the two words to compare
w1, w2 = 'love', 'hate'
n1, n2 = get_associated_sense_embeddings(w1), get_associated_sense_embeddings(w2)
# build a new list: extending an alias of n1 would also grow n1 itself
all_words = n1 + n2


def visualize_pca(words):
    """
    Scatter-plots the vectors the global `model` holds for `words`,
      reduced to two components with PCA, labelling each point
      with its key
    """
    projected = PCA(n_components=2).fit_transform(model[words])
    xs, ys = projected[:, 0], projected[:, 1]
    plt.scatter(xs, ys)

    for word, x, y in zip(words, xs, ys):
        plt.annotate(word, xy=(x, y))

    plt.grid(True)
    plt.show()

# Plot the keys collected in `all_words`
visualize_pca(all_words)

### t-SNE

In [0]:
# Source: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d
def get_clusters(all_words):
  """
  For each key in `all_words` asks the global `model` for its 15 most
    similar keys and reduces the vectors of all those keys to two
    dimensions with t-SNE.

  Returns the 2D points, reshaped to (number of keys, neighbours per
    key, 2), together with the list of neighbour keys of each key.
  """
  word_clusters = []
  embedding_clusters = []
  for word in all_words:
    neighbours = [key for key, _ in model.most_similar(word, topn=15)]
    word_clusters.append(neighbours)
    embedding_clusters.append([model[key] for key in neighbours])

  embedding_clusters = np.array(embedding_clusters)
  n, m, k = embedding_clusters.shape
  # t-SNE is fitted once on all neighbour vectors stacked together
  flat_embeddings = embedding_clusters.reshape(n * m, k)

  tsne_model_en_2d = TSNE(
      perplexity=15, n_components=2, init='pca',
      n_iter=3500, random_state=0)
  embeddings_en_2d = np.array(
      tsne_model_en_2d.fit_transform(flat_embeddings)).reshape(n, m, 2)
  return embeddings_en_2d, word_clusters

def tsne_plot_similar_words(labels, embedding_clusters, word_clusters, filename=None):
  """
  Draws one coloured scatter per label from the 2D points in
    `embedding_clusters`, annotating each point with the matching
    entry of `word_clusters`. The figure is saved as PNG to `filename`
    when one is given, then shown.
  """
  plt.figure(figsize=(16, 9))
  palette = cm.rainbow(np.linspace(0, 1, len(labels)))
  for label, points, words, color in zip(labels, embedding_clusters, word_clusters, palette):
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, c=color, alpha=1.0, label=label)
    for word, x, y in zip(words, xs, ys):
      plt.annotate(word, alpha=0.5, xy=(x, y), xytext=(5, 2),
                   textcoords='offset points', ha='right', va='bottom', size=8)
  plt.legend(loc=4)
  plt.grid(True)
  if filename:
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
  plt.show()


# Keys whose neighbourhoods are plotted; note that `all_words` is rebound
# here and no longer holds the list built for the PCA plot
all_words = ['seek_01315613v', 'make_up_02520730v',
  'queen_10499355n', 'function_01095218v', 'liner_03673027n']
embeddings_en_2d, word_clusters = get_clusters(all_words)
# Show the plot and save it to 'similar_words.png'
tsne_plot_similar_words(all_words, embeddings_en_2d, word_clusters,
                          'similar_words.png')

### kNN

In [0]:
def jaccard_similarity(v1, v2):
    """
    Generalised (Tanimoto) Jaccard similarity of two real vectors:
      v1.v2 / (|v1|^2 + |v2|^2 - v1.v2), rounded to 3 decimals.

    Equals 1 for identical non-zero vectors and 0 for orthogonal ones;
      the result is NaN when both vectors are all zeros (0 / 0).
    """
    intersection = np.dot(v1, v2)
    # the denominator uses the squared norms, i.e. v.v
    union = np.dot(v1, v1) + np.dot(v2, v2) - intersection
    return np.round(intersection / union, 3)

In [0]:
w1 = 'bank_09213565n' # river bank 09213565n
w2 = 'bank_08420278n' # financial inst "08420278n"

# key under inspection; switch to w1 to inspect the other sense
cw = w2
v1 = model.get_vector(cw)
# print each of the 10 most similar keys (with the score the model returns)
# next to its Jaccard similarity with `cw`
for sw in model.similar_by_word(cw, topn=10):
  v2 = model.get_vector(sw[0])
  print(cw, sw, jaccard_similarity(v1, v2))