In [19]:
from typing import List, Optional

from nltk import download, word_tokenize
from nltk.data import find
from nltk.corpus import wordnet
from nltk.corpus import stopwords as stopwords_reader
from nltk.corpus.reader.wordnet import Synset

from scipy.spatial.distance import cosine

import numpy as np
import gensim

# Fetch the NLTK resources used below: the WordNet corpus, the stopword
# lists, the punkt tokenizer data and the pruned word2vec sample.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'word2vec_sample'):
  download(nltk_resource)

# English stopwords, kept as a set so membership checks are cheap.
stopwords = set(stopwords_reader.words("english"))

# Locate the word2vec sample inside the NLTK data directory and load it as
# text-format (non-binary) keyed vectors.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_sample,
    binary=False,
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [20]:
# Running example for the disambiguation cells below: the sentence supplies
# the context and `example_word` is the ambiguous word to resolve.
example_sentence = "Yesterday I went to the bank to withdraw the money and the credit card did not work"
example_word = "bank"

In [21]:
def simplified_lesk(word: str, sentence: str) -> Optional["Synset"]:
  """Pick the WordNet sense of ``word`` whose gloss best overlaps ``sentence``.

  The context is the set of lowercased, non-stopword, non-punctuation tokens
  of ``sentence``. Each sense's signature is built the same way from its
  definition and all of its usage examples. The score of a sense is the
  number of tokens shared by its signature and the context.

  Args:
    word: Word to disambiguate; looked up with ``wordnet.synsets``.
    sentence: Sentence providing the context for ``word``.

  Returns:
    The sense with the largest overlap (the earliest one on ties), or
    ``None`` when WordNet has no synsets for ``word`` or no sense shares
    any token with the context.
  """
  def tokenize_no_stopwords(sentence: str) -> List[str]:
    """Tokenize, lowercase, and drop stopwords and punctuation-only tokens."""
    tokens = set(word_tokenize(sentence))
    return [
        token.lower()
        for token in tokens
        # `stopwords` is a set: test membership directly instead of
        # rebuilding a list for every token.
        if token.lower() not in stopwords
        # Tokens without any letter or digit (",", ".", quotes, ...) would
        # otherwise be counted as overlapping "words".
        and any(char.isalnum() for char in token)
    ]

  context = set(tokenize_no_stopwords(sentence))
  best_sense = None
  max_overlap = 0

  for sense in wordnet.synsets(word):
    # Signature = content words of the definition plus every usage example.
    signature = set(tokenize_no_stopwords(sense.definition()))
    for example in sense.examples():
      signature.update(tokenize_no_stopwords(example))

    overlap = len(signature & context)

    # Strict comparison: a sense must share at least one token to win, and
    # the first sense reaching the maximum is kept.
    if overlap > max_overlap:
      max_overlap = overlap
      best_sense = sense
  return best_sense

In [22]:
# Disambiguate the example word within the example sentence using gloss
# overlap; the returned synset is shown as the cell output.
simplified_lesk(example_word, example_sentence)

Synset('depository_financial_institution.n.01')

In [23]:
def word_embeddings_smiliarity(word: str, sentence: str) -> Optional["Synset"]:
  """Pick the WordNet sense of ``word`` closest to ``sentence`` in embedding space.

  The sentence and each sense's signature (definition plus usage examples)
  are tokenized, lowercased and stripped of stopwords. Each resulting word
  set is collapsed into one vector by averaging the vectors that ``model``
  holds for its words. Senses are ranked by the cosine similarity between
  the context vector and their signature vector.

  Args:
    word: Word to disambiguate; looked up with ``wordnet.synsets``.
    sentence: Sentence providing the context for ``word``.

  Returns:
    The sense with the highest similarity (the earliest one on ties), or
    ``None`` when WordNet has no synsets for ``word`` or no similarity
    compares greater than negative infinity (e.g. all are NaN).
  """
  def tokenize_no_stopwords(sentence: str) -> List[str]:
    """Tokenize, lowercase, and drop stopwords."""
    tokens = set(word_tokenize(sentence))
    # `stopwords` is a set: test membership directly instead of rebuilding
    # a list for every token.
    return [token.lower() for token in tokens if token.lower() not in stopwords]

  # Stand-in embedding for words the model does not know. It does not depend
  # on the words being embedded, so compute it once per call rather than once
  # per sense.
  vector_mean = model.vectors.mean(axis=0)

  def words_to_vector(words: set) -> np.ndarray:
    """Average the embeddings of ``words`` into a single vector."""
    if not words:
      # Averaging an empty list would yield NaN; treat "no words" like
      # "only unknown words".
      return vector_mean
    # `in` is supported by KeyedVectors in both gensim 3.x and 4.x, whereas
    # the `.vocab` attribute was removed in 4.x.
    vectors = [model[w] if w in model else vector_mean for w in words]
    return np.mean(np.asarray(vectors), axis=0)

  context_vector = words_to_vector(set(tokenize_no_stopwords(sentence)))
  # Start below every possible cosine similarity so that any comparable
  # score can be selected.
  max_overlap = float("-inf")
  best_sense = None

  for sense in wordnet.synsets(word):
    # Signature = content words of the definition plus every usage example.
    signature = set(tokenize_no_stopwords(sense.definition()))
    for example in sense.examples():
      signature.update(tokenize_no_stopwords(example))

    signature_vector = words_to_vector(signature)
    # scipy's `cosine` is a distance; convert it back to a similarity.
    overlap = 1 - cosine(context_vector, signature_vector)

    if overlap > max_overlap:
      max_overlap = overlap
      best_sense = sense
  return best_sense

In [24]:
# Disambiguate the same example using averaged word embeddings; the returned
# synset is shown as the cell output.
word_embeddings_smiliarity(example_word, example_sentence)

Synset('depository_financial_institution.n.01')