In [44]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import wordnet as wn
import nltk
import numpy as np
from scipy.spatial.distance import cosine
import operator

In [2]:
# Input path of the GloVe vectors and output path for the word2vec-format copy;
# `original_file` is the path that is loaded into `wv` in the next cell.
glove_file="../data/glove.6B.100d.100K.txt"
original_file="../data/glove.6B.100d.100K.w2v.txt"
# Run the gensim conversion script on the two paths. The two returned values are
# kept as `n` and `dimension`; `dimension` is used later as the length of the
# summed vectors in `lesk` (presumably `n` is the vocabulary size -- confirm
# against the gensim documentation).
n, dimension = glove2word2vec(glove_file, original_file)

  n, dimension = glove2word2vec(glove_file, original_file)


In [3]:
# Load the converted vectors (text format, hence binary=False). `wv` is the
# token -> vector lookup that `lesk` indexes below.
wv = KeyedVectors.load_word2vec_format(original_file, binary=False)

Q1: Implement the Lesk algorithm using word vectors \([Basile et al. 2014](https://www.aclweb.org/anthology/C/C14/C14-1151.pdf)\), where we measure the similarity between a gloss g = $\{ g_1, \ldots, g_G \}$ and context c = $\{ c_1, \ldots, c_C \}$ as the cosine similarity between the sum of distributed representations:

$$
\cos \left(\sum_{i=1}^G g_i, \sum_{i=1}^C c_i  \right)
$$

* The gloss for a synset can be found in `synset.definition()`; be sure to tokenize it appropriately.  
* You can find the cosine *distance* (not similarity) between two vectors using the `scipy.spatial.distance.cosine(vector_one, vector_two)` function.
* `wn.synsets(word, pos=part_of_speech)` gets you a list of the synsets for a word with a specific part of speech (e.g., "n" for noun)

In [48]:
# Scratch check: float("inf") is the sentinel `lesk` uses as its initial
# "smallest distance so far", so any real distance compares lower.
float("inf")

inf

In [91]:
def _sum_known_vectors(tokens):
    """Sum the word vectors of the tokens that are present in ``wv``.

    Tokens that are not in the embedding vocabulary are skipped instead of
    raising ``KeyError``. If no token is known, the result is an all-zero
    vector of length ``wv.vector_size``.
    """
    total = np.zeros(wv.vector_size)
    for token in tokens:
        if token in wv:
            total = total + wv[token]
    return total


def lesk(word, sentence, part_of_speech):
    """Pick the WordNet sense of ``word`` whose gloss best matches ``sentence``.

    Embedding-based Lesk: the context is the sum of the word vectors of the
    lower-cased, tokenized sentence, each candidate sense is the sum of the
    word vectors of its lower-cased, tokenized gloss (``synset.definition()``),
    and the sense with the smallest cosine distance to the context wins.

    Parameters
    ----------
    word : str
        Target word to disambiguate.
    sentence : str
        Sentence providing the context for ``word``.
    part_of_speech : str
        WordNet part-of-speech tag passed to ``wn.synsets`` (e.g. "n").

    Returns
    -------
    None
        The result is printed: the best synset and its definition, or a
        single message when no sense could be selected.

    Notes
    -----
    Tokens missing from the embedding vocabulary are ignored. A sense is
    skipped when its gloss has no usable vector, and no sense is selected when
    the sentence has no usable vector, because the cosine distance to an
    all-zero vector is undefined.
    """
    # Find all possible word senses
    possible_senses = wn.synsets(word, pos=part_of_speech)

    # Tokenize the target sentence and sum the vectors of its known tokens
    sent_tokens = nltk.tokenize.word_tokenize(sentence.lower())
    sent_vec = _sum_known_vectors(sent_tokens)

    # Setting up the comparisons
    min_dist = float("inf")
    best_sense = None

    # Only compare when the context has a non-zero vector; otherwise every
    # cosine distance would be NaN and no comparison could succeed.
    if np.any(sent_vec):
        for sense in possible_senses:
            # For each sense, tokenize the gloss and sum its known tokens
            gloss_tokens = nltk.tokenize.word_tokenize(sense.definition().lower())
            sense_vec = _sum_known_vectors(gloss_tokens)
            if not np.any(sense_vec):
                continue

            # Cosine *distance*: lower distance = more similar
            sense_to_sent_dist = cosine(sense_vec, sent_vec)
            if sense_to_sent_dist < min_dist:
                min_dist = sense_to_sent_dist
                best_sense = sense

    if best_sense is None:
        # No synsets for this word/POS, or nothing comparable in the vocabulary
        print("No synset could be selected for", repr(word),
              "with part of speech", repr(part_of_speech))
        return

    print("The best synset is", best_sense, "\n")
    print("WordNet definition:", best_sense.definition())

Execute the following two cells to check whether your implementation distinguishes between these two senses of "bank".

In [92]:
# Financial context: the words around "bank" are about money and accounts.
lesk("bank", "I deposited my money into my savings account at the bank", "n")

The best synset is Synset('depository_financial_institution.n.01') 

WordNet definition: a financial institution that accepts deposits and channels the money into lending activities


In [93]:
# Riverside context: same target word and part of speech, different sentence.
lesk("bank", "I ran along the river bank", "n")

The best synset is Synset('bank.n.07') 

WordNet definition: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force


Overall, the implementation does distinguish between the two senses of "bank": the two sentences are assigned different synsets. However, it struggles to find the *correct* sense for the "I ran along the river bank" example, where it selects `bank.n.07` (a banked turn in a road or track) instead of `bank.n.01` (sloping land beside a body of water). It looks like the inclusion of "I ran" confuses the algorithm: one plausible explanation is that "ran" pulls the summed context vector toward sport or athletics (an athlete might run on a road or track) and away from the land-by-water sense that "river" would otherwise signal. Removing that phrase does yield the correct sense (see the next cell), so one possible improvement would be to weight the context words instead of summing them uniformly.

In [95]:
# Ablation of the previous example: same sentence with the leading "I ran"
# removed, to check whether that phrase is what leads to the wrong sense.
lesk("bank", "along the river bank", "n")

The best synset is Synset('bank.n.01') 

WordNet definition: sloping land (especially the slope beside a body of water)
