### Importing required libraries

In [1]:
# importing required libraries

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
def remove_stpwords(sent):
    stp_wrds = set(stopwords.words('english'))    # retrieving unique stop words in english.
    wrd_tkns = nltk.word_tokenize(sent)       # tokenize sentence passed to this fn.
    
    filtered_sentence = []                  # initializing an empty list for storing key words (excluding stop words).
    
    for wrd in wrd_tkns:
        if wrd not in stp_wrds:             # if the word is a stop word it will not be added to the filtered list.
            filtered_sentence.append(wrd)
            
    return filtered_sentence                # returning the list without stop-words of english language.

In [31]:
# LESK algo returns the best definition of the sense in which the words are supposed to be used.
# it returns a list which contains Tuples with word and its definition.
# E.g.
# [
#     (word1, best definition of word1), - Tuple 1
#     (word2, best definition of word2), - Tuple 2
#     ...
# ]

# List of parameters passed to the LESK algo fn.
# words: the list of words for which sense definition needs to be found.
# sents: the sentence in which the association b/w word and sense will be found.

def lesk_algo(words, sent):
    wrd_def = []                           # var in which word and its definition will be stored.
    split_sent = remove_stpwords(sent.lower())     # splitting the sentence and removing the stop words.
        
    for word in words:                     # finding synsets for each word passed in the 'words' variable.
    # {
        max_overlap = -1                   # initialized as -1, cause overlap may remain 0 after finding the 
                                           # intersection b/w sense definition and the word, for all the senses.
                                           # In such a case no sense definition will be returned. Hence, initializing 
                                           # it as -1 so that the 1st sense is returned. 
                                           # (value of overlap = 0 and max_overlap = -1).
        best_sense = ''                    # initializing variable.
        
        for sense in wordnet.synsets(word.lower()):
        # {
            overlap = len(                    # finding the length of intersected words and assigning it to a var.
                set(                          
                    nltk.word_tokenize(sense.definition()) # finding the length of unique words in the sense 
                                                           # definition text.
                ).intersection(                            # finding the words which intersets with the words
                    split_sent                             # retrieved from the sentences passed to this fn.
                )
            )
            
            print('%s  %s' % (word, sense.definition()))

            if overlap > max_overlap:            # if another sense has more common words, then new sense is 
                max_overlap = overlap            # assigned to the best_sense variable to hold the sense till a 
                best_sense = sense.definition()  # better one is identified.
        # } End of sense for-loop.
        
        wrd_def.append(best_sense)
    # } End of word for-loop
    
    return list(zip(words, wrd_def))


In [18]:
# LESK algo returns the best definition of the sense in which the words are supposed to be used.
# it returns a list which contains Tuples with word and its definition.
# E.g.
# [
#     (word1, best definition of word1), - Tuple 1
#     (word2, best definition of word2), - Tuple 2
#     ...
# ]

# List of parameters passed to the LESK algo fn.
# sents: the sentence in which the association b/w word and sense will be found.

def lesk_algo_sent(sent):
    wrd_def = []                           # var in which word and its definition will be stored.
    wrds4def = split_sent = remove_stpwords(sent.lower())     # splitting the sentence and removing the stop words.
    
    wrd_lst = []
    prev_next_wrd = []
    
    for key, wrd in enumerate(split_sent):
        if key == 0:
            prev_next_wrd = [split_sent[key+1]]
            
        elif key > 0 and key < (len(split_sent)-1):
            prev_next_wrd = [split_sent[key-1], split_sent[key+1]]
            
        elif key == (len(split_sent)-1):
            prev_next_wrd = [split_sent[key-1]]
            
        wrd_lst.append(tuple([prev_next_wrd, wrd]))
        
    print(wrd_lst)
    
    for word in split_sent:                     # finding synsets for each word passed in the 'words' variable.
    # {
        max_overlap = -1                   # initialized as -1, cause overlap may remain 0 after finding the 
                                           # intersection b/w sense definition and the word, for all the senses.
                                           # In such a case no sense definition will be returned. Hence, initializing 
                                           # it as -1 so that the 1st sense is returned. 
                                           # (value of overlap = 0 and max_overlap = -1).
        best_sense = ''                    # initializing variable.
        
        for sense in wordnet.synsets(word.lower()):
        # {
            overlap = len(                    # finding the length of intersected words and assigning it to a var.
                set(                          
                    nltk.word_tokenize(sense.definition()) # finding the length of unique words in the sense 
                                                           # definition text.
                ).intersection(                            # finding the words which intersets with the words
                    wrds4def                               # retrieved from the sentences passed to this fn.
                )
            )

            if overlap > max_overlap:            # if another sense has more common words, then new sense is 
                max_overlap = overlap            # assigned to the best_sense variable to hold the sense till a 
                best_sense = sense.definition()  # better one is identified.
                
        # } End of sense for-loop.        
        
        wrds4def = wrds4def + remove_stpwords(best_sense)
        wrd_def.append(best_sense)
    # } End of word for-loop
    
    #print(wrds4def)
    
    return list(zip(split_sent, wrd_def))


In [27]:
# LESK algo returns the best definition of the sense in which the words are supposed to be used.
# it returns a list which contains Tuples with word and its definition.
# E.g.
# [
#     (word1, best definition of word1), - Tuple 1
#     (word2, best definition of word2), - Tuple 2
#     ...
# ]

# List of parameters passed to the LESK algo fn.
# sents: the sentence in which the association b/w word and sense will be found.

def lesk_algo_sent(sent):
    wrd_def = []                           # var in which word and its definition will be stored.
    wrds4def = split_sent = remove_stpwords(sent.lower())     # splitting the sentence and removing the stop words.
    
    wrd_lst = []
    prev_next_wrd = []
    
    for key, wrd in enumerate(split_sent):
        if key == 0:
            prev_next_wrd = [split_sent[key+1]]
            
        elif key > 0 and key < (len(split_sent)-1):
            prev_next_wrd = [split_sent[key-1], split_sent[key+1]]
            
        elif key == (len(split_sent)-1):
            prev_next_wrd = [split_sent[key-1]]
            
        wrd_lst.append(tuple([prev_next_wrd, wrd]))
    
    for word in wrd_lst:                     # finding synsets for each word passed in the 'words' variable.
    # {
        max_overlap = -1                   # initialized as -1, cause overlap may remain 0 after finding the 
                                           # intersection b/w sense definition and the word, for all the senses.
                                           # In such a case no sense definition will be returned. Hence, initializing 
                                           # it as -1 so that the 1st sense is returned. 
                                           # (value of overlap = 0 and max_overlap = -1).
        best_sense = ''                    # initializing variable.
        
        print('%s  -> %s' % (word[0], word[1]))
        
        #for sense in wordnet.synsets(word[1].lower()):
        ## {
        #    overlap = len(                    # finding the length of intersected words and assigning it to a var.
        #        set(                          
        #            nltk.word_tokenize(sense.definition()) # finding the length of unique words in the sense 
        #                                                   # definition text.
        #        ).intersection(                            # finding the words which intersets with the words
        #            wrds4def                               # retrieved from the sentences passed to this fn.
        #        )
        #    )
#
        #    if overlap > max_overlap:            # if another sense has more common words, then new sense is 
        #        max_overlap = overlap            # assigned to the best_sense variable to hold the sense till a 
        #        best_sense = sense.definition()  # better one is identified.
        #        
        ## } End of sense for-loop.        
        #
        #wrds4def = wrds4def + remove_stpwords(best_sense)
        #wrd_def.append(best_sense)
    # } End of word for-loop
    
    #print(wrds4def)
    
    return list(zip(split_sent, wrd_def))


### Verifying

In [5]:
sentence = "The frog is jumping around the bank of the river"
words = ['bank', 'jumping', 'leaping', 'frog']

In [32]:
lesk_algo(words, sentence)

bank  sloping land (especially the slope beside a body of water)
bank  a financial institution that accepts deposits and channels the money into lending activities
bank  a long ridge or pile
bank  an arrangement of similar objects in a row or in tiers
bank  a supply or stock held in reserve for future use (especially in emergencies)
bank  the funds held by a gambling house or the dealer in some gambling games
bank  a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
bank  a container (usually with a slot in the top) for keeping money at home
bank  a building in which the business of banking transacted
bank  a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
bank  tip laterally
bank  enclose with a bank
bank  do business with a bank or keep an account at a bank
bank  act as the banker in a game or in gambling
bank  be in the banking business
bank  put into a bank acc

[('bank', 'enclose with a bank'),
 ('jumping', 'the act of jumping; propelling yourself off the ground'),
 ('leaping', 'a light, self-propelled movement upwards or forwards'),
 ('frog',
  'any of various tailless stout-bodied amphibians with long hind limbs for leaping; semiaquatic and terrestrial species')]

In [28]:
# ['frog', 'jumping', 'around', 'bank', 'river']
lesk_algo_sent(sentence)

['jumping']  -> frog
['frog', 'around']  -> jumping
['jumping', 'bank']  -> around
['around', 'river']  -> bank
['bank']  -> river


[]