In [10]:
import gensim
import re
import operator
import numpy as np
import nltk
import pandas as pd
from gensim.models import Word2Vec
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import brown, movie_reviews, treebank

# Training Word2vec model

In [2]:
# Train your own word2vec model on a specific dataset
def generateData():
    """Assemble the training corpus as a list of tokenized sentences.

    The result contains, in order, the sentences of the NLTK ``treebank``,
    ``brown`` and ``movie_reviews`` corpora followed by a handful of
    hand-written example sentences tokenized with ``word_tokenize``.
    """
    example_texts = [
        'Because a truck flipped over, we have been stuck in stationary traffic for over three hours now.',
        'There was a stationery store at street level, with a doorway on one side with a sign above it that said `Elmar Dance Company, 2nd Floor.',
        'When the train engineer realized there was a stationary car on the tracks, he tried to stop the locomotives engine.',
        'A stationary train was standing in the platform',
        'I went to a stationery shop to buy a bundle of pencils',
    ]
    # Tokenize the examples first, before any corpus is read.
    example_tokens = []
    for text in example_texts:
        example_tokens.append(word_tokenize(text))
    # Corpora come first, the hand-written examples are appended last.
    return [
        *treebank.sents(),
        *brown.sents(),
        *movie_reviews.sents(),
        *example_tokens,
    ]

def word2vec_model(final_dataset=None, custom=False,
                   model_path='./detecting_wsd/GoogleNews-vectors-negative300.bin'):
    """Train a custom Word2Vec model or load pretrained word2vec vectors.

    Parameters
    ----------
    final_dataset : list of list of str, optional
        Tokenized sentences to train on. Only used when ``custom`` is true.
    custom : bool, default False
        When true and ``final_dataset`` is non-empty, a new model is trained
        with ``window=2, workers=4, min_count=2``. In every other case
        (including ``custom=True`` with an empty dataset) the pretrained
        vectors are loaded instead.
    model_path : str, optional
        Path of the binary word2vec-format file to load when no custom model
        is trained. Defaults to the location this notebook originally used.

    Returns
    -------
    The trained ``Word2Vec`` model, or the ``KeyedVectors`` loaded from
    ``model_path``.
    """
    print('Inside the model loading module')
    if custom and final_dataset:
        model = Word2Vec(final_dataset, window=2, workers=4, min_count=2)
    else:
        # Pretrained vectors must already be downloaded to ``model_path``.
        model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    print('Model loading complete')
    return model


# Get the list of all possible homonyms

In [3]:
# get the list of all possible homonyms
def homonyms_file_list():
    """
    Load the homonym table and index it in both directions.

    Returns a pair:
    homonyms_dict - defaultdict(list) mapping each relation_id to the list of spellings grouped under it
    reverse_homonym_dict - dict mapping each spelling to its relation_id
    """
    # location of the homonym list on disk
    table = pd.read_csv('./homonyms_list.csv', delimiter=',')
    spelling_groups = table.groupby(['relation_id', 'spelling'])['spelling']
    homonyms_dict = defaultdict(list)
    reverse_homonym_dict = {}
    # Each group key is a (relation_id, spelling) pair; the grouped rows
    # themselves are not needed.
    for (relation_id, spelling), _rows in spelling_groups:
        homonyms_dict[relation_id].append(spelling)
        reverse_homonym_dict[spelling] = relation_id
    return homonyms_dict, reverse_homonym_dict

In [4]:

# input_sentence = 'I am standing near a stationary shop which is near to highway'
# input_sentence = 'The effect of Persistent Sleepiness'
# input_sentence = 'Downed Power Line effect PNM Customer'
# input_sentence = 'I went two a shop to by a mobile phone'

In [5]:
# nltk.pos_tag(word_tokenize(input_sentence))

In [14]:
# main logic to calculate the wrong homonyms used in the input sentence

def check_correct_homonym(model,input_sentence):
    """Flag homonyms in ``input_sentence`` that do not fit their context.

    Every token whose lowercased form appears in the homonym table is compared
    with the other spellings of its homonym group. Each candidate spelling is
    scored by its mean ``similarity`` to the nouns and adjectives of the
    sentence, and the best-scoring candidate is reported.

    Parameters
    ----------
    model :
        Object providing ``similarity(word_a, word_b)``, either directly or
        through a ``wv`` attribute (as a trained ``Word2Vec`` model does).
    input_sentence : str
        Sentence to analyse.

    Returns
    -------
    list of tuple
        ``(found_word, flag, replace_word)`` for each homonym found. ``flag``
        is True when the word as written is the best fit; otherwise
        ``replace_word`` is the suggested spelling.
    """
    homonyms_dict, reverse_homonym_dict = homonyms_file_list()
    tagged_input = nltk.pos_tag(word_tokenize(input_sentence))
    # tokens that are checked against the homonym table
    tokenized_filtered_input = [token for token, tag in tagged_input
                                if re.match(r'(^C|^J|^N|^R|^V|^I|^T|^W)', tag)]
    # nouns/adjectives used as the context each candidate is compared with
    filter_tagged_words = [token for token, tag in tagged_input if re.match(r'(^N|^J)', tag)]
    # A trained Word2Vec model keeps its vectors under ``.wv``; pretrained
    # KeyedVectors expose ``similarity`` themselves.
    vectors = getattr(model, 'wv', model)

    best_fit_word_list = list()
    for word in tokenized_filtered_input:
        found_word = word.lower()
        if found_word not in reverse_homonym_dict:
            continue
        # all spellings that belong to the same homonym group
        possible_homonyms = homonyms_dict[reverse_homonym_dict[found_word]]
        homonym_score_dict = dict()
        for candidate in possible_homonyms:
            # Scores are collected per candidate so that one spelling's
            # similarities never leak into the mean of another.
            score_list = list()
            for context_word in filter_tagged_words:
                # a homonym is not evidence for or against itself
                if context_word.lower() in possible_homonyms:
                    continue
                try:
                    score_list.append(vectors.similarity(candidate, context_word))
                except KeyError:
                    # word missing from the model vocabulary: no evidence
                    continue
            if score_list:
                homonym_score_dict[candidate] = float(np.mean(score_list))
        if homonym_score_dict:
            # on a tie, prefer the spelling the author actually wrote
            best_homonym = max(
                homonym_score_dict,
                key=lambda candidate: (homonym_score_dict[candidate], candidate == found_word),
            )
        else:
            # no usable context: do not suggest a replacement
            best_homonym = found_word
        if best_homonym == found_word:
            flag, replace_word = True, found_word
        else:
            flag, replace_word = False, best_homonym
        best_fit_word_list.append((found_word, flag, replace_word))
    # returns a list [(found_word, flag_to_replace_word, replace_word)]
    return best_fit_word_list


In [17]:
def find_word_replace_input(model, input_sentence=None):
    """Print, for each homonym found in a sentence, whether it is used correctly.

    Parameters
    ----------
    model :
        Word-vector model forwarded to ``check_correct_homonym``.
    input_sentence : str, optional
        Sentence to check. When omitted, the built-in example sentence
        (which misuses "stationery") is analysed.

    Returns
    -------
    None. The verdicts are written to standard output.
    """
    if input_sentence is None:
        input_sentence = ' Because a truck flipped over, we have been stuck in stationery traffic for over three hours now.'
    homonyms_word_list = check_correct_homonym(model, input_sentence)
    print('Input sentence : {}\n'.format(input_sentence))
    # each entry is (found_word, is_correct, replacement)
    for found_word, is_correct, replacement in homonyms_word_list:
        if is_correct:
            print('{} is correct in input sentence \n '.format(found_word))
        else:
            print('{found} is incorrect in input sentence and should be replaced with {replace} \n'.format(found=found_word,replace=replacement))
        
     

In [8]:
# Load the pretrained vectors: with default arguments word2vec_model takes its non-custom branch.
model = word2vec_model()

Inside the model loading module
Model loading complete


In [18]:
# Run the homonym check on the built-in example sentence and print a verdict for each homonym found.
find_word_replace_input(model)

Input sentence :  Because a truck flipped over, we have been stuck in stationery traffic for over three hours now.

have is correct in input sentence 
 
been is incorrect in input sentence and should be replaced with bin 

in is correct in input sentence 
 
stationery is incorrect in input sentence and should be replaced with stationary 

for is correct in input sentence 
 
hours is correct in input sentence 
 
