In [61]:
import  nltk
# nltk.download('averaged_perceptron_tagger')

import nltk
from nltk.corpus import stopwords
 
# Make sure the NLTK stop-word corpus is available locally; NLTK reports
# "already up-to-date" when it has been downloaded before.
nltk.download('stopwords')
# English stop-word list; `apply` filters lower-cased tokens against it.
en_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/danil-
[nltk_data]     pass123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

def apply_to_text(text):
    """Run ``apply`` on every sentence of ``text`` and join the results.

    The text is split with ``nltk.tokenize.sent_tokenize``; the rewritten
    sentences are concatenated with a single space between them.
    """
    rewritten = []
    for sent in nltk.tokenize.sent_tokenize(text):
        rewritten.append(apply(sent))
    return ' '.join(rewritten)
    

def apply(sentence):
    """Replace each content word of ``sentence`` with its Lesk-selected gloss.

    The sentence is tokenized, lower-cased and stripped of the English stop
    words in ``en_stopwords``. Every remaining token is disambiguated with
    ``nltk.wsd.lesk``, using the filtered tokens as context. A token for
    which Lesk finds a synset is replaced by that synset's definition; any
    other token (punctuation, unknown words) is kept verbatim.

    Args:
        sentence (str): a single sentence.

    Returns:
        str: the pieces in their original order, each followed by one space,
        so a non-empty result ends with a trailing space.
    """
    tokens = [t.lower() for t in nltk.word_tokenize(sentence)]
    tokens = [t for t in tokens if t not in en_stopwords]

    pieces = []
    for w in tokens:
        # No POS filter is passed: a tag computed for an isolated token has no
        # context to rely on, so Lesk searches synsets of every part of speech.
        res = lesk(context_sentence=tokens, ambiguous_word=w, pos=None)
        pieces.append(w if res is None else res.definition())

    # Each piece carries its own trailing space, matching the established
    # output format of this function.
    return ''.join(piece + ' ' for piece in pieces)

In [63]:
sentence = 'Word is only a part of sentence.'

In [64]:
apply(sentence)

'a word is a string of bits stored in computer memory the part played by a person in bringing about a result pronounce a sentence on (somebody) in a court of law . '

In [65]:
apply_to_text('Word is only a part of sentence. Sentence is only a tiny part of text')

'a word is a string of bits stored in computer memory the part played by a person in bringing about a result pronounce a sentence on (somebody) in a court of law .  pronounce a sentence on (somebody) in a court of law very small the part played by a person in bringing about a result a book prepared for use in schools or colleges '

In [None]:
# GloVe pre-trained word vectors can be downloaded from:
# https://nlp.stanford.edu/projects/glove/

In [70]:
import os



['glove.6B.zip', 'glove.6B', 'archive.zip', 'Sentiment-Analysis-Dataset.zip']

In [90]:
import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
import numpy as np
from numpy import dot
from numpy import average
from numpy.linalg import norm
import os

# from gensim.models import Word2Vec
# from gensim.scripts.glove2word2vec import glove2word2vec

# glove2word2vec(glove_file, tmp_file)

# Default location of the embeddings file, resolved against the current working directory.
GLOVE_DEF_PATH = os.path.join(os.getcwd(),'../datasets/glove/glove.6B/glove.6B.50d.txt')
def load_glove_vectors(glove_file = GLOVE_DEF_PATH):
    """Read a whitespace-separated embeddings text file into a dict.

    Each non-blank line is expected to hold a word followed by its float
    components.

    Args:
        glove_file (str): path to the UTF-8 encoded embeddings file.

    Returns:
        dict: word -> ``numpy.ndarray`` of the floats on that word's line.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be parsed as a float.
    """
    vectors = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank (or whitespace-only) line, e.g. at the end of the file.
                continue
            word = split_line[0]
            vectors[word] = np.array([float(val) for val in split_line[1:]])
    return vectors


# Minimum cosine similarity between a gloss word and the candidate word for the
# gloss word to contribute to a sense vector (see get_word_sense_vectors).
cosine_sim_threshold = 0.05
# Minimum gap between the best and second-best sense similarity before
# run_algorithm replaces a word's vector with the winning sense vector.
score_margin_threshold = 0.1


def get_valid_pos_tag(tag):
    """Return True when ``tag`` starts with 'J', 'V', 'N' or 'R', else False."""
    accepted_prefixes = ('J', 'V', 'N', 'R')
    return tag.startswith(accepted_prefixes)


def get_word_sense_vectors(candidate):
    """Build one averaged embedding per WordNet lemma of ``candidate``.

    For every lemma returned by ``wn.lemmas(candidate)`` the definition and
    example sentences of its synset are tokenized and POS-tagged. Gloss words
    that pass ``get_valid_pos_tag``, have an entry in the module-level
    ``glove`` mapping, and whose cosine similarity to the candidate exceeds
    ``cosine_sim_threshold`` are averaged into that lemma's sense vector.

    Args:
        candidate (str): the word to build sense vectors for.

    Returns:
        dict | None: lemma -> averaged vector. Lemmas with no qualifying
        gloss word are omitted. ``None`` when ``candidate`` has no entry in
        ``glove``.
    """
    # Only a missing word is an expected condition. Anything else (for example
    # `glove` not being loaded yet) should surface instead of yielding None.
    try:
        candidate_vec = glove[candidate]
    except KeyError:
        return None
    candidate_norm = norm(candidate_vec)  # loop-invariant, computed once

    vectors = {}
    for sense in wn.lemmas(candidate):
        synset = sense.synset()
        gloss = [synset.definition()]
        gloss.extend(synset.examples())

        word_vectors = []
        for sentence in gloss:
            for gloss_word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
                if not get_valid_pos_tag(tag):
                    continue
                try:
                    gloss_word_vec = glove[gloss_word]
                except KeyError:
                    # Gloss word has no embedding; it cannot contribute.
                    continue
                cos_sim = dot(gloss_word_vec, candidate_vec) / (norm(gloss_word_vec) * candidate_norm)
                if cos_sim > cosine_sim_threshold:
                    word_vectors.append(gloss_word_vec)

        if word_vectors:
            vectors[sense] = average(word_vectors, 0)
    return vectors


def disambiguate_word_sense(word, context_vector):
    """Choose the sense of ``word`` closest to ``context_vector``.

    The word's sense vectors are read from the module-level
    ``sense_vectors_collection`` and ranked by cosine similarity to the
    context vector.

    Returns:
        list: ``[best_sense, margin]`` where ``margin`` is the best similarity
        minus the runner-up similarity (the runner-up counts as 0 when the
        word has a single sense). ``[None, 0.0]`` when the word has no sense
        vectors.
    """
    candidates = sense_vectors_collection[word]
    if not candidates:
        return [None, 0.0]

    context_norm = norm(context_vector)
    scored = [
        (sense, dot(context_vector, vec) / (context_norm * norm(vec)))
        for sense, vec in candidates.items()
    ]
    # Ascending stable sort: the winner is the last entry, the runner-up the
    # one just before it.
    scored.sort(key=lambda pair: pair[1])

    best_sense, best_sim = scored[-1]
    runner_up_sim = scored[-2][1] if len(scored) > 1 else 0
    return [best_sense, best_sim - runner_up_sim]


# Scratch space filled during a run_algorithm call: word -> {lemma: sense vector}.
# disambiguate_word_sense reads it; it is emptied again when the call finishes.
sense_vectors_collection = {}
# After a run_algorithm call this holds the list of (word, sense_count) pairs
# of the most recent sentence, fewest senses first, for inspection.
sorted_sense_vectors_collection = {}

def run_algorithm(sentence):
    """Compute a sense-aware context vector for ``sentence``.

    Steps:
      1. Tokenize and POS-tag the sentence; keep words accepted by
         ``get_valid_pos_tag`` that have an entry in ``glove``.
      2. Build sense vectors for each kept word (``get_word_sense_vectors``).
      3. Visit the words in ascending order of sense count. When the best
         sense beats the runner-up by more than ``score_margin_threshold``,
         replace the word's vector by that sense vector and recompute the
         context average.

    Args:
        sentence (str): the sentence to process.

    Returns:
        The element-wise average of the (possibly replaced) word vectors.
        NOTE(review): when no token qualifies, the average is taken over an
        empty list; callers should not rely on that result.

    Side effects:
        Rebinds the global ``sorted_sense_vectors_collection`` to the sorted
        (word, sense_count) list of this call and leaves
        ``sense_vectors_collection`` empty on exit.
    """
    global sorted_sense_vectors_collection, sense_vectors_collection
    tokens_input = nltk.word_tokenize(sentence)
    pos_tags_input = nltk.pos_tag(tokens_input)

    # word -> current vector, in first-occurrence order.
    pos_vectors = {}
    for word, pos_tag in pos_tags_input:
        if get_valid_pos_tag(pos_tag):
            try:
                pos_vectors[word] = glove[word]
            except KeyError:
                # Word has no embedding; skip it. Other errors must surface.
                continue

    # Start from a clean slate even if a previous call aborted midway.
    sense_vectors_collection.clear()
    try:
        # Sense vectors init. The counts live in a fresh local dict: rebinding
        # the global to the sorted list must not break the next call.
        sense_counts = {}
        for p in pos_vectors:
            sense_vectors = get_word_sense_vectors(p)
            if sense_vectors is None:
                continue
            sense_vectors_collection[p] = sense_vectors
            sense_counts[p] = len(sense_vectors)

        # Order content words by ascending sense count (least ambiguous first).
        sorted_sense_vectors_collection = sorted(sense_counts.items(), key=lambda x: x[1])

        # Context vector initialization
        context_vec = average(list(pos_vectors.values()), 0)

        for w, _ in sorted_sense_vectors_collection:
            disambiguated_sense, score_margin = disambiguate_word_sense(w, context_vec)
            if disambiguated_sense is None:
                continue
            if score_margin > score_margin_threshold:
                # Confident choice: let the sense vector stand in for the word
                # and refresh the context for the remaining words.
                pos_vectors[w] = sense_vectors_collection[w][disambiguated_sense]
                context_vec = average(list(pos_vectors.values()), 0)
    finally:
        sense_vectors_collection.clear()
    return context_vec

# glove = load_glove_vectors('/media/iftekhar/New Volume/Personal/Admission Docs/Germany/RWTH/MI/Lab - AI_Language_Technology/training_nball47634/glove.6B.50d.txt')
# glove = load_glove_vectors('/media/iftekhar/New Volume/Personal/Admission Docs/Germany/RWTH/MI/Lab - AI_Language_Technology/deps.words')
# glove = load_glove_vectors('/media/iftekhar/New Volume/Personal/Admission Docs/Germany/RWTH/MI/Lab - AI_Language_Technology/bow2.words')
# glove = load_glove_vectors('/media/iftekhar/New Volume/Personal/Admission Docs/Germany/RWTH/MI/Lab - AI_Language_Technology/bow5.words')
# glove = load_glove_vectors('E:/Code/deps.words/deps.words')
# sense_vectors_collection = {}

# annotation_results = dict()


# def find_wn_key(sentence, lookup_word):
#     sorted_sense_vectors_collection = {}
#     pos = []
#     pos_vectors = {}
#     tokens_input = nltk.word_tokenize(sentence)
#     pos_tags_input = nltk.pos_tag(tokens_input)
#     for word, pos_tag in pos_tags_input:
#         # print(word, "is tagged as", pos_tag)
#         if get_valid_pos_tag(pos_tag):
#             try:
#                 pos_vectors[word] = glove[word]
#                 pos.append(word)
#             except Exception:
#                 pass
#                 # print(pos, " not found in glove")
#     for p in pos:
#         sense_vectors = get_word_sense_vectors(p)
#         if sense_vectors is None:
#             continue
#         sense_vectors_collection[p] = sense_vectors
#         sorted_sense_vectors_collection[p] = len(sense_vectors)
#     # S2C sorting for content word
#     sorted_sense_vectors_collection = sorted(sorted_sense_vectors_collection.items(), key=lambda x: x[1])
#     # print("sorted by sense count", sorted_sense_vectors_collection)
#     # Context vector initialization
#     context_vec = average(list(pos_vectors.values()), 0)
#     wn_key = "not found"
#     for w, _ in sorted_sense_vectors_collection:
#         disambiguation_results = disambiguate_word_sense(w, context_vec)
#         disambiguated_sense = disambiguation_results[0]
#         if disambiguated_sense is None:
#             continue
#         if w == lookup_word:
#             wn_key = disambiguated_sense._key
#             break
#         score_margin = disambiguation_results[1]
#         if score_margin > score_margin_threshold:
#             pos_vectors[w] = sense_vectors_collection[w][disambiguated_sense]
#             context_vec = average(list(pos_vectors.values()), 0)
#     # print(pos_vectors.keys())
#     sense_vectors_collection.clear()
#     return wn_key




# output_file = open("output_bow2_windows.txt", "w")
# results_file = open("wsd_results_bow2_windows.txt", "w")
# output_file = open("output_bow5_windows.txt", "w")
# results_file = open("wsd_results_bow5_windows.txt", "w")
# output_file = open("output_glove.6B.50d_windows.txt", "w")
# results_file = open("wsd_results_glove.6B.50d_windows.txt", "w")
# output_file = open("output_deps_windows.txt", "w")
# results_file = open("wsd_results_deps_windows.txt", "w")

# load_annotations()

# correct_count = 0
# invalid_linkup_key_count = 0
# total_sentence_count = 0

# for dirpath, _, filenames in os.walk("E:/Code/hlt2005releasev2/hlt2005releasev2/domainhltGS.tar/Annotated_Sentences"):
#     if len(filenames) == 0:
#         continue
#     for file in filenames:
#         f = open(os.path.join(dirpath, file), 'r', encoding='ISO-8859-1')
#         #from itertools import islice

#         #for line in islice(f, 1):
#         for line in f:
#             split_line = line.split('?')
#             metadata_array = split_line[0].split(' ')
#             linkup_key = metadata_array[0]
#             lookup_word = metadata_array[2]
#             sentence = split_line[1].split(' ', 2)[2]
#             wn_key = find_wn_key(sentence, lookup_word)
#             results_file.write("|" + linkup_key + "|     " + wn_key + "\n")
#             if linkup_key in annotation_results:
#                 total_sentence_count += 1
#                 wn_keylist = annotation_results[linkup_key]
#                 if len(wn_keylist) > 0:
#                     most_frequent_wn_key = max(set(wn_keylist), key=wn_keylist.count)
#                     if most_frequent_wn_key == wn_key:
#                         print("correct wsd for", linkup_key, wn_key)
#                         output_file.write("correct wsd for " + linkup_key + " " + wn_key + "\n")
#                         correct_count += 1
#                         print("correct", correct_count, "| total", total_sentence_count)
#                         output_file.write("correct " + str(correct_count) + " | total " + str(total_sentence_count) + "\n")
#                     else:
#                         print("incorrect wsd for", linkup_key, "| found", wn_key, ", correct is", most_frequent_wn_key)
#                         output_file.write("incorrect wsd for " + linkup_key + " | found " + wn_key + ", correct is " + most_frequent_wn_key + "\n")
#             else:
#                 invalid_linkup_key_count += 1
#                 print("linkup key", linkup_key, "not found in gold standard clean dataset")
#                 output_file.write("linkup key " + linkup_key + " not found in gold standard clean dataset\n")


In [91]:
# Load the embeddings from GLOVE_DEF_PATH into the module-level `glove` mapping.
# get_word_sense_vectors and run_algorithm read `glove`, so this cell must run first.
glove = load_glove_vectors()

In [92]:
get_word_sense_vectors('key')

{Lemma('key.n.01.key'): array([ 0.23398828,  0.06947961, -0.10388128, -0.0099533 ,  0.43481728,
         0.29401644,  0.20012272, -0.43976511, -0.13210233,  0.05348277,
         0.24428633,  0.05993198, -0.21467228, -0.06295217, -0.04845044,
         0.20116533, -0.09243417,  0.01657027,  0.18980996, -0.27478444,
        -0.02330133, -0.31139944, -0.29876339, -0.07223167,  0.08578778,
        -1.26486   , -0.35051844, -0.13795044,  0.29510317,  0.24838428,
         2.94410167, -0.24746111, -0.07286117, -0.72782778,  0.00712588,
        -0.03365111, -0.01599744,  0.18017395,  0.01516489,  0.35594933,
        -0.23689944, -0.13363701,  0.18609783,  0.42248206, -0.14563406,
         0.41616411, -0.05148722,  0.25225161,  0.07375754, -0.07387137]),
 Lemma('key.n.07.Key'): array([-0.01803425,  0.3279965 , -0.5628375 , -0.43857317,  0.40369858,
         0.27693233, -0.45595677,  0.14854517, -0.40123083,  0.36382783,
        -0.25543275,  0.24510917, -0.254656  , -0.18743725,  0.547935  ,
   

In [93]:
run_algorithm("Here is sentence")

array([ 2.01247350e-01,  2.89317000e-01, -3.74097300e-01, -2.59285000e-02,
        5.85898000e-01,  3.85850650e-01,  9.84151500e-02, -1.18100750e-01,
        4.95730000e-04,  8.60915000e-02,  9.70003500e-02,  7.95810000e-02,
       -1.64794365e-01, -2.91452500e-01,  5.42528000e-01,  2.19557315e-01,
        1.61640350e-01,  2.59238000e-01, -1.19468990e-01,  9.10579500e-02,
       -2.25948400e-01,  3.12650000e-02,  8.43702000e-02,  3.12949295e-01,
        3.94029000e-02, -2.00003450e+00, -2.61442000e-01,  2.54413450e-01,
        1.46159500e-02, -1.27636000e-01,  3.04883350e+00, -6.52302500e-01,
       -3.13209700e-01, -4.75266475e-01,  1.84038150e-01, -2.36682401e-01,
        3.69803500e-01,  4.26101150e-02,  7.40252000e-02, -1.52490750e-01,
       -2.24771500e-01,  2.70650500e-01, -1.83526500e-02,  1.74195150e-01,
       -3.27736800e-01,  1.94538500e-02, -2.38214300e-01, -2.28488500e-01,
        1.62012500e-01,  2.26826500e-01])

In [None]:
sorted_sense_vectors_collection