In [1]:
import nltk
from nltk.wsd import lesk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np
from numpy import dot
from numpy import average
from numpy.linalg import norm
import os
from tqdm.auto import tqdm

tqdm.pandas()
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
en_stopwords = stopwords.words('english')

In [2]:
def apply(tokens):
    tokens = [t.lower() for t in tokens]
    
    new_tokens = []
    for t in tokens:
        if t not in en_stopwords:
            new_tokens.append(t)
    tokens = new_tokens
    
    new_sentence = []
    for w in tokens:
        res = lesk(context_sentence=tokens, ambiguous_word=w)
        if res is None:
            new_sentence.append(w)
            continue
        else:
            tok = nltk.word_tokenize(res.definition())
            new_tokens = []
            for t in tok:
                if t not in en_stopwords:
                    new_tokens.append(t)
            tok = new_tokens
            new_sentence = new_sentence + tok
    return new_sentence

In [3]:
sentence = ['weve', 'talked', 'gifted', 'child', 'struggling', 'adult', 'pipeline', ',', 'also', 'old', 'soul', 'immature-seeming', 'adult', 'pipeline', '?', '(', 'spectrum', '?', ')']

In [4]:
apply(sentence)

['weve',
 'express',
 'speech',
 'endowed',
 'talent',
 'talents',
 'member',
 'clan',
 'tribe',
 'engaged',
 'struggle',
 'overcome',
 'especially',
 'poverty',
 'obscurity',
 'designed',
 'arouse',
 'lust',
 'pipe',
 'used',
 'transport',
 'liquids',
 'gases',
 ',',
 'addition',
 'preceding',
 'something',
 'else',
 'time',
 'order',
 'secular',
 'form',
 'gospel',
 'major',
 'Black',
 'musical',
 'genre',
 '1960s',
 '1970s',
 'immature-seeming',
 'designed',
 'arouse',
 'lust',
 'pipe',
 'used',
 'transport',
 'liquids',
 'gases',
 '?',
 '(',
 'broad',
 'range',
 'related',
 'objects',
 'values',
 'qualities',
 'ideas',
 'activities',
 '?',
 ')']

In [5]:
#GloVe source:
# https://nlp.stanford.edu/projects/glove/

In [6]:
GLOVE_DEF_PATH = os.path.join(os.getcwd(),'../datasets/glove/glove.6B/glove.6B.50d.txt')
def load_glove_vectors(glove_file = GLOVE_DEF_PATH):
    
    f = open(glove_file, 'r', encoding="utf-8")
    vectors = {}
    for line in f:
        split_line = line.split()
        word = split_line[0]
        embedding = np.array([float(val) for val in split_line[1:]])
        vectors[word] = embedding
    f.close()
    return vectors


cosine_sim_threshold = 0.05
score_margin_threshold = 0.1


def get_valid_pos_tag(tag):
    if tag.startswith('J') or tag.startswith('V') or tag.startswith('N') or tag.startswith('R'):
        return True
    return False


def get_word_sense_vectors(candidate):
    vectors = {}
    try:
        candidate_vec = glove[candidate]
    except Exception:
        return None
    for sense in wn.lemmas(candidate):
        gloss = [sense.synset().definition()]
        gloss.extend(sense.synset().examples())
        word_vectors = []
        for sentence in gloss:
            tokens = nltk.word_tokenize(sentence)
            pos_tags = nltk.pos_tag(tokens)
            for gloss_pos, tag in pos_tags:
                if get_valid_pos_tag(tag):
                    try:
                        gloss_word_vec = glove[gloss_pos]
                    except Exception:
                        # print(gloss_pos, "not found in glove")
                        continue
                    cos_sim = dot(gloss_word_vec, candidate_vec) / (norm(gloss_word_vec) * norm(candidate_vec))
                    if cos_sim > cosine_sim_threshold:
                        word_vectors.append(gloss_word_vec)
        if len(word_vectors) == 0:
            continue
        sense_vector = average(word_vectors, 0)
        vectors[sense] = sense_vector
    return vectors


def disambiguate_word_sense(word, context_vector):
    vectors = sense_vectors_collection[word]
    if len(vectors) == 0:
        return [None, 0.0]
    cos_sims = {}
    for sense, sense_vector in vectors.items():
        cos_sim = dot(context_vector, sense_vector) / (norm(context_vector) * norm(sense_vector))
        cos_sims[sense] = cos_sim
    sorted_list = sorted(cos_sims.items(), key=lambda x: x[1])
    if len(sorted_list) == 0:
        return [None, 0.0]
    most_similar_pair = sorted_list.pop()
    disambiguated_sense = most_similar_pair[0]
    cos_sim_second_most_similar_sense = 0
    if len(sorted_list) > 0:
        cos_sim_second_most_similar_sense = sorted_list.pop()[1]
    score_margin = most_similar_pair[1] - cos_sim_second_most_similar_sense
    # we return the disambiguated sense AND the cosine score margin between the two most similar senses.
    return [disambiguated_sense, score_margin]


sense_vectors_collection = {}

def run_algorithm(tokens_input):
    global sense_vectors_collection
    sorted_sense_vectors_collection = {}
    pos_tags_input = nltk.pos_tag(tokens_input)
    
    pos = []
    pos_vectors = {}
    for word, pos_tag in pos_tags_input:
        if get_valid_pos_tag(pos_tag):
            try:
                pos_vectors[word] = glove[word]
                pos.append(word)
            except Exception:
                pass
                
    # Sense vectors init
    for p in pos:
        sense_vectors = get_word_sense_vectors(p)
        if sense_vectors is None:
            continue
        sense_vectors_collection[p] = sense_vectors
        sorted_sense_vectors_collection[p] = len(sense_vectors)
    
    # S2C sorting for content word
    sorted_sense_vectors_collection = sorted(sorted_sense_vectors_collection.items(), key=lambda x: x[1])
    
    # Context vector initialization
    context_vec = average(list(pos_vectors.values()), 0)
    
    for w, _ in sorted_sense_vectors_collection:
        disambiguation_results = disambiguate_word_sense(w, context_vec)
        disambiguated_sense = disambiguation_results[0]
        if disambiguated_sense is None:
            continue
        score_margin = disambiguation_results[1]
        if score_margin > score_margin_threshold:
            pos_vectors[w] = sense_vectors_collection[w][disambiguated_sense]
            context_vec = average(list(pos_vectors.values()), 0)
    sense_vectors_collection.clear()
    return context_vec

In [7]:
glove = load_glove_vectors()

In [8]:
get_word_sense_vectors('key')

{Lemma('key.n.01.key'): array([ 0.23398828,  0.06947961, -0.10388128, -0.0099533 ,  0.43481728,
         0.29401644,  0.20012272, -0.43976511, -0.13210233,  0.05348277,
         0.24428633,  0.05993198, -0.21467228, -0.06295217, -0.04845044,
         0.20116533, -0.09243417,  0.01657027,  0.18980996, -0.27478444,
        -0.02330133, -0.31139944, -0.29876339, -0.07223167,  0.08578778,
        -1.26486   , -0.35051844, -0.13795044,  0.29510317,  0.24838428,
         2.94410167, -0.24746111, -0.07286117, -0.72782778,  0.00712588,
        -0.03365111, -0.01599744,  0.18017395,  0.01516489,  0.35594933,
        -0.23689944, -0.13363701,  0.18609783,  0.42248206, -0.14563406,
         0.41616411, -0.05148722,  0.25225161,  0.07375754, -0.07387137]),
 Lemma('key.n.07.Key'): array([-0.01803425,  0.3279965 , -0.5628375 , -0.43857317,  0.40369858,
         0.27693233, -0.45595677,  0.14854517, -0.40123083,  0.36382783,
        -0.25543275,  0.24510917, -0.254656  , -0.18743725,  0.547935  ,
   

In [9]:
def str_to_list(value):
    list_values = value.strip('[]').split(', ')
    cleaned_list_values = [item[1:-1] for item in list_values]
    return cleaned_list_values

In [10]:
df1 = pd.read_csv("../datasets/tonetags_dataset_tumblr_clean.csv", converters={"text": str_to_list})

In [11]:
df2 = df1.copy()
df2["text"] = df2["text"].apply(lambda x:apply(x))

KeyboardInterrupt: 

In [11]:
df1.head()

Unnamed: 0,tags,text
0,genuine question,"[weve, talked, gifted, child, struggling, adul..."
1,genuine question,"[Is, concept, straight, girls, religiously, wa..."
2,genuine question,"[I, sincerely, curious, ., Has, ever, written,..."
3,genuine question,"[Bro, idk, people, look, fucking, good, fake, ..."
4,genuine question,"[Advice, beginner, witch, supplies, deal, peri..."


In [None]:
df2.to_csv("../datasets/tonetags_wsd_1.csv")

In [12]:
df3 = df1.copy()

In [13]:
df3["context"] = df3["text"].progress_apply(lambda x: run_algorithm(x))

  0%|          | 0/85304 [00:00<?, ?it/s]

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [14]:
df3.to_csv("../datasets/tonetags_wsd_2.csv")