# Part 3: Word2vec Improvement

## Import neccessary packages

In [1]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
import numpy as np
from numpy import average

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy import spatial
import os
import argparse

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


## Improve the word embedding from Part 2 with word sense

In [2]:
# Get the line from the input file as list
def get_txt_lines(path):
    with open(path, encoding="utf8") as f:
        lines = f.readlines()
    lines = [w.strip() for w in lines]
    return lines

In [3]:
# Post tag to words in line with properties, noun, verb, adj, adv
def valid_pos_tag(tag):
    if tag.startswith('J') or tag.startswith('V') or tag.startswith('N') or tag.startswith('R'):
        return True
    return False

In [4]:
# Calculate cosine simialrity
def cosine_similarity(w1, w2):
    cosine_sim = 1 - spatial.distance.cosine(w1, w2)
    return cosine_sim

In [102]:
# Function to firstly tokenize input sentence, remove stopwods, get the sense vector and update embedding based on context vector
def sentence_disambiguation(sentence, embed, vocab, cosine_sim_threshold, score_margin_threshold):
    def lookup_table(word):
        return embed[vocab.index(word)]

    stop_words = set(stopwords.words('english'))
    sentence = sentence.lower()
    word_tokens = word_tokenize(sentence)
    # store the original sentence
    original_tokens = word_tokens
    original_embed = [lookup_table(w) for w in word_tokens]
    if len(word_tokens) == 0:
        raise Exception('A sentence can\'t be empty')
    definitions_vocab = {}
    # remove stop words
    word_tokens = [w for w in word_tokens if w not in stop_words]
    # get the pos-tag for each word
    tags = nltk.pos_tag(word_tokens)
    # filter words keeping only nouns (N), verbs (V), adjective (J) and adverbs (R)
    word_tokens = [word for word, tag in tags if valid_pos_tag(tag)]
    # get the context vector of the current sentence as the average of all of its words
    try:
        embed_words = [lookup_table(w) for w in word_tokens]
    except Exception:
        raise Exception('The sentence contains unknown words')
    # compute the context vector
    context_vec = average(embed_words, 0)
    # from left to right disambiguate each word
    word_tokens_count = -1
    for i, word in enumerate(original_tokens):
        if word not in word_tokens:
            continue
        word_tokens_count += 1
        # dictionary: {sense: sense_vector}
        syn_vectors = {}
        # dictionary: {sense: cosine_similarity}
        cos_vectors = {}
        # dictionary: {sense: Lemma(sense}
        lemma_vectors = {}
        for sense in wn.lemmas(word):
            # gloss is a list containing the definition of each sense and some examples
            gloss = [sense.synset().definition()]
            gloss.extend(sense.synset().examples())
            # get all words contained in the gloss
            gloss_tokens = nltk.word_tokenize(" ".join(gloss))
            gloss_tags = nltk.pos_tag(gloss_tokens)
            # filter words gloss keeping only nouns (N), verbs (V), adjective (J) and adverbs (R)
            gloss_tokens = [word for word, tag in gloss_tags if valid_pos_tag(tag)]
            # correlation: cosine_similarity >= cosine_sim_threshold
            sense_word_vectors = []
            for t in gloss_tokens:
                try:
                    gloss_word_vec = lookup_table(t)
                except Exception:
                    continue
                # cosine similarity between the embedding of a gloss word and the target word
                cos_sim = cosine_similarity(gloss_word_vec, original_embed[i])
                if cos_sim >= cosine_sim_threshold:
                    sense_word_vectors.append(gloss_word_vec)
            if len(sense_word_vectors) == 0:
                continue
            # get the average of these word vectors and append it to the senses dictionary with its corresponding
            # cosine similarity with the context vector
            sense_vector = average(sense_word_vectors, 0)
            syn_vectors[str(sense)] = sense_vector
            cos_vectors[str(sense)] = cosine_similarity(sense_vector, context_vec)
            lemma_vectors[str(sense)] = sense

        if len(syn_vectors) == 0:
            continue
        sorted_list = sorted(cos_vectors.items(), key=lambda x: x[1])
        # find the sense vector in the dictionary that is closer to the context vector
        most_similar_pair = sorted_list.pop()
        disambiguated_sense = most_similar_pair[0]
        cos_sim_second_most_similar_sense = 0
        if len(sorted_list) > 0:
            cos_sim_second_most_similar_sense = sorted_list.pop()[1]
        score_margin = most_similar_pair[1] - cos_sim_second_most_similar_sense
        # if there are more senses, make sure the first choice is much better than the second best one
        if score_margin >= score_margin_threshold:
            # replace the sense vector in the word embedding
            if word_tokens_count >= len(embed_words):
                continue
            embed_words[word_tokens_count] = syn_vectors[disambiguated_sense]
            original_embed[i] = syn_vectors[disambiguated_sense]
            embed[vocab.index(word)] = original_embed[i]
            # recompute the new context vector
            context_vec = average(embed_words, 0)
            definitions_vocab[i] = lemma_vectors[disambiguated_sense].synset().definition()

    return original_embed, definitions_vocab

In [87]:
embed_path = 'E:/Word2vec/embeddings_wiki.npy'
vocab_path = 'E:/Word2vec/vocab_300.txt'
wiki_path = 'E:/sense_wiki.txt'
rating_path = 'E:/Word2vec/SCWS/ratings.txt'

In [88]:
vocab = get_txt_lines(vocab_path)
embed = np.load(embed_path)
rating = get_txt_lines(rating_path)
wiki = get_txt_lines(wiki_path)

In [67]:
def lookup_table(word):
    return embed[vocab.index(word)]

In [112]:
# Set the threshold for updating embedding 
cosine_sim_threshold=0
score_margin_threshold=0.1

In [114]:
# update the original word embedding with word sense disambiguition and break when the number of line reaches 30000
sentence = 0
for line in wiki:
    if sentence == 30000:
        break
    sentence += 1
    line = [splits for splits in line.split("\t") if splits is not ""]
    sentence1 = line.lower()
    sentence1_split = [splits for splits in sentence1.split(" ") if splits in vocab]
    sentence1_clean = " ".join(sentence1_split)
    e1, _ = sentence_disambiguation(sentence1_clean, embed, vocab, cosine_sim_threshold, score_margin_threshold)

## Evaluation on Spearman Correlation

In [91]:
import pandas as pd
from scipy.stats import spearmanr
from scipy import spatial
import numpy as np
import logging
from six import iteritems
import pandas as pd
from itertools import chain
import argparse
import os

In [92]:
def _read_as_pd(file):
    return pd.read_csv(file)

In [93]:
data = _read_as_pd('E:/Word2vec/combined.csv')

In [94]:
X = data.values[:, 0:2]

In [95]:
y = data.values[:, 2].astype(np.float)

In [105]:
spearman_errors = []
cosine_errors = []
name = 'WS353'
print("----------SIMILARITY----------")
spearman_err = 0
cosine_err = 0
analogies = 0
for i in range(len(X)):
    word1, word2 = X[i][0], X[i][1]
    if word1 not in vocab or word2 not in vocab:
        continue

    spearman_corr, _ = spearmanr(lookup_table(word1), lookup_table(word2))
    spearman_corr = abs(spearman_corr)
    spearman_err += abs(spearman_corr - y[i] / 10)

    cosine_sim = 1 - spatial.distance.cosine(lookup_table(word1), lookup_table(word2))
    cosine_err += abs(cosine_sim - y[i] / 10)
    # print(word1, word2, data.y[i], cosine_sim)

    analogies += 1
spearman_err = 1 - spearman_err / analogies
cosine_err = 1 - cosine_err / analogies
spearman_errors.append(spearman_err)
cosine_errors.append(cosine_err)
print("Spearman correlation error on {} dataset: {}".format(name, spearman_err))
print("Cosine similarity error on {} dataset: {}".format(name, cosine_err))

----------SIMILARITY----------
Spearman correlation error on WS353 dataset: 0.7664052805893533
Cosine similarity error on WS353 dataset: 0.7648871485857822


## Evaluation on SCWS dataset

In [104]:
tot_err = 0
sentences = 0
for line in rating:
    line = [splits for splits in line.split("\t") if splits is not ""]
    # get the word and line in each line separately
    w1 = line[1].lower()
    w2 = line[3].lower()
    if w1 not in vocab or w2 not in vocab:
        continue
    sentence1 = line[5].lower()
    sentence2 = line[6].lower()
    sentence1_split = [splits for splits in sentence1.split(" ") if splits in vocab]
    sentence2_split = [splits for splits in sentence2.split(" ") if splits in vocab]

    sentence1_clean = " ".join(sentence1_split)
    sentence2_clean = " ".join(sentence2_split)
    # get the word index for look up 
    idx_w1 = sentence1_split.index(w1)
    idx_w2 = sentence2_split.index(w2)
    # get the disambiguated embedding for each sentence
    e1, _ = sentence_disambiguation(sentence1_clean, embed, vocab, cosine_sim_threshold, score_margin_threshold)
    sense1 = e1[idx_w1]
    e2, _ = sentence_disambiguation(sentence2_clean, embed, vocab, cosine_sim_threshold, score_margin_threshold)
    sense2 = e2[idx_w2]
    # compare the distance between two sense vectors
    dist1_2 = abs(cosine_similarity(sense1, sense2))

    values = line[-11:-1]
    values = [float(x) for x in values]
    real_value = average(values, 0) / 10
    err = abs(dist1_2 - real_value)
    tot_err += err
    print(sentences, w1, w2, dist1_2, real_value, err)
    sentences += 1

print("Similarity on SCWS dataset: ", 1 - tot_err / sentences)

0 brazil nut 0.4206601679325104 0.121 0.2996601679325104
1 brazil triple 0.6368697881698608 0.06599999999999999 0.5708697881698609
2 harvard yale 0.9672539830207825 0.5740000000000001 0.3932539830207824
3 harvard cambridge 0.8594754338264465 0.821 0.03847543382644658
4 israel east 0.7577135562896729 0.21100000000000002 0.5467135562896728
5 israel israeli 0.8332198858261108 0.8109999999999999 0.022219885826110897
6 japanese american 0.892179012298584 0.35 0.542179012298584
7 jerusalem israel 0.8633813858032227 0.812 0.0513813858032226
8 jerusalem wall 0.865024745464325 0.032999999999999995 0.8320247454643249
9 mexico brazil 0.9576384425163269 0.42400000000000004 0.5336384425163269
10 mexico puebla 0.9329000115394592 0.5599999999999999 0.3729000115394593
11 opec saudi 0.9188096523284912 0.55 0.36880965232849117
12 palestinian arab 0.8685761094093323 0.441 0.4275761094093323
13 wednesday news 0.8457539677619934 0.21600000000000003 0.6297539677619934
14 wednesday weekday 0.9778733849525452

In [106]:
np.save('E:/Word2vec/embeddings_sense.npy', embed)