## Prepare Referece files using Word2Vec for retrieving attributes


In [1]:
import gensim.downloader as api
import numpy as np

wv = api.load("word2vec-google-news-300")

In [2]:
# test
print(wv.most_similar(positive=["king", "woman"], negative=["man"], topn=5))

[('queen', 0.7118192911148071), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133)]


In [4]:
positive_words = ["good", "great", "beautiful", "positive", "delicious"]
negative_words = ["bad", "worse", "awful", "negative", "disgusting"]

v_positive = np.average(np.array(list(map(lambda x: wv[x], positive_words))), axis=0)
v_negative = np.average(np.array(list(map(lambda x: wv[x], negative_words))), axis=0)

In [17]:
from typing import List


def retrieve_antonyms(attributes: List[str], to_positive: bool = True) -> List[str]:
    transferred_attributes = list()
    for attribute in attributes:
        if to_positive:
            v_query = wv[attribute] + v_positive - v_negative
        else:
            v_query = wv[attribute] - v_positive + v_negative
        antonym = wv.most_similar(positive=[v_query], topn=1)
        transferred_attributes.append(antonym[0][0])
    return transferred_attributes

In [18]:
# test
print(retrieve_antonyms(["sadly", "noisy", "hungry"]))

['sadly', 'noisy', 'hungry']


In [None]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

In [None]:
def create_file(input_file, output_file, to_positive=True):
    with open(output_file, "w") as out_fp:
        input_lines = read_file(input_file)
        for input_line in input_lines:
            # each input line looks like this:
            # [ATTRS]delicious<CONT_START>I never tasted such <REPLACE> soup.<END>
            splits = input_line.split("<CONT_START>")
            attributes = splits[0].remove("[ATTRS]")
            antonyms = retrieve_antonyms(attributes, to_positive)
            output_line = "[ATTRS]" + " ".join(antonyms) + splits[1]
            out_fp.write(output_line)

In [None]:
data_dir = "/home/jack/Desktop/NN/clean/datasets/yelp"

reference_0 = data_dir + "/processed_files_with_bert_with_best_head/reference_0.txt"
reference_1 = data_dir + "/processed_files_with_bert_with_best_head/reference_1.txt"
out_reference_0 = data_dir + "/processed_files_with_bert_with_best_head/word2vec/reference_0.txt"
out_reference_1 = data_dir + "/processed_files_with_bert_with_best_head/word2vec/reference_1.txt"

create_file(reference_0, out_reference_0, True)
create_file(reference_1, out_reference_1, False)