# Retrofitted Word Embeddings:

In [110]:
# Imports:
import numpy as np
import pandas as pd
import io
import jsonlines
import json

from GPTModels import GPTTreeModel, GPTTreeModelV2

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.decomposition import PCA

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
# Input files: the base fastText vectors, their retrofitted counterpart, and
# the product taxonomy text file passed to the GPT tree models further down.
# NOTE(review): "taxomomy" is a typo for "taxonomy"; the name is reused by
# later cells, so any rename has to be applied everywhere at once.
e_file1 = "fasttext_word_vectors_wiki_news.vec"
e_file2 = "retrofitted_fasttext_word_vectors_wiki.vec"
product_taxomomy_file = "files/product_taxonomy.txt"

In [3]:
# Convert the base vectors (e_file1) into a word2vec-format text file whose
# path comes from gensim's get_tmpfile helper. The "glove" in the variable
# names only reflects the converter being used; the input is the fastText file.
# NOTE(review): glove2word2vec writes a "<count> <dim>" header in front of the
# data, so this presumably relies on the .vec file not already starting with
# such a header -- confirm against the actual file.
glove_file = e_file1
word2vec_glove_file = get_tmpfile("fasttext_wiki_news.txt")
# Last expression of the cell: its return value is what the notebook displays.
glove2word2vec(glove_file, word2vec_glove_file)

(999994, 300)

In [4]:
# Load the converted base vectors; `model` is the pretrained fastText baseline
# used in the nearest-neighbour comparison.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [5]:
# Same conversion as for the base vectors, applied to the retrofitted vectors
# (e_file2) and written to a separate temporary file.
# NOTE(review): as with the base file, this presumably assumes the input has
# no leading "<count> <dim>" header line -- confirm.
glove_file2 = e_file2
word2vec_glove_file2 = get_tmpfile("retrofitted_fasttext_wiki_news.txt")
# Last expression of the cell: its return value is what the notebook displays.
glove2word2vec(glove_file2, word2vec_glove_file2)

(830461, 300)

In [6]:
# Load the converted retrofitted vectors; `model2` is the retrofitted fastText
# model compared against the baseline `model`.
model2 = KeyedVectors.load_word2vec_format(word2vec_glove_file2)

In [35]:
# Tree model built from the product taxonomy file; according to the notes in
# this notebook its nodes are word tokens.
# NOTE(review): "GPT" presumably stands for Google Product Taxonomy -- confirm.
model3 = GPTTreeModel(product_taxomomy_file)

In [36]:
# Second tree model built from the same taxonomy file; according to the notes
# in this notebook its nodes are whole categories (multi-word tokens).
model4 = GPTTreeModelV2(product_taxomomy_file)

# Nearest Neighbors:

* model = the pretrained fastText embeddings.
* model2 = the retrofitted fastText embeddings (retrofitted using the GPT tree).
* model3 = the basic tree model of the GPT tree, where nodes are individual word tokens.
* model4 = the basic tree model of the GPT tree, where nodes are categories (multi-word tokens).

In [93]:
# Query words for which the four models' nearest neighbours are compared.
sample_word_list = ["wallet", "dresses", "sneakers", "eyelash", "nail", "headphone", "jewelry", "bridal", "medical", "industrial", "circuit", "beverage"]

In [107]:
def getNeighbors(word_list, num_neighbors):
    """Gather each query word's nearest neighbours from all four models.

    The notebook-level objects ``model``, ``model2``, ``model3`` and
    ``model4`` are queried in that order for every word.

    Args:
        word_list: Iterable of query words. A word that occurs more than
            once keeps the result of its last occurrence.
        num_neighbors: Number of neighbours requested from each model. It is
            passed as ``topn=`` to the two embedding models and positionally
            to the two tree models.

    Returns:
        A dict mapping each query word to a dict that maps a model label to
        the list returned by that model's ``most_similar`` call. In the
        notebook output the embedding models yield ``(word, score)`` pairs
        and the tree models yield plain strings.
    """

    def lookup(query):
        # Dict literals evaluate their values top to bottom, so the models
        # are consulted in the order base, retrofitted, word tree, category tree.
        return {
            "fast_text_base_model": list(
                model.most_similar(query, topn=num_neighbors)
            ),
            "fast_text_retrofitted_model": list(
                model2.most_similar(query, topn=num_neighbors)
            ),
            "gpt_tree_word_token_model": list(
                model3.most_similar(query, num_neighbors)
            ),
            "gpt_tree_category_token_model": list(
                model4.most_similar(query, num_neighbors)
            ),
        }

    return {query: lookup(query) for query in word_list}

In [112]:
# Ask every model for 10 neighbours of each sample word.
neighbors_results = getNeighbors(sample_word_list, 10)

In [119]:
# Print a plain-text report: one section per query word, one line per model.
for key, val in neighbors_results.items():
    print("Query Word = ", key)
    # val maps each model label to that model's neighbour list for this word.
    for k, v in val.items():
        print(k, " : ", v)
    # Blank lines separate one query word's section from the next.
    print("\n\n")

Query Word =  wallet
fast_text_base_model  :  [('wallets', 0.8026574850082397), ('billfold', 0.7117377519607544), ('purse', 0.6554557681083679), ('briefcase', 0.6548330783843994), ('suitcase', 0.6407263278961182), ('pocketbook', 0.6318467855453491), ('handbag', 0.628136396408081), ('bag', 0.6226708889007568), ('pocket', 0.6206861138343811), ('Wallet', 0.6022135019302368)]
fast_text_retrofitted_model  :  [('trainers', 0.9241711497306824), ('collar', 0.9241073131561279), ('closet', 0.9238914847373962), ('gloves', 0.923241913318634), ('pockets', 0.9232417345046997), ('bathroom', 0.9217934608459473), ('supplies', 0.921453595161438), ('balloon', 0.9209336042404175), ('dummies', 0.9207550287246704), ('towel', 0.9199435710906982)]
gpt_tree_word_token_model  :  ['keychains', 'chains', 'checkbook', 'wallet', 'accessories', 'apparel', 'handbag', 'lanyards', 'covers']
gpt_tree_category_token_model  :  ['business card cases', 'apparel & accessories', 'checkbook covers', 'keychains', 'badge & pass 

In [120]:
# Write the full comparison to a JSON file; the path is relative, so it is
# created in the current working directory.
results_file_name = "retrofitting_models_neighbors_comparison.json"
with open(results_file_name, "w") as f:
    json.dump(neighbors_results, f)