In [None]:
import numpy as np
import os
from IMDBModel import IMDBModel
from embedding import Embedding
from keras.preprocessing import sequence
import time
from glove_utils import load_embedding
from data_utils import IMDBDataset
from pprint import pprint

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
# Use this when running the notebook remotely: subsequent print() calls are
# written to the kernel process's /dev/stdout instead of the notebook's stream.
import sys

# Keep a reference to the stream Jupyter installed so it can be put back
# later with `sys.stdout = jupyter_stdout`.
jupyter_stdout = sys.stdout
console_stream = open('/dev/stdout', 'w')
sys.stdout = console_stream
print('Printing in console', flush = True)

In [None]:
# Locations of the two pre-trained vector files read by this cell.
GLOVE_FILENAME = 'data/glove.6B.300d.txt'
COUNTERFITTED_GLOVE_FILENAME = 'data/counter-fitted-vectors-300.txt'

# --- Original GloVe vectors -------------------------------------------------
print('Loading GLoVe vectors...')
start_time = time.time()
word2index, index2word, index2embedding = load_embedding(GLOVE_FILENAME)
glove_vocab_size = len(word2index)
print('Loaded %s word vectors in %f seconds' % (glove_vocab_size, time.time() - start_time))
embedding = Embedding(word2index, index2word, index2embedding)

# --- Counter-fitted vectors -------------------------------------------------
print('Loading counter-fitted vectors...')
start_time = time.time()
c_word2index, c_index2word, c_index2embedding = load_embedding(COUNTERFITTED_GLOVE_FILENAME)
counter_vocab_size = len(c_word2index)
print('Loaded %s word vectors in %f seconds' % (counter_vocab_size, time.time() - start_time))
counter_embedding = Embedding(c_word2index, c_index2word, c_index2embedding)

# Joined representation: the original GloVe embedding combined with the
# counter-fitted vectors through Embedding.replace_embeddings.
synonyms_embedding = Embedding.replace_embeddings(embedding, counter_embedding)

In [None]:
# Load the IMDB train/test splits and bring every sequence to a fixed length.
maxlen = 400
batch_size = 32
print('Loading data...')
train_split, test_split = IMDBDataset.load_data()
train_text, x_train, y_train = train_split
test_text, x_test, y_test = test_split

# Same padding/truncation settings for both splits: pad and cut at the front
# so each sequence ends up with exactly `maxlen` entries.
pad_options = dict(maxlen=maxlen, padding='pre', truncating='pre')
x_train = sequence.pad_sequences(x_train, **pad_options)
x_test = sequence.pad_sequences(x_test, **pad_options)
print('Data loaded.')


In [None]:
# Build Frequency table
from collections import Counter

# Count how often every word index occurs across the train and test sequences.
# Counter.update() adds the counts of one row in place. The earlier
# `counter += Counter(row)` form built a temporary Counter per row and
# re-scanned the whole accumulated table after every addition, which gets
# slow as the table grows.
# NOTE(review): the rows were padded in the data-loading cell, so the padding
# value is counted as well -- confirm this is intended.
word_indexes_freqs = Counter()
for padded_split in (x_train, x_test):
    for row in padded_split:
        word_indexes_freqs.update(row)


In [None]:
# Number of distinct word indexes recorded in the frequency table
len(word_indexes_freqs)

In [None]:
# All (word index, count) pairs, ordered from most to least frequent
most_common_indexes = word_indexes_freqs.most_common()

In [None]:
# Translate every word index into its word through the GloVe embedding's
# index2word lookup, keeping the frequency alongside it.
most_common_words = [(embedding.index2word[index], freq) for (index, freq) in most_common_indexes]

In [None]:
# Preview the 20 most frequent (word, frequency) pairs
most_common_words[0:20]

In [None]:
# Location where the synonyms / distances dictionaries are saved
import pickle
prefix = 'data/syn_dict/'
if not os.path.exists(prefix):
    print('Creating directory ',prefix)
    # makedirs also creates a missing parent directory, and exist_ok=True
    # avoids a crash if the directory appears between the check and the call.
    os.makedirs(prefix, exist_ok=True)

# Tag inserted into the pickle file names ('syn_dict_<handle>.pickle',
# 'dist_dict_<handle>.pickle'); empty by default.
handle = ''

In [None]:
# Cache nearest neighbors for 25 words each time.
# After every batch the cumulative dictionaries are written to disk, so an
# interrupted run still leaves the results of all completed batches behind.
batch_size = 25
synonyms_dict = dict()
distances_dict = dict()

# Both files are overwritten on every iteration with the dictionaries built so far.
syn_path = prefix + 'syn_dict_' + handle + '.pickle'
dist_path = prefix + 'dist_dict_' + handle + '.pickle'

for i in range(0, len(most_common_words), batch_size):
    common_words = most_common_words[i:i+batch_size]
    words = [word for (word, _) in common_words]
    print("Getting the nearest neighbors for the following words: ", words, flush = True)
    start_time = time.time()
    synonyms_map, distances_map = synonyms_embedding.build_neighbors_map(words, N = 30, return_distances = True)
    # Merge this batch in place instead of copying the whole accumulated
    # dictionary on every iteration; later entries still override earlier ones.
    synonyms_dict.update(synonyms_map)
    distances_dict.update(distances_map)
    print("Built synonyms_dict in ", time.time() - start_time, " seconds" , flush = True)
    # The message shows the batch number, but the data goes to the single
    # cumulative file named with `handle`.
    print("Saving synonyms_dict_%d" % (i//batch_size), flush = True)
    print(40*'-')
    # `with` guarantees both files are flushed and closed, even if dump()
    # raises. The previous code wrote `dist_file.close` without parentheses,
    # so the distances file was never explicitly closed.
    with open(syn_path, 'wb') as syn_file:
        pickle.dump(synonyms_dict, syn_file)
    with open(dist_path, 'wb') as dist_file:
        pickle.dump(distances_dict, dist_file)

In [None]:
# Completion marker: when the cells are run in order, this prints after the
# neighbor-caching loop has ended.
print("Finished")

In [None]:
# Read the saved synonyms dictionary back from disk.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open(prefix + 'syn_dict_'+handle+ '.pickle','rb') as infile:
    new_dict = pickle.load(infile)

In [None]:
# Pretty-print the reloaded synonyms dictionary (prints every entry)
pprint(new_dict)

In [None]:
# Read the saved distances dictionary back from disk.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open(prefix + 'dist_dict_'+handle+'.pickle','rb') as infile:
    dist_dict = pickle.load(infile)

In [None]:
# Pretty-print the reloaded distances dictionary (prints every entry)
pprint(dist_dict)