In [None]:
import sys

import numpy as np

np.random.seed(42)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
import pandas as pd
import gensim

In [None]:
# In case your sys.path does not contain the base repo, cd there.
print(sys.path)
%cd 'PATH_OF_BASE_REPO'  # In the solution it will be the path to my repo. This is such that python loads al the files from the top.

In [None]:
# Read the tab-separated training queries (column names are supplied here
# explicitly) and keep only the first 20000 rows.
path = 'dataset/docv2_train_queries.tsv'
queries = pd.read_csv(
    path,
    names=['query_id', 'query'],
    sep='\t',
    lineterminator='\r',
)[:20000]
queries.head()


In [None]:
# Keep only entries that are plain strings (this drops missing values) and
# that contain at least three space-separated tokens.
corpus = [
    text
    for text in queries['query'].values
    if type(text) is str and len(text.split(' ')) >= 3
]

In [None]:
# We load the pretrained embedding
path_to_glove_file = "./dataset/glove.6B.100d.txt"

# Maps each token in the GloVe file to its coefficient vector.
embeddings_index = {}
# The GloVe text files are distributed as UTF-8; state the encoding explicitly
# so that reading does not depend on the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <coef_1> <coef_2> ...": split off the token only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:

# --- Hyper-parameters -------------------------------------------------------
dim = 100            # embedding size (the GloVe file loaded above is the 100d one)
window_size = 3      # number of context words taken on each side of the target
epochs = 50          # passes over the data used when fitting
batch_size = 5000    # queries per batch for the generator-based training path
BATCH = False        # False: build the whole dataset in memory; True: train batch by batch

# --- Vocabulary -------------------------------------------------------------
# Fit the tokenizer on the raw queries, then replace every query by its
# sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)

# Total number of tokens over all queries.
nb_samples = sum(map(len, corpus))
# Vocabulary size: word ids start at 1, index 0 is reserved (used for padding).
V = len(tokenizer.word_index) + 1


In [None]:
# Quick sanity check of the tokenised corpus: a sample of sequences and the total count.
print(
    f'First 5 corpus items are {corpus[:5]}',
    f'Length of corpus is {len(corpus)}',
    sep='\n',
)

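Now comes the interesting part: we need to construct a matrix of shape `(V+1) x dim`. For each word in the tokenizer's vocabulary, we look up its vector in the pretrained embedding and copy it into the row given by the word's index. If the word does not exist in the pretrained embedding, its row is simply left filled with zeros.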

In [None]:
num_tokens = V + 1
hits = 0
misses = 0

# Prepare embedding matrix: one row per token id, initialised to zeros.
# Words not found in the embedding index keep their all-zeros row.
# This includes the representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, dim))
for word, i in tokenizer.word_index.items():
    # Get the embedding vector from the GloVe embedding (None when the word is absent).
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))



cbow = Sequential()
# Embedding layer initialised from the matrix built above and frozen
# (trainable=False), so that we start from the pretrained embedding.
# input_dim must equal the number of rows of embedding_matrix; input_length is
# the context width (window_size words on each side of the target).
cbow.add(Embedding(
    input_dim=num_tokens,
    output_dim=dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=window_size * 2,
    trainable=False,
))
# Average the context word vectors into a single vector of size `dim`.
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
# Predict the target word as a distribution over the V vocabulary ids.
cbow.add(Dense(V, activation='softmax'))

In [None]:
# The targets are one-hot rows over the vocabulary, hence categorical cross-entropy.
cbow.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)


In [None]:
# Print the model's layers together with their parameter counts.
cbow.summary()

Notice the non-trainable parameters! The pretrained embeddings are frozen, so the only thing we are training is the softmax layer on top of them. Reusing pretrained weights in this way is commonly (if loosely) referred to as fine-tuning the embedding.

In [None]:
def generate_data(corpus, window_size, V, batch_size=batch_size):
    """Yield CBOW training batches ``(X, Y)``, ``batch_size`` queries at a time.

    Args:
        corpus: list of queries, each a list of integer word ids.
        window_size: number of context words taken on each side of the target.
        V: vocabulary size, i.e. the width of the one-hot target rows.
        batch_size: number of queries per yielded batch; the last batch may be
            smaller.

    Yields:
        Tuples ``(X, Y)`` where ``X`` has shape ``(n_queries, 2 * window_size)``
        and holds the padded context word ids, and ``Y`` has shape
        ``(n_queries, V)`` and holds the one-hot encoded target word.

    Unlike a plain ``len(corpus) // batch_size + 1`` batch count, no empty
    trailing batch is produced when ``len(corpus)`` is an exact multiple of
    ``batch_size`` (or when ``corpus`` is empty).

    NOTE(review): every word of a query writes to the row of that query, so
    only the (context, target) pair of the *last* word of each query survives.
    Confirm whether one sample per word was intended.
    """
    maxlen = window_size*2
    # Stepping by batch_size visits exactly the non-empty batches.
    for lower_end in range(0, len(corpus), batch_size):
        batch_queries = corpus[lower_end:lower_end + batch_size]
        X = np.zeros((len(batch_queries), maxlen))
        Y = np.zeros((len(batch_queries), V))
        for query_id, words in enumerate(batch_queries):
            L = len(words)
            for index, word in enumerate(words):
                s = index - window_size
                e = index + window_size + 1

                # Ids of the neighbours inside the window, excluding the target itself.
                contexts = [[words[i] for i in range(s, e) if 0 <= i < L and i != index]]
                labels = [word]

                # Pad short contexts to a fixed width and one-hot encode the target.
                X[query_id] = sequence.pad_sequences(contexts, maxlen=maxlen)
                Y[query_id] = np_utils.to_categorical(labels, V)
        yield (X, Y)



In [None]:
# If data is small, you can just generate the whole dataset and load it in
# memory to use the fit method.
if not BATCH:
    def generate_data(corpus, window_size, V):
        """Build the whole CBOW dataset in memory and return it as ``(X, Y)``.

        ``X`` has one row per query holding padded context word ids
        (width ``2 * window_size``); ``Y`` has one row per query holding the
        one-hot encoded target word (width ``V``).

        NOTE(review): each word of a query overwrites the row of that query,
        so only the last word's (context, target) pair is kept per query.
        Confirm whether one sample per word was intended.
        """
        context_width = window_size*2
        X = np.zeros((len(corpus), context_width))
        Y = np.zeros((len(corpus), V))
        for row, token_ids in enumerate(corpus):
            n_tokens = len(token_ids)
            for centre, target in enumerate(token_ids):
                first = centre - window_size
                past_last = centre + window_size + 1
                # Neighbours inside the window that exist in the query, target excluded.
                neighbours = [
                    token_ids[pos]
                    for pos in range(first, past_last)
                    if pos != centre and 0 <= pos < n_tokens
                ]
                X[row] = sequence.pad_sequences([neighbours], maxlen=context_width)
                Y[row] = np_utils.to_categorical([target], V)
        return (X, Y)


In [None]:
def fit_model():
    """Train the global ``cbow`` model for ``epochs`` passes over ``corpus``.

    When ``BATCH`` is false the whole dataset is built in memory and passed to
    ``cbow.fit``. Otherwise the data is produced batch by batch by
    ``generate_data`` and fed to ``cbow.train_on_batch``; the generator is
    re-created for every epoch so that this path also honours ``epochs``
    instead of stopping after a single pass.
    """
    if not BATCH:
        X, Y = generate_data(corpus, window_size, V)
        print(f'Size of X is {X.shape} and Y is {Y.shape}')
        cbow.fit(X, Y, epochs = epochs)
    else:
        for epoch in range(1, epochs + 1):
            print(f'Epoch {epoch}/{epochs}')
            batches = generate_data(corpus, window_size, V)
            for index, (x, y) in enumerate(batches, start=1):
                print(f'Training on Iteration: {index}')
                history = cbow.train_on_batch(x, y, reset_metrics=False, return_dict=True)
                print(history)

In [None]:
# Run the training, in memory or batch by batch depending on BATCH.
fit_model()

In [None]:
# Export the embedding weights in the word2vec text format: a header line
# "<number of words> <dimension>" followed by one "<word> <v1> <v2> ..." line per word.
# The first weight array of the model belongs to its first layer (the embedding).
# Fetch it before opening the file so a failure here does not leave a truncated file.
vectors = cbow.get_weights()[0]
# Write UTF-8 explicitly: the file is read back as UTF-8 by gensim, and the
# vocabulary may contain non-ASCII words.
with open('./1-synonyms/lab2/vectors.txt', 'w', encoding='utf-8') as f:
    # V - 1 words are written: index 0 is not in tokenizer.word_index.
    f.write('{} {}\n'.format(V-1, dim))
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(word, str_vec))


In [None]:
# Load the exported vectors back with gensim (text format, hence binary=False).
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './1-synonyms/lab2/vectors.txt',
    binary=False,
)

In [None]:
# Words whose vectors are most similar to the one of "gasoline".
w2v.most_similar(positive=['gasoline'])

In [None]:
# Words whose vectors are most similar to the one of "grape".
w2v.most_similar(positive=['grape'])

Do you notice the difference in accuracy? For any task, first check whether there is a pretrained model you can use!