In [1]:
import sys

import numpy as np

np.random.seed(42)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
import pandas as pd
import gensim

In [2]:
# In case your sys.path does not contain the base repo, go there.
print(sys.path)
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/1-synonyms/lab2/solutions', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
path = 'dataset/docv2_train_queries.tsv'
queries = pd.read_csv(path, sep='\t', lineterminator='\r', names=['query_id', 'query'])[:20000]
queries.head()


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,query_id,query
0,121352,define extreme
1,510633,tattoo fixers how much does it cost
2,674172,what is a bank transit number
3,570009,what are the four major groups of elements
4,54528,blood clots in urine after menopause


In [4]:
corpus = [sentence for sentence in queries['query'].values if type(sentence) == str and len(sentence.split(' ')) >= 3]

In [5]:
path_to_glove_file = "./dataset/glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400001 word vectors.


In [6]:

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1
dim = 100
window_size = 3
epochs=50
batch_size = 5000
BATCH = False


In [7]:
print(f'First 5 corpus items are {corpus[:5]}')
print(f'Length of corpus is {len(corpus)}')

First 5 corpus items are [[1129, 7060, 6, 23, 9, 29, 21], [1, 2, 5, 296, 3382, 40], [1, 11, 3, 613, 371, 2691, 4, 614], [56, 2231, 7, 199, 89, 927], [1, 2, 1432, 7061]]
Length of corpus is 19403


In [8]:
num_tokens = V + 1
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))



Converted 15878 words (1324 misses)


In [9]:
cbow = Sequential()
cbow.add(Embedding(input_dim=num_tokens, output_dim=dim, input_length=window_size*2, embeddings_initializer=Constant(embedding_matrix),
    trainable=False))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))

2021-07-23 19:29:22.210476: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
cbow.compile(loss='categorical_crossentropy', optimizer='adam')


In [11]:
cbow.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 100)            1720400   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 17203)             1737503   
Total params: 3,457,903
Trainable params: 1,737,503
Non-trainable params: 1,720,400
_________________________________________________________________


In [12]:
def generate_data(corpus, window_size, V, batch_size=batch_size):
    number_of_batches = (len(corpus) // batch_size) + 1
    for batch in range(number_of_batches):
        lower_end = batch*batch_size
        upper_end = (batch+1)*batch_size if batch+1 < number_of_batches else len(corpus)
        mini_batch_size = upper_end - lower_end
        maxlen = window_size*2
        X = np.zeros((mini_batch_size, maxlen))
        Y = np.zeros((mini_batch_size, V))
        for query_id, words in enumerate(corpus[lower_end:upper_end]):
            L = len(words)
            for index, word in enumerate(words):
                contexts = []
                labels   = []            
                s = index - window_size
                e = index + window_size + 1

                contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
                labels.append(word)

                x = sequence.pad_sequences(contexts, maxlen=maxlen)
                y = np_utils.to_categorical(labels, V)
                X[query_id] = x
                Y[query_id] = y
        yield (X, Y)



In [13]:
# If data is small, you can just generate the whole dataset and load it in memory to use the fit method
#
if not BATCH:
    def generate_data(corpus, window_size, V):
            maxlen = window_size*2
            X = np.zeros((len(corpus), maxlen))
            Y = np.zeros((len(corpus), V))
            for query_id, words in enumerate(corpus):
                L = len(words)
                for index, word in enumerate(words):
                    contexts = []
                    labels   = []            
                    s = index - window_size
                    e = index + window_size + 1

                    contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
                    labels.append(word)

                    x = sequence.pad_sequences(contexts, maxlen=maxlen)
                    y = np_utils.to_categorical(labels, V)
                    X[query_id] = x
                    Y[query_id] = y
            return (X, Y)


In [14]:
def fit_model():
    if not BATCH:
        X, Y = generate_data(corpus, window_size, V)
        print(f'Size of X is {X.shape} and Y is {Y.shape}')
        cbow.fit(X, Y, epochs = epochs)
    else:
        index = 1
        for x, y in generate_data(corpus, window_size, V):
            print(f'Training on Iteration: {index}')
            index += 1
            history = cbow.train_on_batch(x, y, reset_metrics=False, return_dict=True)
            print(history)

In [15]:
fit_model()

Size of X is (19403, 6) and Y is (19403, 17203)


2021-07-23 19:29:29.200362: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
with open('./1-synonyms/lab2/vectors.txt' ,'w') as f:
    f.write('{} {}\n'.format(V-1, dim))
    vectors = cbow.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, list(vectors[i, :])))
        f.write('{} {}\n'.format(word, str_vec))


In [21]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./1-synonyms/lab2/vectors.txt', binary=False)

In [22]:
w2v.most_similar(positive=['gasoline'])

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


[('petrol', 0.8148176670074463),
 ('fuel', 0.8057637214660645),
 ('heating', 0.7722175717353821),
 ('crude', 0.7561777234077454),
 ('diesel', 0.755372941493988),
 ('prices', 0.7397462725639343),
 ('gallon', 0.7387636303901672),
 ('gas', 0.7218859195709229),
 ('fuels', 0.6957605481147766),
 ('oil', 0.6737465858459473)]

In [23]:
w2v.most_similar(positive=['grape'])

[('grapes', 0.7915149927139282),
 ('wine', 0.7132691144943237),
 ('varieties', 0.7115378975868225),
 ('chardonnay', 0.7091464996337891),
 ('pinot', 0.7055837512016296),
 ('vines', 0.681800127029419),
 ('vine', 0.6526992917060852),
 ('fruit', 0.6477821469306946),
 ('tomato', 0.6304808855056763),
 ('citrus', 0.6102907657623291)]