In [1]:
import sys

import numpy as np

np.random.seed(42)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import pandas as pd
import gensim

In [2]:
# Later cells open files with paths relative to the repository root
# ('dataset/...', './1-synonyms/...'), so the kernel's working directory must be the repo root.
# sys.path is printed so you can see where the kernel was started from.
print(sys.path)
# NOTE(review): absolute, machine-specific path — change it to your own checkout location.
%cd '/Users/axelsirota/repos/ml-solr-course'

['/Users/axelsirota/repos/ml-solr-course/1-synonyms/lab1/solutions', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python37.zip', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7', '/Users/axelsirota/.pyenv/versions/3.7.3/lib/python3.7/lib-dynload', '', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages', '/Users/axelsirota/repos/ml-solr-course/.venv/lib/python3.7/site-packages/IPython/extensions', '/Users/axelsirota/.ipython']
/Users/axelsirota/repos/ml-solr-course


In [3]:
# Read the tab-separated training queries; the two column names are supplied
# explicitly through `names`. The path is relative to the current working directory.
path = 'dataset/docv2_train_queries.tsv'
queries = pd.read_csv(
    path,
    sep='\t',
    lineterminator='\r',
    names=['query_id', 'query'],
)
queries.head()


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,query_id,query
0,121352,define extreme
1,510633,tattoo fixers how much does it cost
2,674172,what is a bank transit number
3,570009,what are the four major groups of elements
4,54528,blood clots in urine after menopause


In [4]:
# Keep only cells that are real strings (non-string cells are skipped) and that
# split into at least three pieces on single spaces.
corpus = [
    sentence
    for sentence in queries['query'].values
    if isinstance(sentence, str) and len(sentence.split(' ')) >= 3
]

In [5]:
# --- Vocabulary -------------------------------------------------------------
# Fit the tokenizer on the raw query text, then replace each query by its list of word ids.
# NOTE(review): `corpus` is rebound from text to id sequences here, so this cell is not
# idempotent — re-run the cell that builds `corpus` before re-running this one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)

# Total number of word occurrences across all tokenized queries.
nb_samples = sum(map(len, corpus))
# One more than the number of distinct words, leaving id 0 free
# (pad_sequences pads with 0 by default).
V = len(tokenizer.word_index) + 1

# --- Hyper-parameters -------------------------------------------------------
dim = 100            # embedding size
window_size = 3      # context words taken on each side of the target
epochs = 50
batch_size = 1000    # queries per mini-batch
BATCH = True         # True: train batch by batch; False: build everything in memory first


In [6]:
# Quick sanity check of the tokenized corpus: a few id sequences and the number of queries.
print(
    f'First 5 corpus items are {corpus[:5]}',
    f'Length of corpus is {len(corpus)}',
    sep='\n',
)


First 5 corpus items are [[1528, 38656, 6, 24, 9, 31, 20], [1, 2, 5, 341, 4411, 43], [1, 11, 3, 913, 329, 2058, 4, 757], [53, 4020, 7, 239, 82, 1948], [1, 2, 1090, 38657]]
Length of corpus is 312265


In [7]:
# CBOW network: embed the 2*window_size context ids, average the embeddings over
# the context axis, then predict the target word with a softmax over the vocabulary.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

2021-07-23 19:20:24.823118: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Targets are one-hot rows of width V, hence categorical (not sparse) cross-entropy.
cbow.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)
cbow.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 100)            7644400   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 76444)             7720844   
Total params: 15,365,244
Trainable params: 15,365,244
Non-trainable params: 0
_________________________________________________________________


In [9]:
def generate_data(corpus, window_size, V, batch_size=batch_size):
    """Yield CBOW training mini-batches built from tokenized queries.

    Args:
        corpus: sequence of queries, each a list of integer word ids.
        window_size: number of context words taken on each side of the target.
        V: width of the one-hot target rows (vocabulary size including id 0).
        batch_size: number of queries per mini-batch; the last batch may be smaller.

    Yields:
        Tuples ``(X, Y)`` where ``X`` has shape ``(n, 2 * window_size)`` and holds
        left-padded context ids, and ``Y`` has shape ``(n, V)`` and holds the
        one-hot target, with ``n`` the number of queries in the batch.
        No batch is yielded for an empty corpus, and no empty trailing batch is
        produced when ``len(corpus)`` is an exact multiple of ``batch_size``.
    """
    maxlen = window_size*2
    # Ceiling division: the previous `len // batch_size + 1` produced an extra
    # zero-row batch whenever len(corpus) was a multiple of batch_size.
    number_of_batches = (len(corpus) + batch_size - 1) // batch_size
    for batch in range(number_of_batches):
        lower_end = batch*batch_size
        upper_end = min(lower_end + batch_size, len(corpus))
        mini_batch_size = upper_end - lower_end
        X = np.zeros((mini_batch_size, maxlen))
        Y = np.zeros((mini_batch_size, V))
        for query_id, words in enumerate(corpus[lower_end:upper_end]):
            L = len(words)
            for index, word in enumerate(words):
                # Context = up to window_size ids on each side of the target, target excluded.
                s = index - window_size
                e = index + window_size + 1
                context = [words[i] for i in range(s, e) if 0 <= i < L and i != index]

                x = sequence.pad_sequences([context], maxlen=maxlen)
                y = np_utils.to_categorical([word], V)
                # NOTE(review): the row is indexed by query, not by word, so each word
                # overwrites the previous one and only the LAST word of a query ends up
                # as a training sample. Kept as-is because emitting one row per word
                # would change the batch shapes; confirm whether that is intended.
                X[query_id] = x
                Y[query_id] = y
        yield (X, Y)



In [10]:
# If data is small, you can just generate the whole dataset and load it in memory to use the fit method
#
# def generate_data(corpus, window_size, V):
#         maxlen = window_size*2
#         X = np.zeros((len(corpus), maxlen))
#         Y = np.zeros((len(corpus), V))
#         for query_id, words in enumerate(corpus):
#             L = len(words)
#             for index, word in enumerate(words):
#                 contexts = []
#                 labels   = []            
#                 s = index - window_size
#                 e = index + window_size + 1

#                 contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
#                 labels.append(word)

#                 x = sequence.pad_sequences(contexts, maxlen=maxlen)
#                 y = np_utils.to_categorical(labels, V)
#                 X[query_id] = x
#                 Y[query_id] = y
#         return (X, Y)


In [11]:
def fit_model():
    """Train the module-level ``cbow`` model on ``corpus``.

    When ``BATCH`` is false, the whole corpus is materialised as a single
    ``(X, Y)`` pair and passed to ``cbow.fit`` for ``epochs`` epochs.

    When ``BATCH`` is true, mini-batches from ``generate_data`` are fed to
    ``cbow.train_on_batch`` one at a time. In this mode ``epochs`` acts as a cap
    on the number of mini-batches: the loop makes a single pass over the
    generator and stops after ``epochs`` batches.

    Raises:
        ValueError: in non-batch mode, if ``corpus`` is empty.
    """
    if not BATCH:
        if not corpus:
            raise ValueError('corpus is empty; nothing to train on')
        # generate_data is a generator, so it cannot be unpacked into (X, Y) directly.
        # Ask for one batch that spans the whole corpus and take it.
        X, Y = next(generate_data(corpus, window_size, V, batch_size=len(corpus)))
        print(f'Size of X is {X.shape} and Y is {Y.shape}')
        cbow.fit(X, Y, epochs = epochs)
    else:
        for iteration, (x, y) in enumerate(generate_data(corpus, window_size, V), start=1):
            print(f'Training on Iteration: {iteration}')
            metrics = cbow.train_on_batch(x, y, reset_metrics=False, return_dict=True)
            print(metrics)
            if iteration >= epochs:
                break

In [None]:
# Run training; prints the iteration number and the loss dict for every mini-batch when BATCH is True.
fit_model()

Training on Iteration: 1


2021-07-23 19:20:36.296448: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


{'loss': 11.24425983428955}
Training on Iteration: 2
{'loss': 11.243762016296387}
Training on Iteration: 3
{'loss': 11.243168830871582}
Training on Iteration: 4
{'loss': 11.242558479309082}
Training on Iteration: 5
{'loss': 11.241923332214355}
Training on Iteration: 6
{'loss': 11.2412691116333}
Training on Iteration: 7
{'loss': 11.240599632263184}
Training on Iteration: 8
{'loss': 11.239843368530273}
Training on Iteration: 9
{'loss': 11.239004135131836}
Training on Iteration: 10
{'loss': 11.238199234008789}
Training on Iteration: 11
{'loss': 11.23741340637207}
Training on Iteration: 12
{'loss': 11.236527442932129}
Training on Iteration: 13
{'loss': 11.235565185546875}
Training on Iteration: 14
{'loss': 11.23462200164795}
Training on Iteration: 15
{'loss': 11.233726501464844}
Training on Iteration: 16
{'loss': 11.232732772827148}
Training on Iteration: 17
{'loss': 11.231771469116211}
Training on Iteration: 18
{'loss': 11.230607032775879}
Training on Iteration: 19
{'loss': 11.22945976257

Training on Iteration: 151
{'loss': 10.492704391479492}
Training on Iteration: 152
{'loss': 10.483600616455078}
Training on Iteration: 153
{'loss': 10.47494888305664}
Training on Iteration: 154
{'loss': 10.466867446899414}
Training on Iteration: 155
{'loss': 10.458367347717285}
Training on Iteration: 156
{'loss': 10.449788093566895}
Training on Iteration: 157
{'loss': 10.441794395446777}
Training on Iteration: 158
{'loss': 10.432795524597168}
Training on Iteration: 159
{'loss': 10.424227714538574}
Training on Iteration: 160
{'loss': 10.415639877319336}
Training on Iteration: 161
{'loss': 10.406964302062988}
Training on Iteration: 162
{'loss': 10.39923095703125}
Training on Iteration: 163
{'loss': 10.390838623046875}
Training on Iteration: 164
{'loss': 10.382746696472168}
Training on Iteration: 165
{'loss': 10.374244689941406}
Training on Iteration: 166
{'loss': 10.36579418182373}
Training on Iteration: 167
{'loss': 10.357924461364746}
Training on Iteration: 168
{'loss': 10.349378585815

In [None]:
# Export the learned embeddings in word2vec text format: a "<count> <dim>" header
# followed by one "<word> <v1> ... <vdim>" line per vocabulary word.
# The first weight array of the model is the Embedding matrix (row i = vector of word id i).
# It is fetched before the file is opened so a failure here does not truncate an existing file.
vectors = cbow.get_weights()[0]
# Explicit UTF-8: the file is read back by gensim, which decodes it as UTF-8,
# while open() would otherwise use the platform's default encoding.
with open('./1-synonyms/lab1/vectors.txt', 'w', encoding='utf-8') as f:
    # The header count is the number of rows actually written below (equals V-1).
    f.write('{} {}\n'.format(len(tokenizer.word_index), dim))
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(word, str_vec))


In [None]:
# Load the exported text-format vectors into gensim so they can be queried by word.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./1-synonyms/lab1/vectors.txt', binary=False)

In [None]:
# Words whose learned vectors are closest to 'gasoline'.
w2v.most_similar(positive=['gasoline'])

In [None]:
# Same query with 'apple' passed as a negative example, i.e. words ranked as least like 'apple'.
w2v.most_similar(negative=['apple'])