In [1]:
# Train word2vec from scratch
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential 
from tensorflow.keras.layers import Input, Dense, Reshape,Embedding,dot
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from helper import build_dataset
import numpy as np

In [2]:
# Toy corpus: the example sentence from the lecture, tokenised on whitespace
raw_data = "Ignacio was hit by a red bus a while ago"
corpus = str.split(raw_data)

In [3]:
# Hyper-parameters for the skip-gram model with negative sampling
window_size = 2     # passed to `skipgrams` as its sampling window size
vector_dim = 300    # length of every word embedding vector

# One index per distinct token, plus one extra slot
# NOTE(review): the extra slot is presumably reserved by `build_dataset`
# (e.g. for an unknown-word id) — confirm in helper.py
vocab_size = 1 + len(set(corpus))

# The helper turns the list of tokens into sequential data and also returns
# the counts and the forward / reverse lookup dictionaries
data, count, dictionary, reverse_dictionary = build_dataset(
    corpus,
    vocab_size,
)

In [4]:
# Build the training set with the `skipgrams` utility from tensorflow.keras:
# `couples` holds [target, context] index pairs, `labels` the matching 0/1 flags
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
)

# Transpose the list of pairs into one sequence of targets and one of contexts
word_target, word_context = zip(*couples)

# Peek at the first five pairs together with their labels
print(couples[:5], labels[:5])

[[1, 6], [8, 9], [6, 5], [7, 1], [5, 5]] [1, 1, 1, 1, 0]


### ⏸ Which number from the `data` variable corresponds to **Ignacio**?


#### A. 5
#### B. 2
#### C. [1,0,0,0,0,1]
#### D. [0,1,0,0,0]

In [5]:
### edTest(test_chow1) ###
# Enter the letter of your chosen option as a string (e.g. 'A' for option A)
answer1 = "B"

### Word2Vec Skipgram with Negative sampling Model

In [6]:
# Assemble the skip-gram-with-negative-sampling (SGNS) word2vec network

# Target-word branch: look up a `vector_dim`-long embedding for a single
# word index, then reshape the result to a flat `vector_dim` vector
word_model = Sequential([
    Embedding(vocab_size, vector_dim, input_length=1, name='embedding'),
    Reshape((vector_dim,)),
])

# Context-word branch: same structure, with its own embedding table
context_model = Sequential([
    Embedding(vocab_size, vector_dim, input_length=1, name='context'),
    Reshape((vector_dim,)),
])

# Dot product of the target and the context word vectors ...
dot_product = dot(
    [word_model.output, context_model.output],
    axes=1,
    normalize=False,
    name='dotproduct',
)
# ... fed through a single sigmoid unit
dot_product = Dense(1, activation="sigmoid")(dot_product)

# The functional API ties both branches and the output together
# into the complete word2vec architecture
model = Model(
    inputs=[word_model.input, context_model.input],
    outputs=dot_product,
    name='SGNS',
)

# Print the layer-by-layer summary of the full architecture
model.summary()

Model: "SGNS"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_input (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
context_input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 300)       3000        embedding_input[0][0]            
__________________________________________________________________________________________________
context (Embedding)             (None, 1, 300)       3000        context_input[0][0]              
_______________________________________________________________________________________________

### ⏸ What are the dimensions of the embedding matrix for the above model?


#### A. $(300,100)$
#### B. $(10,300)$
#### C. $(300,1)$
#### D. $(300,10)$

In [7]:
### edTest(test_chow2) ###
# Submit an answer choice as a string below (eg. if you choose option A, put 'A')
# Stored in `answer2` (matching test_chow2) so that the answer to the first
# question, held in `answer1`, is not overwritten.
answer2 = 'B'

In [8]:
# Configure training: RMSprop optimiser minimising binary cross-entropy
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
)

In [9]:
# Let's choose a random training sample.
# `np.random.randint` excludes its upper bound, so passing `len(labels)`
# makes every index from 0 to len(labels)-1 (including the last sample)
# eligible; it also keeps working when there is only a single sample.
idx = np.random.randint(0, len(labels))

# Using the index we call the input values
# NOTE: we process the input to comply with the model
# i.e. changing dtype and shape (each becomes an array of shape (1,))
target_input = np.array(word_target[idx], dtype='float32').reshape(1,)
context_input = np.array(word_context[idx], dtype='float32').reshape(1,)
training_label = np.array(labels[idx], dtype='float32').reshape(1,)

# We use the tf.keras `model.train_on_batch` to train on a single batch
# (here a batch containing one sample) to demonstrate that our model works
loss = model.train_on_batch([target_input, context_input], training_label)
print(f'Loss after one epoch is {loss:.2f}')

Loss after one epoch is 0.62


### Rebuilding the SGNS word2vec architecture without the *embedding layer*

### ⏸ What does the `tf.keras.layers.Embedding` do?


#### A. It makes a dictionary with keys as input and values as weights
#### B. It converts an input of type `str` to type `float`
#### C. It one-hot encodes the input and adds an appropriate dense layer
#### D. It creates an $n$-dimensional list for an embedding size of $(n,)$

In [10]:
### edTest(test_chow3) ###
# Submit an answer choice as a string below (eg. if you choose option A, put 'A')
# Stored in `answer3` (matching test_chow3) so that the answer to the first
# question, held in `answer1`, is not overwritten.
answer3 = 'C'

In [11]:
# We build the sub-model for target words
# as a dense layer on a one-hot encoded input without bias term.
# Remember that the dense layer will have activation 'linear'
# and number of neurons equal to the embedding dimension
word_model = Sequential()
word_model.add(Input(shape=(1, vocab_size)))
word_model.add(Dense(vector_dim, activation='linear', use_bias=False))
word_model.add(Reshape((vector_dim, )))

# We build the same for the context words.
# The layer width is taken from `vector_dim` (not a hard-coded 300) so that
# it always agrees with the Reshape below and with the target-word branch
context_model = Sequential()
context_model.add(Input(shape=(1, vocab_size)))
context_model.add(Dense(vector_dim, activation='linear', use_bias=False))
context_model.add(Reshape((vector_dim, )))

# We use the `tf.keras.layers.dot` which returns the
# dot product of two output vectors
dot_product = dot([word_model.output, context_model.output], axes=1,
                  normalize=False, name='dotproduct')

# We also add a sigmoid to ensure the outputs are between 0 & 1
dot_product = Dense(1, activation="sigmoid")(dot_product)

# Similar to the model above we create our model with inputs
# from `word_model` and `context_model` and the output from
# the `dot_product`
model = Model(inputs=[word_model.input, context_model.input],
              outputs=dot_product, name='Custom')

# Again we run the model summary to ensure we have built the
# word2vec architecture correctly
# Note that you must have 6,002 trainable parameters
# (2 * vocab_size * vector_dim + 2, with vocab_size=10 and vector_dim=300)
model.summary()

Model: "Custom"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1, 10)]      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1, 10)]      0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1, 300)       3000        input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1, 300)       3000        input_2[0][0]                    
_____________________________________________________________________________________________

In [12]:
# As before: RMSprop optimiser minimising binary cross-entropy
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
)

In [13]:
# Let's choose a random training sample.
# `np.random.randint` excludes its upper bound, so passing `len(labels)`
# makes every index from 0 to len(labels)-1 (including the last sample)
# eligible; it also keeps working when there is only a single sample.
idx = np.random.randint(0, len(labels))

# Using the index we call the input values
# NOTE: This time, we will have to one-hot encode the input
# in order to comply with our new model
# Also, we will have to add one extra dimension to the input
# using np.expand_dims in order to avoid warnings from tf.keras API
# (each one-hot input ends up with shape (1, 1, vocab_size))
onehot_target = np.expand_dims(
    to_categorical(word_target[idx], num_classes=vocab_size).reshape(1, -1),
    axis=0,
)
onehot_context = np.expand_dims(
    to_categorical(word_context[idx], num_classes=vocab_size).reshape(1, -1),
    axis=0,
)
training_label = np.array(labels[idx], dtype='float32').reshape(1,)

# We use the tf.keras `model.train_on_batch` to train on a single batch
# (here a batch containing one sample) to demonstrate that our model works
loss = model.train_on_batch([onehot_target, onehot_context], training_label)
print(f'Loss after one epoch is {loss:.2f}')

Loss after one epoch is 0.74


## Mindchow 🍲

What do the weights in the dense layer of the above model signify?

In [14]:
### edTest(test_chow4) ###
# Type your answer within the quotes given
# The dense kernel has shape (vocab_size, vector_dim), so a one-hot input
# selects one *row* of it: rows, not columns, are the word vectors.
answer4 = 'Each row in the dense layer weight matrix is the $n$-dimensional representation of one word in the vocabulary'