In [1]:
%tensorflow_version 2.x


Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


# Import Packages

In [2]:
import numpy as np
import keras.backend as K
import tensorflow as tf
import operator
from tensorflow import keras
# from keras.utils import np_utils # This is no longer needed
from tensorflow.keras.utils import to_categorical # Use this for one-hot encoding

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
# from tensorflow.keras.utils import to_categorical # Already imported above
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd

# Mount Drive

Data set can be download from [here](https://drive.google.com/file/d/1tFhlcibsLZKbsxze_pXHCYeFnkQdBXhW/view?usp=sharing)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/My Drive/Dataset/'


/content/drive/My Drive/Dataset


In [None]:
ls

alice.txt                           vectors_cbow_150.txt
GoogleNews-vectors-negative300.bin  vectors_cbow_32.txt
Sentiment.csv                       vectors_skipgram_150.txt
spam.csv                            vectors_skipgram_32.txt
spam.xlsx


In [3]:
file_name = '/content/alice.txt'
corpus = open(file_name).readlines()

# Preprocessing

In [4]:
# Remove sentences with fewer than 3 words
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# Remove punctuation in text and fit tokenizer on entire corpus
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)

# Convert text to sequence of integer values
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus) # Total number of words in the corpus
V = len(tokenizer.word_index) + 1 # Total number of unique words in the corpus

n_samples, V


(27165, 2557)

In [5]:
#Integer mapping
print(list((tokenizer.word_index.items()))[:5])


[('the', 1), ('and', 2), ('to', 3), ('a', 4), ('it', 5)]


In [6]:
# Parameters
window_size = 2
#window_size_corpus = 4

# Set numpy seed for reproducible results
np.random.seed(42)

**Skip gram model**

In [7]:
# Prepare data for the skipgram model
def generate_data_skipgram(corpus, window_size, V):
    maxlen = window_size * 2
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            p = index - window_size
            n = index + window_size + 1

            in_words = []
            labels = []
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    # Add the input word
                    all_in.append(word)
                    # Add one-hot of the context words
                    all_out.append(to_categorical(words[i], V))

    return (np.array(all_in), np.array(all_out))

In [8]:
X_skip, y_skip = generate_data_skipgram(corpus, window_size, V)
X_skip.shape, y_skip.shape


((94556,), (94556, 2557))

In [9]:
X_skip[1],y_skip[1]

(np.int64(305), array([0., 0., 0., ..., 0., 0., 0.]))

In [10]:
skipgram = Sequential()
dim =32
# Add an Embedding layer
skipgram.add(Embedding(input_dim=V,
                       output_dim=dim,
                       input_length=1,
                       embeddings_initializer='glorot_uniform'))

# Add a Reshape layer, which reshapes the output of the embedding layer (1,dim) to (dim,)
skipgram.add(Reshape((dim, )))

# Add a final Dense layer with the same size as in [1]
skipgram.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Compile the model with a suitable loss function and select an optimizer.
# Optimizer Adagrad was used in paper
skipgram.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

skipgram.build(input_shape=(None, 1))
skipgram.summary()
print("")
#skipgram_models.append(skipgram)






In [11]:
skipgram.fit(X_skip, y_skip, batch_size=64, epochs=3, verbose=1)
print("")

Epoch 1/3
[1m1478/1478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.0540 - loss: 7.0012
Epoch 2/3
[1m1478/1478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.0629 - loss: 5.9558
Epoch 3/3
[1m1478/1478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.0677 - loss: 5.8394



In [12]:
weights = skipgram.get_weights()
len(weights)

3

In [13]:
embedding = weights[0]

In [14]:
embedding.shape

(2557, 32)

In [15]:
weights[2]

array([-0.9326653 ,  0.9412425 ,  1.0245277 , ..., -0.41742665,
       -0.47592288, -0.211484  ], dtype=float32)

In [16]:
f = open(f"vectors_skipgram_{len(embedding[0])}.txt", "w")
columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]
f.write(" ".join(columns))
f.write("\n")

for word, i in tokenizer.word_index.items():
  f.write(word)
  f.write(" ")
  f.write(" ".join(map(str, list(embedding[i,:]))))
  f.write("\n")
f.close()

**CBOW**

In [31]:
def generate_data_cbow(corpus, window_size, V):
  maxlen = window_size * 2
  all_in = []
  all_out = []
  for words in corpus:
      L = len(words)
      for index, word in enumerate(words):
          p = index - window_size
          n = index + window_size + 1

          context_words = []
          for i in range(p, n):
              if i != index and 0 <= i < L:
                  context_words.append(words[i])

          if context_words:
              all_out.append(to_categorical(word, V))
              all_in.append(context_words)

  all_in = sequence.pad_sequences(all_in, maxlen=maxlen, padding='post', value=0)

  return (np.array(all_in), np.array(all_out))

In [32]:
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

cbow = Sequential()
dim =32
cbow.add(Embedding(input_dim=V,
                       output_dim=dim,
                       input_length=window_size * 2,
                       embeddings_initializer='glorot_uniform'))

cbow.add(Lambda(lambda x: tf.reduce_sum(x, axis=1), output_shape=(dim,)))

cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

In [33]:
cbow.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

In [34]:
cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=4, verbose=1)
print("")

Epoch 1/4
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0493 - loss: 7.1170
Epoch 2/4
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0626 - loss: 5.9788
Epoch 3/4
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.0700 - loss: 5.7778
Epoch 4/4
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0911 - loss: 5.5621



In [35]:
weights = cbow.get_weights()
embedding = weights[0]


In [36]:
f = open(f"vectors_cbow_{len(embedding[0])}.txt", "w")
columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]
f.write(" ".join(columns))
f.write("\n")

for word, i in tokenizer.word_index.items():
  f.write(word)
  f.write(" ")
  f.write(" ".join(map(str, list(embedding[i,:]))))
  f.write("\n")
f.close()

**Analogy Computation**

In [37]:
def embed(word, embedding, vocab_size=V, tokenizer=tokenizer):
    """ Embed a word by getting the one hot encoding and taking the dot product of this vector with the
        embedding matrix 'word' = string type
    """
    # get the index of the word from the tokenizer, i.e. convert the string to it's corresponding integer in the vocabulary
    int_word = tokenizer.texts_to_sequences([word])[0]
    # get the one-hot encoding of the word
    bin_word = to_categorical(int_word, V)
    return np.dot(bin_word, embedding)

In [38]:
def compute_distance(word_a, word_b, word_c, word_d):
    """ Returns the cosine distance between the predicted and the true word (word_d)

    Our analogy function is: 'word_a is to word_b as word_c is to ?'
    Here, ? is predicted based on the embeddings. Then, we compare ? to word_d, which is the true word.
    """
    models = [skipgram,cbow]
    embeddings = [model.get_weights()[0] for model in models]
    for embedding in embeddings:
        predicted_embedding = embed(word_b, embedding) - embed(word_a, embedding) + embed(word_c, embedding)
        dist_exp_true = cosine_distances(predicted_embedding, embed(word_d, embedding))
        print(dist_exp_true[0][0])

In [39]:
compute_distance('king', 'queen', 'woman', 'man')

0.14948418400995245
0.5965064309009027


In [40]:
from scipy.spatial.distance import cosine, cdist


In [41]:
def get_nearest_words(model_name, embed_word, used_words, nr=10):
    """Returns the `nr` nearest words to the `embed_word` for a certain `model_name`
    """
    # Load the model embedding matrix and create a list of all the words
    df = pd.read_csv(f"vectors_{model_name}.txt", sep=" ")

    # Filter out words that are in the analogy
    df = df[~(df["word"].isin(used_words))]

    # Store the embedded representation of the words
    embedded_words = df.iloc[:, 1:].values
    embedded_word = embed_word.reshape(1, -1)

    # Get the distances between the input embedding and the embedded words such that we can look for the smallest one
    # cdist makes it easy for us to compute the cosine distance between each pair of the two collections of inputs
    distances = cdist(embedded_word, embedded_words, "cosine").reshape(-1)

    # Sort distances and store the indices of the `nr` lowest distances
    top_sorted_indices = distances.argsort()[:nr]

    # Convert the indices to actual words
    top_words = [list(df["word"])[i] for i in top_sorted_indices]

    # Keep the rounded values of those indices
    values = [round(distances[i], 4) for i in top_sorted_indices]
    # Concatenate the top words together with their values and return it as a list
    return list(zip(top_words, values))


In [44]:
def print_analogy(analogy, embeddings, models, model_names, nr=10):
    # Retrieve the words from the analogy we need to compute
    word_a, word_b, word_c, word_true = analogy

    # Formulate the analogy task
    analogy_task = f"{word_a} is to {word_b} as {word_c} is to ?"

    print(f"Analogy Task: {analogy_task}")
    print("---------------------------------------------------")

    # Iterate over all models available
    for model_name, embedding in zip(model_names, embeddings):
        # Obtain embeddings for all the words
        embed_true = embed(word_true, embedding).flatten() # Flatten the embedding
        embed_a, embed_b, embed_c = embed(word_a, embedding).flatten(), embed(word_b, embedding).flatten(), embed(word_c, embedding).flatten() # Flatten the embeddings

        # Obtain the predicted embedding based on the analogy function
        embed_prediction = embed_b - embed_a + embed_c

        # The true word with distance similarity value between predicted embedding and true word embedding,
        # also denoted `sim1` in the text above
        sim1 = round(cosine(embed_true, embed_prediction), 4)

        # The predicted word with distance similarity value between predicted embedding and the embedding of the word
        # in the vocabulary that is closest to this predicted embedding
        word_prediction, sim2 = get_nearest_words(model_name, embed_prediction, [word_a, word_b, word_c], 1)[0]

        # Get the top `nr` nearest words
        nearest_words = get_nearest_words(model_name, embed_prediction, [word_a, word_b, word_c], nr)

        # Print whether or not the true word was in the top nr
        partially_correct = word_true in [word[0] for word in nearest_words]


        print(f"Embedding: {model_name}")
        # Print all top nr words with their distance
        for word in nearest_words:
            print(f"{word[0]} => {round(word[1], 4)}")
        print(f"Predicted: {word_prediction} ({round(sim2, 4)}) - True: {word_true} ({sim1})")
        print(f"Correct? {word_prediction == word_true} - In the top {nr}? {partially_correct}")
        print("----------")

In [45]:
analogy = ('king', 'queen', 'woman', 'man')

models = [skipgram, cbow]
embeddings = [model.get_weights()[0] for model in models]
model_names = ['skipgram_32', 'cbow_32']

print_analogy(analogy, embeddings, models, model_names, nr=10)

Analogy Task: king is to queen as woman is to ?
---------------------------------------------------
Embedding: skipgram_32
writing => 0.0648
till => 0.0677
high => 0.0701
waited => 0.0767
promising => 0.0804
wide => 0.0807
tucked => 0.081
smile => 0.081
bottle => 0.0825
left => 0.083
Predicted: writing (0.0648) - True: man (0.1495)
Correct? False - In the top 10? False
----------
Embedding: cbow_32
for => 0.2721
knowing => 0.2907
explanations => 0.2986
kept => 0.3038
tureen => 0.3106
cool => 0.3107
jaws => 0.3115
accounts => 0.3129
sugar => 0.3162
instance => 0.3185
Predicted: for (0.2721) - True: man (0.5965)
Correct? False - In the top 10? False
----------
