**Problem statement:** 

Continuous bag of words (cbow) word2vec word embedding work is that it tends to predict the probability of a word given a context. A context may be a single word or a group of words. But for simplicity, I will take a single context word and try to predict a single target word. 

The purpose of this assignment is to be able to create a word embedding for the  given data set.  

**Data set :** w2v.txt 

### Importing libraries

In [293]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense
from scipy import spatial

### Function Definations

In [294]:
# Finding the maximum length of an array required to include all sentences. Vocabulary size, in order to create the 
# same size inputs by padding with zeros

def get_max_vocabulary_size(encoded_line_words):
    max_voc_size = 0
    max_integer_index = 0
    for l in encoded_line_words:
        if (len(l) > 0) and (np.argmax(l) > max_integer_index):
            max_integer_index = l[np.argmax(l)]
        if max_voc_size < len(l):
            max_voc_size = len(l)
    return max_voc_size

In [295]:
# Creating a new list with the targets for the words with window size as window_size. 
# Only positive window size is taken into consideration

def map_target(context, window_size):
    if window_size < 0:
        print("Only Positive window sie is expected.")
        return;
    target = context[window_size:]
    m = 10
    i = 0
    while i < window_size :
        target = np.append(target, [0])
        i = i + 1
    return np.array(target)

In [296]:
# Creating array of targets using the map_target function

def map_all_targets(context_list, window_size):
    targets = []
    for context in context_list:
        target_row = map_target(context, window_size)
        targets.append(np.array(target_row))
    return targets

In [297]:
def pad_sequences_words(file_lines_words, maxlen):
    padded_word = []
    for line_word in file_lines_words:
        blank_word_index = len(line_word)
        while blank_word_index < maxlen:
            line_word.append("")
            padded_word.append(line_word)
            blank_word_index = blank_word_index + 1

In [298]:
import re
def corpus_to_vocab(txt_file):
    corpus = ''
    with open(txt_file, 'r', encoding="utf-8") as file:
        try:
            for line in file:
                try:
                    line = line.strip()
                    if line!="":
                        corpus = corpus + ' ' + line
                except Exception as E:
                    print ("got An exception 2: ", E)
                    pass         
        except Exception as E:
            print ("got An exception 3: ", E)
            pass
        corpus = re.sub('[\.\,\"\'\(\)\n\s]+', ' ', corpus.strip().lower())
        return corpus.split(' ')

In [299]:
def unique_words_list(word_list):
    unique_words = []
    for w in word_list:
        if (w not in unique_words) and len(w)>2:
            unique_words.append(w)
    return unique_words

In [300]:
def word_OHE(vocabulary):
    OHE_word = [one_hot(wd.strip(),len(vocabulary))[0] for wd in vocabulary]
    OHE_word_pair = {wd.strip():one_hot(wd.strip(),len(vocabulary))[0] for wd in vocabulary}
    Word_OHE_pair = {one_hot(wd.strip(),len(vocabulary))[0]:wd.strip() for wd in vocabulary}
    return {'Word':vocabulary, 'OHE': OHE_word, 'Word_OHE':OHE_word_pair, 'OHE_Word': Word_OHE_pair}

In [301]:
w_list = corpus_to_vocab("w2v.txt")

In [302]:
vocabulary = unique_words_list(w_list)

In [303]:
dictionary = word_OHE(vocabulary)

In [304]:
targets = [target for target in dictionary['OHE']]

In [305]:
dictionary['OHE'][0]

72

In [306]:
index = 0
targets = []
for target in dictionary['OHE']:
    if index < len(dictionary['OHE'])-1:
        targets.append(dictionary['OHE'][index+1])
    index = index + 1
targets.append(0)

In [307]:
len(vocabulary)

255

In [308]:
len(targets)

255

In [309]:
context = np.array(dictionary['OHE'])

In [310]:
targets = np.array(targets)

In [311]:
# All the parameters calculation and constants are being set using this function. 
# Any changes to the parameters should be mafe here to avoid execution errors.

def parameters(context):
    reshaped_context = context.reshape(-1)
    INPUT_DIM = max(reshaped_context)+1
    OUTPUT_DIM = 8
    INPUT_LENGTH = len(context)
    EPOCHS = 1000
    VERBOSE = 1
    LOSS = 'categorical_crossentropy'
    ACTIVATION = 'softmax'
    OPTIMIZER = 'adam'
    MATRIX = ['accuracy']
    BESTMODEL = 'embeddings.h5'
    return (INPUT_DIM, OUTPUT_DIM, INPUT_LENGTH, EPOCHS, VERBOSE, LOSS, ACTIVATION, OPTIMIZER, MATRIX, BESTMODEL)

In [312]:
context

array([ 72, 153,  28, 110, 212, 107, 224, 145, 190, 100,  42, 181, 197,
       174, 194, 129,  79,  13, 192, 242, 185, 140,  39, 241,  93, 203,
       234,  13, 105, 102, 104, 222,  50, 207, 210, 196,   6, 160,  50,
       173, 228,  56, 196,  69, 246,  75,  49, 137, 152,  78, 173, 101,
       169,  31,  10,   2, 108,  62,  84,  53, 115,  64,  41,  42, 132,
       226, 176, 232, 240,  87, 129, 126, 221, 156, 172, 114, 207, 236,
         2, 179, 220,  55, 254, 172, 238,  69, 197, 127, 186,  62, 160,
       163, 166, 116, 149,  75,  20,  85, 215, 167,  18, 171, 243,  35,
        87, 252, 141, 170, 253, 254,  83, 191, 142, 246, 136, 240, 153,
       127, 189, 171,  34, 106,  45,  92,  39, 171, 116, 250, 157, 208,
        23,  71, 152, 146, 122,  94, 108,  55, 136, 100,  93, 232,  92,
        11,  73, 152,  40, 129, 159,  34, 190, 164, 253,  99,  61,  84,
       191,  84, 210, 129, 101, 227,  79, 239, 204,   1, 210, 193, 129,
       165,  45,   4,  20, 118, 215,  84, 118, 118, 202, 247, 20

In [313]:
# Confirming that the shapes of context and targets are same

context.shape, targets.shape

((255,), (255,))

In [314]:
context = context.reshape(-1, 255)
targets = targets.reshape(-1, 255)

In [315]:
context.shape, targets.shape

((1, 255), (255,))

In [316]:
# Unit testing the context and it's relavent target. Change the index between 0 to 12 to check the pair

index = 0
context[index], targets[index]

(array([ 72, 153,  28, 110, 212, 107, 224, 145, 190, 100,  42, 181, 197,
        174, 194, 129,  79,  13, 192, 242, 185, 140,  39, 241,  93, 203,
        234,  13, 105, 102, 104, 222,  50, 207, 210, 196,   6, 160,  50,
        173, 228,  56, 196,  69, 246,  75,  49, 137, 152,  78, 173, 101,
        169,  31,  10,   2, 108,  62,  84,  53, 115,  64,  41,  42, 132,
        226, 176, 232, 240,  87, 129, 126, 221, 156, 172, 114, 207, 236,
          2, 179, 220,  55, 254, 172, 238,  69, 197, 127, 186,  62, 160,
        163, 166, 116, 149,  75,  20,  85, 215, 167,  18, 171, 243,  35,
         87, 252, 141, 170, 253, 254,  83, 191, 142, 246, 136, 240, 153,
        127, 189, 171,  34, 106,  45,  92,  39, 171, 116, 250, 157, 208,
         23,  71, 152, 146, 122,  94, 108,  55, 136, 100,  93, 232,  92,
         11,  73, 152,  40, 129, 159,  34, 190, 164, 253,  99,  61,  84,
        191,  84, 210, 129, 101, 227,  79, 239, 204,   1, 210, 193, 129,
        165,  45,   4,  20, 118, 215,  84, 118, 118

In [317]:
INPUT_DIM, OUTPUT_DIM, INPUT_LENGTH, EPOCHS, VERBOSE, LOSS, ACTIVATION, OPTIMIZER, MATRIX, BESTMODEL = \
parameters(context)

In [318]:
# Calback array to save the best model. Best model is decided on the basis of accuracy.

callback_list = [tf.keras.callbacks.ModelCheckpoint(filepath=BESTMODEL, monitor = MATRIX[0], 
                                                            save_best_only=True)]

<IPython.core.display.Javascript object>

In [319]:
INPUT_DIM

255

In [320]:
# Creating the sequential model with Embedding layer with output dimention of 8
# The weights of the first layers will be used as the word embedding
# these weights will later be used to calculate the distance between the words.
# least distant words are most related and vice versa

model = Sequential()
embedding_layer = Embedding(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM, input_length=255)
model.add(embedding_layer)
model.add(Flatten())

# A softmax activation is used.

model.add(Dense(255, activation = ACTIVATION))

model.compile(optimizer = OPTIMIZER, loss=LOSS, metrics = MATRIX)

In [321]:
# Summary of the model 

print(model.summary())

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 255, 8)            2040      
_________________________________________________________________
flatten_8 (Flatten)          (None, 2040)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 255)               520455    
Total params: 522,495
Trainable params: 522,495
Non-trainable params: 0
_________________________________________________________________
None


In [322]:
context

array([[ 72, 153,  28, 110, 212, 107, 224, 145, 190, 100,  42, 181, 197,
        174, 194, 129,  79,  13, 192, 242, 185, 140,  39, 241,  93, 203,
        234,  13, 105, 102, 104, 222,  50, 207, 210, 196,   6, 160,  50,
        173, 228,  56, 196,  69, 246,  75,  49, 137, 152,  78, 173, 101,
        169,  31,  10,   2, 108,  62,  84,  53, 115,  64,  41,  42, 132,
        226, 176, 232, 240,  87, 129, 126, 221, 156, 172, 114, 207, 236,
          2, 179, 220,  55, 254, 172, 238,  69, 197, 127, 186,  62, 160,
        163, 166, 116, 149,  75,  20,  85, 215, 167,  18, 171, 243,  35,
         87, 252, 141, 170, 253, 254,  83, 191, 142, 246, 136, 240, 153,
        127, 189, 171,  34, 106,  45,  92,  39, 171, 116, 250, 157, 208,
         23,  71, 152, 146, 122,  94, 108,  55, 136, 100,  93, 232,  92,
         11,  73, 152,  40, 129, 159,  34, 190, 164, 253,  99,  61,  84,
        191,  84, 210, 129, 101, 227,  79, 239, 204,   1, 210, 193, 129,
        165,  45,   4,  20, 118, 215,  84, 118, 118

In [323]:
targets

array([153,  28, 110, 212, 107, 224, 145, 190, 100,  42, 181, 197, 174,
       194, 129,  79,  13, 192, 242, 185, 140,  39, 241,  93, 203, 234,
        13, 105, 102, 104, 222,  50, 207, 210, 196,   6, 160,  50, 173,
       228,  56, 196,  69, 246,  75,  49, 137, 152,  78, 173, 101, 169,
        31,  10,   2, 108,  62,  84,  53, 115,  64,  41,  42, 132, 226,
       176, 232, 240,  87, 129, 126, 221, 156, 172, 114, 207, 236,   2,
       179, 220,  55, 254, 172, 238,  69, 197, 127, 186,  62, 160, 163,
       166, 116, 149,  75,  20,  85, 215, 167,  18, 171, 243,  35,  87,
       252, 141, 170, 253, 254,  83, 191, 142, 246, 136, 240, 153, 127,
       189, 171,  34, 106,  45,  92,  39, 171, 116, 250, 157, 208,  23,
        71, 152, 146, 122,  94, 108,  55, 136, 100,  93, 232,  92,  11,
        73, 152,  40, 129, 159,  34, 190, 164, 253,  99,  61,  84, 191,
        84, 210, 129, 101, 227,  79, 239, 204,   1, 210, 193, 129, 165,
        45,   4,  20, 118, 215,  84, 118, 118, 202, 247, 200, 10

In [324]:
# Training cell. History variable contains all the loss and accuracy at each step.

history = model.fit(context, targets, epochs = EPOCHS, verbose=VERBOSE, callbacks=callback_list)

ValueError: Data cardinality is ambiguous:
  x sizes: 1
  y sizes: 255
Make sure all arrays contain the same number of samples.

In [325]:
# The variable word_embeddings contains all the embeddings

word_embeddings = embedding_layer.get_weights()[0]

In [173]:
word_embeddings[0]

array([-1.6219815,  1.6223835,  1.6204896,  1.614917 ,  1.617617 ,
       -1.619683 ,  1.6071986, -1.6152487], dtype=float32)

In [56]:
words = []
for line in file_details['line_words']:
    for wd in line:
        words.append(wd)

In [64]:
file_details['padded_context'][0]

array([49,  7, 24, 11, 25, 37, 49, 24, 35, 28, 33, 35,  8,  4, 23, 21, 22,
       45, 35, 51, 48,  6, 27, 42, 49, 13, 39, 35, 11, 49, 13, 28, 33, 10,
       51, 24, 47, 16,  3, 29, 24, 26, 36,  1, 24,  3,  4, 40, 20,  8,  1,
       37,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [57]:
words

['In',
 'linguistics',
 'word',
 'embeddings',
 'were',
 'discussed',
 'in',
 'the',
 'research',
 'area',
 'of',
 'distributional',
 'semantics.',
 'It',
 'aims',
 'to',
 'quantify',
 'and',
 'categorize',
 'semantic',
 'similarities',
 'between',
 'linguistic',
 'items',
 'based',
 'on',
 'their',
 'distributional',
 'properties',
 'in',
 'large',
 'samples',
 'of',
 'language',
 'data.',
 'The',
 'underlying',
 'idea',
 'that',
 '"a',
 'word',
 'is',
 'characterized',
 'by',
 'the',
 'company',
 'it',
 'keeps"',
 'was',
 'popularized',
 'by',
 'Firth.',
 'The',
 'technique',
 'of',
 'representing',
 'words',
 'as',
 'vectors',
 'has',
 'roots',
 'in',
 'the',
 '1960s',
 'with',
 'the',
 'development',
 'of',
 'the',
 'vector',
 'space',
 'model',
 'for',
 'information',
 'retrieval.',
 'Reducing',
 'the',
 'number',
 'of',
 'dimensions',
 'using',
 'singular',
 'value',
 'decomposition',
 'then',
 'led',
 'to',
 'the',
 'introduction',
 'of',
 'latent',
 'semantic',
 'analysis',
 'i

In [148]:
# Get the searched line and word indeces
def get_indices(word):
    line_index = 0
    word_index = 0
    for line in file_details['line_words']:
        for wrd in line:
            if wrd==word:
                return (line_index, word_index)
            word_index = word_index + 1
        line_index = line_index + 1
    return (-1,-1)

In [149]:
# Fetch the word embeddings

def fetch_embedding(line_index, word_index, word_embeddings):
    return word_embeddings[line_index]

In [165]:
# Function to find the least distant word with the help of word_embeddings. 
# given a word and the learned word_embeddings, this function will return the next (closest) word.
# evaluation_method can be cosine or eculedian 

def closest_word(word, word_embeddings, evaluation_method):
    line_index, word_index = get_indices(word)
    print(word_embeddings.shape)
    if (line_index == -1) or (word_index == -1):
        print("searched word not found in the given context!")
    else:
        embedding = fetch_embedding(line_index, word_index, word_embeddings)
        for embd in word_embeddings:
            result = 1 - spatial.distance.cosine(embedding, embd)
            print(result, line_index, word_index)
    return "word"

In [166]:
closest_word("2000", word_embeddings, 'cosine')

(178, 8)
0.4817196726799011 1 223
1 1 223
-0.019669702276587486 1 223
0.4816100597381592 1 223
-0.2703162133693695 1 223
0.9984984993934631 1 223
0.25284287333488464 1 223
0.007079429924488068 1 223
0.024043571203947067 1 223
-0.2701837718486786 1 223
-0.026256121695041656 1 223
-0.041422128677368164 1 223
0.4505995213985443 1 223
-0.728593647480011 1 223
0.48598986864089966 1 223
0.2297157496213913 1 223
-0.2341039627790451 1 223
0.273592084646225 1 223
0.24298851191997528 1 223
-0.03774547576904297 1 223
0.23756183683872223 1 223
0.47415846586227417 1 223
0.46532416343688965 1 223
0.5141133666038513 1 223
0.47594621777534485 1 223
-0.4893186092376709 1 223
0.7397944331169128 1 223
0.5194547772407532 1 223
0.2529175579547882 1 223
0.2443699836730957 1 223
-0.4822525382041931 1 223
0.27850496768951416 1 223
0.5336160659790039 1 223
0.47386428713798523 1 223
0.7442044019699097 1 223
-0.026949871331453323 1 223
-0.02250630594789982 1 223
0.6057190895080566 1 223
0.22758036851882935 1 223

'word'

In [174]:
word_embeddings

array([[-1.6219815,  1.6223835,  1.6204896, ..., -1.619683 ,  1.6071986,
        -1.6152487],
       [-1.6241891, -1.6635616,  1.6017293, ..., -1.5814848,  1.3999026,
         1.6283284],
       [ 1.5662514, -1.6116898, -1.5478582, ..., -1.5593542,  1.4303602,
        -1.6137869],
       ...,
       [-1.6226442,  1.6031357, -1.6311088, ...,  1.6225965, -1.5842654,
        -1.6153655],
       [-1.5912836,  1.6120912,  1.6211563, ..., -1.6162713, -1.586357 ,
         1.5927837],
       [-1.6265788, -1.6375501,  1.6025525, ...,  1.5824865, -1.5684392,
         1.622879 ]], dtype=float32)

In [147]:
file_details['line_words'][0][177]

''

In [23]:
file_details['padded_context'][0][0]

49

In [28]:
file_details

{'lines': ['In linguistics word embeddings were discussed in the research area of distributional semantics. It aims to quantify and categorize semantic similarities between linguistic items based on their distributional properties in large samples of language data. The underlying idea that "a word is characterized by the company it keeps" was popularized by Firth.',
  'The technique of representing words as vectors has roots in the 1960s with the development of the vector space model for information retrieval. Reducing the number of dimensions using singular value decomposition then led to the introduction of latent semantic analysis in the late 1980s.In 2000 Bengio et al. provided in a series of papers the "Neural probabilistic language models" to reduce the high dimensionality of words representations in contexts by "learning a distributed representation for words". (Bengio et al, 2003). Word embeddings come in two different styles, one in which words are expressed as vectors of co-o

In [None]:
model.predict()

In [None]:
cont = []
targ = []
for ctx in context:
    for cx in ctx:
        cont.append(cx)
        targ.append(np.argmax(pred[cx]))
df = pd.DataFrame({'Word':cont, 'Next Predicted Word':targ})

In [None]:
df