In [1]:
documents = ["Ashraf Marwan is remembered most famously for spying for the Egyptian intelligence agency",
             "feeding Egypt strategic information on the location of Israeli military assets",
             "Marwans unparalleled access to his nations best kept secrets especially after his promotion"]

In [33]:
stop_words = ['is', 'of', 'the', 'for', 'to', 'on', 'his']

sentences = []
for text in documents:
    text = text.lower()
    tmp = text.split(" ")
    text_clean = [w for w in tmp if not w in stop_words]
    sentences.append(" ".join(text_clean))
    

words = []
for docs in sentences:
    tokenized = docs.split()
    for token_words in tokenized:
        if not token_words in stop_words:
            words.append(token_words)        

# unique words in corpus
unique_words = set(words)

In [34]:
sentences

['ashraf marwan remembered most famously spying egyptian intelligence agency',
 'feeding egypt strategic information location israeli military assets',
 'marwans unparalleled access nations best kept secrets especially after promotion']

In [36]:
unique_words

{'access',
 'after',
 'agency',
 'ashraf',
 'assets',
 'best',
 'egypt',
 'egyptian',
 'especially',
 'famously',
 'feeding',
 'information',
 'intelligence',
 'israeli',
 'kept',
 'location',
 'marwan',
 'marwans',
 'military',
 'most',
 'nations',
 'promotion',
 'remembered',
 'secrets',
 'spying',
 'strategic',
 'unparalleled'}

In [37]:
# assigning word int value
idx = 1
word_ids = {}
for idx, val in enumerate(unique_words):
    word_ids[val] = idx    # if not +1 we'll get first element be 0           

In [38]:
sentences

['ashraf marwan remembered most famously spying egyptian intelligence agency',
 'feeding egypt strategic information location israeli military assets',
 'marwans unparalleled access nations best kept secrets especially after promotion']

In [39]:
word_ids

{'feeding': 0,
 'marwan': 1,
 'location': 2,
 'information': 3,
 'spying': 4,
 'nations': 5,
 'marwans': 6,
 'best': 7,
 'israeli': 8,
 'especially': 9,
 'egypt': 10,
 'most': 11,
 'intelligence': 12,
 'secrets': 13,
 'egyptian': 14,
 'kept': 15,
 'access': 16,
 'unparalleled': 17,
 'ashraf': 18,
 'famously': 19,
 'remembered': 20,
 'strategic': 21,
 'assets': 22,
 'military': 23,
 'promotion': 24,
 'after': 25,
 'agency': 26}

In [40]:
# Getting labels/Neighbours using skip gram 
main_word = []
lb_word = []   # neighbour word

WINDOW_SIZE = 2
# print(sentences)

tok_sentences = []
for wrds in sentences:
    tok_sentences.append(wrds.split())

for sentence in tok_sentences:
    # print(sentence)
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] :    
            if nb_word != word:
                main_word.append(word)
                lb_word.append(nb_word)
                
                

In [None]:
list(zip(main_word, lb_word))

# creating one layer neural network 

In [51]:
import tensorflow as tf
import numpy as np

ONE_HOT_DIM = len(unique_words)

def get_one_hot(key):
    array = [0]*ONE_HOT_DIM
    array[word_ids[key]] = 1
    return array

# print(get_one_hot('egyptian'))

X = [] # input word
Y = [] # target word

for x,y  in zip(main_word, lb_word):
    X.append(get_one_hot(x))
    Y.append(get_one_hot(y))
    
X_train = np.asarray(X)
Y_train = np.asarray(Y)

In [55]:
# making placeholders for X_train and Y_train
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

# word embedding will be 2 dimension for 2d visualization
EMBEDDING_DIM = 3 

# hidden layer: which represents word vector eventually
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1])) #bias
hidden_layer = tf.add(tf.matmul(x,W1), b1)

# output layer
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_layer, W2), b2))

# loss function: cross entropy
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

# training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

In [56]:

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

iteration = 20000
for i in range(iteration):
    # input is X_train which is one hot encoded word
    # label is Y_train which is one hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

iteration 0 loss is :  5.1089883
iteration 3000 loss is :  1.8507527
iteration 6000 loss is :  1.5673333
iteration 9000 loss is :  1.5180882
iteration 12000 loss is :  1.4942526
iteration 15000 loss is :  1.4789653
iteration 18000 loss is :  1.4695277


In [57]:
# Now the hidden layer (W1 + b1) is actually the word look up table
vectors = sess.run(W1 + b1)
print(vectors)

[[-3.1443722   2.3578405  -1.7974826 ]
 [ 0.55547386  3.0314193  -2.6923094 ]
 [-2.5955384   2.1580765   1.7023705 ]
 [-1.0204542   0.53643495  0.27944535]
 [-1.0583526   0.7624183  -3.912158  ]
 [ 2.9373715  -1.7509509   2.3712595 ]
 [ 1.9486309  -2.5520415   2.0448358 ]
 [ 2.7526534  -1.7446601  -0.4242946 ]
 [-1.9991106  -0.21119475  1.9516578 ]
 [ 3.2658482   0.7630225   0.54451025]
 [-3.5113933   1.5029639   0.49950525]
 [ 2.2551913   0.22754255 -3.3623257 ]
 [ 0.3510653  -2.5054908  -3.4675937 ]
 [ 3.032895    0.17217594  2.499767  ]
 [ 0.2246389  -0.99584526 -1.7047981 ]
 [ 0.8602616  -0.1773966   0.6414633 ]
 [-0.03150687 -2.4772356   2.5864713 ]
 [ 0.39474475 -3.683622    0.16904785]
 [ 2.737373    1.9833935  -2.6902564 ]
 [ 0.12919845  0.35039297 -2.0579782 ]
 [ 0.32637963  1.3154236  -1.4105233 ]
 [-3.8903463  -0.5285146  -0.02569342]
 [-1.0314983   1.7134744   2.861213  ]
 [-2.2078302  -0.7023514   3.085438  ]
 [ 2.0321336   2.055433    1.7341663 ]
 [ 3.519062    1.5439998 