In [1]:
import numpy as np
import math
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Hyperparameters for the skip-gram model built further down.
batch_size = 64            # examples per training step (also fixes the placeholder shapes)
embedding_dimension = 5    # length of each learned word vector
negative_samples = 8       # passed as num_sampled to the NCE loss

# Seed NumPy's global RNG so the generated sentences and the sampled batches
# are the same on every Restart & Run All. This does not seed TensorFlow's
# own random ops.
SEED = 42
np.random.seed(SEED)

In [3]:
# Capitalised English name for each digit, keyed 1 through 9 in ascending order.
digital_to_word_map = dict(
    enumerate(
        ["One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine"],
        start=1,
    )
)

In [4]:
# Toy corpus: 10000 rounds, each adding one three-word sentence drawn from the
# odd digits followed by one drawn from the even digits (digits may repeat).
odd_digits = range(1, 10, 2)
even_digits = range(2, 10, 2)

sentences = []
for _ in range(10000):
    # Odd pool first, then even, so the random draws happen in the same order.
    for digit_pool in (odd_digits, even_digits):
        sampled_digits = np.random.choice(digit_pool, 3)
        sentences.append(" ".join(digital_to_word_map[d] for d in sampled_digits))

In [5]:
sentences[:10]

['Five Seven One',
 'Two Two Two',
 'Nine Five Five',
 'Six Two Two',
 'Three Three Nine',
 'Four Two Eight',
 'Five One Nine',
 'Four Eight Six',
 'Five Three Nine',
 'Six Six Eight']

In [6]:
# Give every distinct lower-cased word an integer id, in order of first appearance.
word2index_map = {}
for sentence in sentences:
    for token in sentence.lower().split():
        # setdefault inserts only unseen tokens; the dict size evaluated just
        # before the insert is the next free id.
        word2index_map.setdefault(token, len(word2index_map))

# Reverse lookup (id -> word) and the resulting vocabulary size.
index2word_map = {token_id: token for token, token_id in word2index_map.items()}
vocabulary_size = len(index2word_map)

In [7]:
# Print each vocabulary word next to its integer id.
for token, token_id in word2index_map.items():
    print(token, token_id)

five 0
seven 1
one 2
two 3
nine 4
six 5
three 6
four 7
eight 8


In [8]:
index2word_map

{0: 'five',
 1: 'seven',
 2: 'one',
 3: 'two',
 4: 'nine',
 5: 'six',
 6: 'three',
 7: 'four',
 8: 'eight'}

In [10]:
# Build [target_id, context_id] pairs with a one-word window on each side.
skip_gram_pairs = []
for sentence in sentences:
    tokens = sentence.lower().split()
    # Only interior words have both a left and a right neighbour, so slide a
    # three-word window over the sentence.
    for left, target, right in zip(tokens, tokens[1:], tokens[2:]):
        target_id = word2index_map[target]
        skip_gram_pairs.append([target_id, word2index_map[left]])
        skip_gram_pairs.append([target_id, word2index_map[right]])

In [11]:
def get_skipgram_batch(batch_size, pairs=None):
    """Draw a random batch of skip-gram training examples without replacement.

    Args:
        batch_size: Number of examples requested. If fewer pairs are
            available, every pair is returned once, in random order.
        pairs: Optional sequence of ``[target_id, context_id]`` pairs to
            sample from. Defaults to the notebook-level ``skip_gram_pairs``.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list of target ids and ``y`` is a list
        of one-element lists holding the matching context id, the nested
        form expected by a ``[batch_size, 1]`` label placeholder.
    """
    if pairs is None:
        pairs = skip_gram_pairs
    # The head of a random permutation of row indices is a sample without
    # replacement, and avoids shuffling a full Python list on every call.
    chosen = np.random.permutation(len(pairs))[:batch_size]
    x = [pairs[i][0] for i in chosen]
    y = [[pairs[i][1]] for i in chosen]
    return x, y

In [12]:
# Pull a small batch of 8 examples to inspect the sampler's output.
x_batch, y_batch = get_skipgram_batch(batch_size=8)

In [13]:
x_batch

[0, 8, 6, 3, 8, 6, 1, 3]

In [14]:
y_batch

[[0], [8], [1], [3], [5], [0], [0], [3]]

In [15]:
[index2word_map[word] for word in x_batch]

['five', 'eight', 'three', 'two', 'eight', 'three', 'seven', 'two']

In [16]:
[index2word_map[word[0]] for word in y_batch ] 

['five', 'eight', 'seven', 'two', 'six', 'five', 'five', 'two']

In [17]:
# Integer ids of the target words, one per example in the batch.
train_inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size])

In [18]:
# Integer ids of the context words, held as a column of shape (batch_size, 1).
train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])

In [19]:
# Embedding table: one row per vocabulary word, initialised uniformly in [-1, 1).
initial_embedding = tf.random_uniform(
    shape=[vocabulary_size, embedding_dimension], minval=-1.0, maxval=1.0
)
embedding = tf.Variable(initial_embedding, name='embedding')

In [20]:
# Select the embedding row for every target id in the batch.
embed = tf.nn.embedding_lookup(params=embedding, ids=train_inputs)

In [21]:
# NCE output weights, drawn from a truncated normal whose spread shrinks with
# the embedding size.
nce_stddev = 1.0 / math.sqrt(embedding_dimension)
nce_weights = tf.Variable(
    tf.truncated_normal(shape=[vocabulary_size, embedding_dimension], stddev=nce_stddev)
)

In [22]:
# One NCE bias per vocabulary word, starting at zero.
nce_biases = tf.Variable(tf.zeros(shape=[vocabulary_size]))

In [23]:
# Noise-contrastive estimation loss for each example, averaged over the batch.
per_example_loss = tf.nn.nce_loss(
    weights=nce_weights,
    biases=nce_biases,
    labels=train_labels,
    inputs=embed,
    num_sampled=negative_samples,
    num_classes=vocabulary_size,
)
loss = tf.reduce_mean(per_example_loss)

In [24]:
# Non-trainable step counter that the learning-rate schedule reads.
global_step = tf.Variable(0, trainable=False)

# Staircase exponential decay: base rate 0.1, decay factor 0.95, decay period
# of 1000 steps of global_step.
learningRate = tf.train.exponential_decay(
    learning_rate=0.1,
    global_step=global_step,
    decay_steps=1000,
    decay_rate=0.95,
    staircase=True,
)

In [25]:
# Plain SGD on the NCE loss. Passing global_step makes minimize() increment
# the counter on every update; without it the counter stays at 0 and the
# exponential-decay schedule never lowers the learning rate.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step
)

In [26]:
# Train for 1000 steps, printing the loss every 100 steps, then extract the
# row-normalised embedding matrix as a NumPy array.
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for step in range(1000):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        batch_feed = {train_inputs: x_batch, train_labels: y_batch}
        sess.run(train_step, feed_dict=batch_feed)
        if step % 100 == 0:
            # Evaluated in a separate run on the same batch, after the update above.
            loss_val = sess.run(loss, feed_dict=batch_feed)
            print(loss_val)
    # Divide every row by its L2 norm so dot products between rows are cosine
    # similarities. `keepdims` replaces the deprecated `keep_dims` spelling.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    normalized_embedding = embedding / norm
    normalized_embedding_matrix = sess.run(normalized_embedding)

8.126676
3.0278141
2.8113775
2.7104216
2.5553653
2.5925653
2.5197506
2.4882736
2.5662475
2.5807066
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [27]:
normalized_embedding_matrix

array([[ 0.423377  , -0.20916885,  0.48683763,  0.73342395, -0.04559429],
       [ 0.43623137, -0.43442598,  0.44560573,  0.6186341 ,  0.1992579 ],
       [ 0.42939547, -0.28662187,  0.4637446 ,  0.7196009 , -0.02414428],
       [-0.7964664 ,  0.00267597,  0.29750478,  0.04865813, -0.5241732 ],
       [ 0.17179987, -0.08152475,  0.6624316 ,  0.28748056,  0.6651147 ],
       [-0.528824  , -0.63828695,  0.10252123, -0.13135803, -0.53401285],
       [ 0.17590399,  0.06784698,  0.41286522,  0.86611265,  0.20939384],
       [-0.63646066, -0.4526813 ,  0.4419637 , -0.43826264,  0.05090633],
       [-0.87003005,  0.07162157,  0.45384517, -0.15144064, -0.09491292]],
      dtype=float32)

In [28]:
word2index_map["one"]

2

In [29]:
# Normalised embedding of the reference word "one".
ref_index = word2index_map["one"]
ref_word = normalized_embedding_matrix[ref_index]

In [30]:
ref_word

array([ 0.42939547, -0.28662187,  0.4637446 ,  0.7196009 , -0.02414428],
      dtype=float32)

In [31]:
ref_word.shape

(5,)

In [32]:
normalized_embedding_matrix.shape

(9, 5)

In [33]:
# Rows are unit length, so this dot product is the cosine similarity between
# every word and the reference word.
cosine_digits = normalized_embedding_matrix.dot(ref_word)

In [34]:
cosine_digits 

array([ 0.9963902 ,  0.95883775,  1.        , -0.15712959,  0.59514856,
       -0.07821596,  0.8657497 , -0.25518996, -0.29033226], dtype=float32)

In [35]:
cosine_digits.shape

(9,)

In [36]:
# Word ids ordered from most to least similar. The first entry is dropped; it
# is the reference word itself whenever its self-similarity is the maximum.
descending_ids = np.argsort(cosine_digits)[::-1]
ff = descending_ids[1:10]

In [37]:
ff

array([0, 1, 6, 4, 5, 3, 7, 8])

In [38]:
index2word_map

{0: 'five',
 1: 'seven',
 2: 'one',
 3: 'two',
 4: 'nine',
 5: 'six',
 6: 'three',
 7: 'four',
 8: 'eight'}

In [39]:
# Show each neighbour word, its cosine similarity, and a spacer line.
for neighbour_id in ff:
    print(index2word_map[neighbour_id], cosine_digits[neighbour_id], '  ', sep='\n')

five
0.9963902
  
seven
0.95883775
  
three
0.8657497
  
nine
0.59514856
  
six
-0.078215964
  
two
-0.15712959
  
four
-0.25518996
  
eight
-0.29033226
  
