In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import collections
import math
import os
import random
import zipfile

In [4]:
from six.moves import urllib
from six.moves import xrange

In [5]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [6]:
print(np.__version__)
print(tf.__version__)

1.14.1
1.6.0


In [7]:
DOWNLOADED_FILENAME = 'SampleText.zip'

def maybe_download(url_path, expected_bytes):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
    statinfo = os.stat(DOWNLOADED_FILENAME)
    if statinfo.st_size == expected_bytes:
        print('Found and verified file from this path:', url_path)
        print('Download file: ', DOWNLOADED_FILENAME)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify file from: ' + url_path + '. Can you get it to the browser?')

In [8]:
def read_words():
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as f:
        firstfile = f.namelist()[0]
        filestring = tf.compat.as_str(f.read(firstfile))
        words = filestring.split()
    return words

In [10]:
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 31344016
maybe_download(URL_PATH, FILESIZE)

Found and verified file from this path: http://mattmahoney.net/dc/text8.zip
Download file:  SampleText.zip


In [12]:
vocabulary = read_words()
len(vocabulary)

17005207

In [13]:
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [16]:
def build_dataset(words, n_words):
    word_counts = [['UNKNOWN', -1]]
    
    counter = collections.Counter(words)
    word_counts.extend(counter.most_common(n_words - 1))
    
    dictionary = dict()
    for word, _ in word_counts:
        dictionary[word] = len(dictionary)
    word_indexes = list()
    
    unknown_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unknown_count += 1
        word_indexes.append(index)
        
    word_counts[0][1] = unknown_count
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return word_counts, word_indexes, dictionary, reversed_dictionary    

In [21]:
VOCABULARY_SIZE = 5000
word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(
vocabulary, VOCABULARY_SIZE)

In [22]:
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [23]:
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [32]:
import random
for key in random.sample(list(dictionary), 10):
    print(key, ":", dictionary[key])

reached : 1215
roles : 2832
northeast : 3692
day : 137
leader : 663
genes : 3475
croatian : 4837
forth : 3011
turn : 870
georgia : 2490


In [35]:
for key in random.sample(list(reversed_dictionary), 10):
    print(key, ":", reversed_dictionary[key])

2670 : threat
1728 : truth
2784 : falls
96 : history
1492 : mentioned
3284 : welsh
1065 : lived
375 : possible
717 : concept
2382 : cabinet


In [36]:
del vocabulary

In [37]:
global_index = 0

In [40]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    global global_index
    
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    span = 2 * skip_window + 1    #[skip_window input_word skip_window]
    
    buffer = collections.deque(maxlen = span)
    
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)

    for i in range(batch_size // num_skips):
        target = skip_window   #input word at the center of the buffer
        targets_to_avoid = [skip_window]
        
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
                
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window]  #this is the input word
            labels[i * num_skips + j, 0] = buffer[target] #these are the context words
            
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)
    return batch, labels
        

In [41]:
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [42]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156])

In [43]:
labels

array([[ 128],
       [  59],
       [ 156],
       [   6],
       [   2],
       [ 477],
       [ 128],
       [   0],
       [3134],
       [ 477]])

In [52]:
for i in range(10):
    print(reversed_dictionary[batch[i]], ":", reversed_dictionary[labels[i][0]])

of : early
of : used
abuse : against
abuse : a
first : of
first : class
used : early
used : UNKNOWN
against : abuse
against : class


In [53]:
#Reset the global index because we updated while testing the batch code 
global_index = 0 


In [54]:
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace = False)


In [55]:
batch_size = 128
embedding_size = 50
skip_window = 2
num_skips = 2

In [59]:
tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [60]:
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)


In [61]:
embeddings = tf.Variable(
tf.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [62]:
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32_ref>

In [63]:
embed

<tf.Tensor 'embedding_lookup:0' shape=(128, 50) dtype=float32>

In [64]:
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, embedding_size], stddev = 1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [65]:
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [68]:
train_one_hot = tf.one_hot(train_labels, VOCABULARY_SIZE)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = hidden_out, labels = train_one_hot))


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [69]:
#optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [71]:
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims = True))
normalized_embeddings = embeddings / l2_norm


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [73]:
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
valid_embeddings

<tf.Tensor 'embedding_lookup_2:0' shape=(16, 50) dtype=float32>

In [75]:
normalized_embeddings

<tf.Tensor 'truediv:0' shape=(5000, 50) dtype=float32>

In [78]:
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b = True)


In [79]:
init = tf.global_variables_initializer()

In [82]:
num_steps = 1000

In [89]:
with tf.Session() as session:
    init.run()
    
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
        word_indexes, batch_size, num_skips, skip_window)
        
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict = feed_dict)
        average_loss += loss_val
        
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
            
        if step % 1000 == 0:
            sim = similarity.eval()
            
            for i in range(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8 #nearest neighbours
                
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                
                for k in range(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            print("\n")

Average loss at step 0 : 8.64430046081543
Nearest to from: shared, infection, stage, called, manual, integration, underlying, occurring,
Nearest to been: hours, kind, ranging, giving, cited, growing, federal, alberta,
Nearest to had: fifth, costa, championship, contract, team, latvia, perspective, judgment,
Nearest to nine: indicated, economy, simultaneously, concerned, november, consequently, und, nelson,
Nearest to of: cia, netherlands, canada, fellow, plot, billy, headed, families,
Nearest to state: often, wars, measurement, electric, defining, explicit, younger, continue,
Nearest to united: australian, peoples, universe, aid, design, claim, characters, preparation,
Nearest to years: any, mhz, food, distinguished, equal, asia, similar, ministers,
Nearest to world: acceptable, followed, more, improved, aim, dragon, matters, about,
Nearest to system: fellow, mountain, autobiography, august, link, channel, perceived, bush,
Nearest to history: absence, unified, philip, candidates, manga