#Deeplearning
Assignment5:
The goal of this assignment is to train a Word2Vec skip-gram model over Text8 data

In [2]:
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [3]:
url = 'http://mattmahoney.net/dc/'
last_percent_reported = None
def download_progress_hook(count,blockSize,totalSize):
    global las_percent_reported
    percent = int(count * blockSize * 100 / totalSize)
    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()
    last_percent_reported = percent
    
def maybe_download(filename,expected_bytes,force=False):
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename, 
                                  reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [20]:
"""
read data into a string
"""
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size = {0}'.format(len(words)))
print('Sample words :',words[:20])

Data size = 17005207
Sample words : ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']


In [22]:
# Build the dictionary and replace rare words with UNK token
# 创建字典（从dataset中找出最长出现的vocabulary-1个词，不常见的词则中UNK代替）
# data中记录的是原来文档的词的索引

vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK',-1]]
    # count中按照词出现的次数从高到低保存了前vocabulary_size的词和对应的出现次数
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word,_ in count:
        # dictionary在次过程中len(dictionary)自动增长
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    # 查找每个word在dictionary中对应的索引,并将原数据转换为词的索引标号
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0 # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

data,count,dictionary,reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)',count[:5])
print('Sample data',data[:20])
#del words


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156, 128, 742, 477, 10572, 134, 1, 27549, 2, 1, 103]


In [31]:
# generate a training batch for the skip-gram model
data_index = 0 

def generate_batch(batch_size,num_skips,skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size),dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)#创建一个双端队列
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0,span-1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips +j,0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

print('data:',[reverse_dictionary[di] for di in data[:8]])

for num_skips,skip_window in [(1,1),(2,1),(4,2)]:
    data_index = 0
    batch,labels = generate_batch(batch_size=8,
                                  num_skips=num_skips,
                                  skip_window=skip_window)
    print('\nWith num_skips = {0} and skip_window = {1}'.format(num_skips,skip_window))
    print('    batch:',[reverse_dictionary[bi] for bi in batch])
    print('    labels:',[reverse_dictionary[li] for li in labels.reshape(8)])
    for i in range(8):
        print(batch[i],'->',labels[i,0],'<===>',
            reverse_dictionary[batch[i]],'->',reverse_dictionary[labels[i,0]])

data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

With num_skips = 1 and skip_window = 1
    batch: ['originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used']
    labels: ['anarchism', 'a', 'term', 'a', 'term', 'of', 'used', 'against']
3084 -> 5239 === originated -> anarchism
12 -> 6 === as -> a
6 -> 195 === a -> term
195 -> 6 === term -> a
2 -> 195 === of -> term
3137 -> 2 === abuse -> of
46 -> 59 === first -> used
59 -> 156 === used -> against

With num_skips = 2 and skip_window = 1
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'a', 'originated', 'as', 'term', 'a', 'of']
3084 -> 5239 === originated -> anarchism
3084 -> 12 === originated -> as
12 -> 6 === as -> a
12 -> 3084 === as -> originated
6 -> 12 === a -> as
6 -> 195 === a -> term
195 -> 6 === term -> a
195 -> 2 === term -> of

With num_skips = 4 and skip_window = 2
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels

In [26]:
# Train a skip-gram model
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector
skip_window = 1 # How many words to consider left and right
num_skips = 2 # How many times to reuse an input to generate a label

valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window),valid_size))
num_sampled = 64

graph = tf.Graph()

with graph.as_default(),tf.device('/cpu:0'):
    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(softmax_weights, 
                                   softmax_biases, 
                                   embed,
                                   train_labels, 
                                   num_sampled, 
                                   vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities 
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    

In [27]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(
          batch_size, num_skips, skip_window)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
    if step % 2000 == 0:
        if step > 0:
            average_loss = average_loss / 2000
        # The average loss is an estimate of the loss over the last 2000 batches.
        print('Average loss at step %d: %f' % (step, average_loss))
        average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
        sim = similarity.eval()
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8 # number of nearest neighbors
            nearest = (-sim[i, :]).argsort()[1:top_k+1]
            log = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
            log = '%s %s,' % (log, close_word)
        print(log)
    final_embeddings = normalized_embeddings.eval()

Initialized


KeyboardInterrupt: 