In [2]:
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf

In [3]:
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path to a zip archive; only its first member is read.

    Returns:
        A list of string tokens obtained by splitting the member's text on
        whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Convert the raw bytes to a native string before tokenising.
    text = tf.compat.as_str(raw_bytes)
    return text.split()

In [4]:
# Location of the text8 archive. Set the TEXT8_PATH environment variable to
# point at a different copy; when it is unset the original path is used.
text8_path = os.environ.get('TEXT8_PATH','C:/Users/lejon/tensorLearn/text8.zip')
words = read_data(text8_path)
print('Data size',len(words))

Data size 17005207


In [5]:
vocabulary_size = 50000
def build_dataset(words):
    """Encode a token sequence as integer ids over a frequency-ranked vocabulary.

    Id 0 is reserved for the unknown token 'UNK'. The (vocabulary_size - 1)
    most frequent words get ids 1, 2, ... in descending frequency order, and
    every other word is mapped to id 0.

    A literal 'UNK' token in the input is folded into the reserved slot.
    Letting it compete for a slot of its own would reassign the reserved
    entry, leaving id 0 missing from the reverse mapping and giving two words
    the same id.

    Args:
        words: Re-iterable sequence of string tokens.

    Returns:
        A tuple (data, count, dictionary, reverse_dictionary):
            data: list of ids, one per input token.
            count: [['UNK', n_unknown], (word, frequency), ...] in id order.
            dictionary: word -> id.
            reverse_dictionary: id -> word.
    """
    unk_token = 'UNK'
    frequencies = collections.Counter(words)
    # Keep the reserved slot unique: a literal 'UNK' is treated as unknown.
    frequencies.pop(unk_token, None)

    count = [[unk_token,-1]]
    count.extend(frequencies.most_common(vocabulary_size - 1))

    # Position in `count` is the word id.
    dictionary = {word: index for index,(word,_) in enumerate(count)}
    data = [dictionary.get(word,0) for word in words]

    # Every token encoded as 0 is an unknown (out-of-vocabulary) token.
    count[0][1] = data.count(0)
    reverse_dictionary = {index: word for word,index in dictionary.items()}
    return data,count,dictionary,reverse_dictionary

In [6]:
data,count,dictionary,reverse_dictionary = build_dataset(words)
# The raw token list is not used after encoding; drop the reference to it.
# Re-running this cell therefore requires re-running the loading cell first.
del words
print('Most common words (+UNK)',count[:5])
# Show the first ten ids next to the words they decode to.
sample_ids = data[:10]
print('Sample data',sample_ids,[reverse_dictionary[token_id] for token_id in sample_ids])

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [7]:
data_index = 0
def generate_batch(batch_size,num_skips,skip_window):
    """Build one skip-gram training batch from the global ``data`` sequence.

    A window of ``2*skip_window + 1`` consecutive ids slides over ``data``,
    wrapping around at the end. For each window position the centre id is
    paired with ``num_skips`` distinct, randomly chosen other ids from the
    window.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Number of context ids drawn per centre id; at most
            ``2*skip_window``.
        skip_window: Number of ids considered on each side of the centre.

    Returns:
        (batch, labels): ``batch`` is an int32 array of shape (batch_size,)
        holding centre ids; ``labels`` is an int32 array of shape
        (batch_size, 1) holding the matching context ids.

    Raises:
        AssertionError: If the batch_size/num_skips/skip_window constraints
            above are violated.

    Side effects:
        Advances the global ``data_index`` cursor.
    """
    global data_index
    assert batch_size % num_skips==0
    assert num_skips <= 2*skip_window
    batch = np.ndarray(shape=(batch_size),dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1),dtype=np.int32)
    span = 2*skip_window + 1
    # The deque drops its oldest element on append, so it acts as the sliding window.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
    for i in range(batch_size//num_skips):
        target = skip_window
        # The centre position itself is never a valid context target.
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Rejection-sample a window position that has not been used yet.
            while target in targets_to_avoid:
                target = random.randint(0,span-1)
            targets_to_avoid.append(target)
            batch[i*num_skips+j] = buffer[skip_window]
            labels[i*num_skips+j,0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
    # The window was advanced once more after the last centre was emitted, and
    # the next call refills the window starting at data_index. Step back by one
    # window so the next batch continues with the very next centre word instead
    # of skipping `span` of them.
    data_index = (data_index + len(data) - span) % len(data)
    return batch,labels

In [8]:
# Draw a small demonstration batch: 8 (centre, context) pairs, 2 per centre
# word, taking context from 1 position on either side of the centre.
batch,labels = generate_batch(batch_size=8,num_skips=2,skip_window=1)

In [9]:
# Print each demo pair as "<centre id> <centre word> -> <context id> <context word>".
for row in range(8):
    centre_id = batch[row]
    context_id = labels[row,0]
    print(centre_id,reverse_dictionary[centre_id],'->',context_id,reverse_dictionary[context_id])

3081 originated -> 5234 anarchism
3081 originated -> 12 as
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a


In [10]:
# Number of (centre, context) pairs per training step; also fixes the shape of
# the train_inputs / train_labels placeholders.
batch_size = 128
# Width of each embedding vector (second dimension of the embedding matrix).
embedding_size = 128
# Context positions considered on each side of the centre word.
skip_window = 1
# Context words sampled per centre word (generate_batch requires <= 2*skip_window).
num_skips = 2
# Number of words whose nearest neighbours are printed during training.
valid_size = 16
# Validation words are drawn from ids 0..valid_window-1. build_dataset assigns
# ids in descending frequency order, so these are the most frequent words.
valid_window = 100
# Distinct random ids in [0, valid_window). No seed is set, so the selection
# differs between runs.
valid_examples = np.random.choice(valid_window,valid_size,replace=False)
# Passed as num_sampled to tf.nn.nce_loss.
num_sampled = 64

In [11]:
graph = tf.Graph()
with graph.as_default():
    # Inputs fed on every training step: centre-word ids and their context-word ids.
    train_inputs = tf.placeholder(tf.int32,shape=[batch_size])
    train_labels = tf.placeholder(tf.int32,shape=[batch_size,1])
    # Fixed set of word ids whose nearest neighbours are inspected during training.
    valid_dataset = tf.constant(valid_examples,dtype=tf.int32)

    with tf.device('/cpu:0'):
        # One embedding_size-dimensional vector per vocabulary entry, initialised uniformly in [-1, 1).
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
    embed = tf.nn.embedding_lookup(embeddings,train_inputs) # select the embedding rows for the ids in train_inputs

    # Output-layer parameters used by the NCE loss.
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1.0/math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Mean NCE loss over the batch.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels = train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) # learning rate is 1.0
    # Per-row L2 norm: square, sum along axis 1, then take the square root.
    # `keepdims` replaces the deprecated `keep_dims`; keeping the reduced axis
    # lets the division below broadcast across each row.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims=True))
    normalized_embeddings = embeddings/norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset) # rows of normalized_embeddings for the validation ids
    # Rows are unit length, so this dot product is the cosine similarity between
    # each validation word and every vocabulary word.
    similarity = tf.matmul(valid_embeddings,normalized_embeddings,transpose_b=True)
    init = tf.global_variables_initializer()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
num_steps = 100001
with tf.Session(graph=graph) as session:
    # Run the variable initialiser before any training step.
    init.run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_inputs,batch_labels = generate_batch(batch_size,num_skips,skip_window)
        _,loss_val = session.run(
            [optimizer,loss],
            feed_dict={train_inputs:batch_inputs,train_labels:batch_labels})
        average_loss += loss_val

        # Every 2000 steps report the mean loss since the previous report.
        # At step 0 the value printed is the loss of that single step.
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step',step,":",average_loss)
            average_loss = 0

        # Every 10000 steps list the closest words to each validation word.
        if step % 10000 == 0:
            sim = similarity.eval()
            top_k = 8
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity; the slice starts at 1 to skip
                # the first entry (expected to be the word itself).
                nearest = (-sim[i,:]).argsort()[1:top_k+1]
                neighbours = "".join(" %s,"%reverse_dictionary[nearest[k]] for k in range(top_k))
                print("Nearest to %s:"%valid_word + neighbours)
    # Snapshot the trained, row-normalised embeddings while the session is still open.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0 : 261.0079345703125
Nearest to see: accorded, eschenbach, billy, divide, townsfolk, watching, diaries, conjunctive,
Nearest to i: connoisseurs, sockets, zia, interchangeably, osz, whitehall, kirwan, manufactures,
Nearest to it: converts, paget, preemptive, academic, refrigeration, boer, berio, galvanized,
Nearest to history: none, ooze, quae, instructive, noise, walsh, monistic, chaos,
Nearest to no: cooks, ger, control, piecemeal, airport, murakami, nissan, tivoli,
Nearest to more: abstraction, holloway, bob, mclean, gasses, exhausting, mint, dependency,
Nearest to from: intra, dribble, speer, milestones, unreal, proficiency, andrewes, pharyngeal,
Nearest to two: bitumen, aware, send, prior, ursinus, hydrogenation, samplers, undetectable,
Nearest to in: taland, gambia, greeted, rebuked, hdi, withheld, jive, underarm,
Nearest to war: archer, napoleonic, amazed, praise, loan, tribes, equalizer, claimant,
Nearest to they: tullius, idiopathic, look, iamb

Average loss at step 52000 : 5.166574693441391
Average loss at step 54000 : 5.135505759596825
Average loss at step 56000 : 5.071027046322823
Average loss at step 58000 : 5.1111913626194
Average loss at step 60000 : 4.939165663301945
Nearest to see: mishnayot, ssbn, but, prism, watching, and, michelob, paints,
Nearest to i: warships, we, connoisseurs, ii, g, interferon, arts, microbats,
Nearest to it: he, this, there, prism, which, they, saguinus, tamarin,
Nearest to history: aquila, agouti, dasyprocta, pathogen, wct, akh, recitative, hemoglobin,
Nearest to no: blockade, pietro, some, a, cooks, airport, control, primigenius,
Nearest to more: less, parity, cc, dependency, otimes, expansion, microcebus, vous,
Nearest to from: in, into, and, vernal, antiderivative, by, speer, on,
Nearest to two: three, four, six, five, one, eight, seven, tamarin,
Nearest to in: at, and, from, on, aba, agouti, bos, tamarin,
Nearest to war: library, tribes, napoleonic, agouti, beeb, naaman, phi, exterminate,

In [None]:
def plot_with_labels(low_dim_embs,labels,filename='tsne.png'):
    """Scatter-plot 2-D points with a text label next to each and save the figure.

    Args:
        low_dim_embs: Array with one two-component row per point; must have at
            least as many rows as there are labels.
        labels: Sequence of label strings; labels[i] annotates row i.
        filename: Path the figure is written to.

    Raises:
        AssertionError: If there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels),'More labels than embeddings'
    import matplotlib.pyplot as plt
    plt.figure(figsize=(18,18))
    # Placement shared by every annotation: offset slightly from its point.
    annotation_style = dict(xytext=(5,2),
                            textcoords='offset points',
                            ha='right',va='bottom')
    for row,text in enumerate(labels):
        x_pos,y_pos = low_dim_embs[row,:]
        plt.scatter(x_pos,y_pos)
        plt.annotate(text,xy=(x_pos,y_pos),**annotation_style)
    plt.savefig(filename)

In [None]:
# Project the embeddings of the first `plot_only` word ids to 2-D with t-SNE
# and save a labelled scatter plot.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
tsne = TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
# Ids 0..plot_only-1 are 'UNK' plus the most frequent words, because ids are
# assigned in descending frequency order.
plot_only = 100
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
# NOTE: this rebinds the global name `labels`, which earlier held the demo
# batch's context ids.
labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)

In [None]:
# Display the figure drawn by plot_with_labels (it has already been saved to disk).
plt.show()