In [76]:
import tensorflow as tf
import zipfile
import urllib
import os
import collections
import numpy as np
import random

In [40]:
# data

url = 'http://mattmahoney.net/dc/text8.zip'
data_path = '/tmp/text8.zip'
if not os.path.exists(data_path):
    name, _ = urllib.request.urlretrieve(url)
    print('download')

In [44]:
# unzip
with zipfile.ZipFile(data_path) as z:
    text_words = z.read(z.namelist()[0]).lower().split()

In [59]:
max_len = 1000
min_cur = 2

In [60]:
count

[('UNK', 15350134), (b'the', 1061396), (b'of', 593677)]

In [68]:
count = [('UNK', -1)]
count.extend(collections.Counter(text_words).most_common(max_len - 1))

for i in range(len(count)-1, -1, -1):
    if count[i][1]<min_cur:
        count.pop(i)
    else:
        break

vocabulary_size = len(count)
word2id = dict()
for i, (w, _) in enumerate(count):
    word2id[w] = i
    
data=[]
unk = 0
for w in text_words:
    ind = word2id.get(w, 0)
    if ind == 0:
        unk += 1
    data.append(ind)
    
count[0]=('UNK', unk)
id2word = dict(zip(word2id.values(), word2id.keys()))

In [86]:
data_index = 0
# Generate training batch for the skip-gram model. skip_gram 是一推多
def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # get window size (words left and right + current one).
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
#     print(data_index)
    buffer.extend(data[data_index:data_index + span])
#     print(buffer)
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
#         print("----\n", i)
#         print(buffer)
#         print(data_index)
#         print(batch)
#         print(labels)
        
    # Backtrack a little bit to avoid skipping words in the end of a batch.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [90]:
dim = 16
num_sampled = 4

In [162]:
##
embedding = tf.Variable(tf.random.normal([vocabulary_size, dim]))
nce_weight = tf.Variable(tf.random.normal([vocabulary_size, dim]))
nce_biase = tf.Variable(tf.zeros([vocabulary_size]))

def lookup_embedding(w):
    return tf.nn.embedding_lookup(embedding, w)

# loss
def nce_loss(logits, y):
    y = tf.cast(y, tf.int64)
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weight,
            biases=nce_biase,
            labels=y,
            inputs=logits,
            num_sampled= num_sampled,
            num_classes=vocabulary_size))
    return loss

# eva
def evaluate(x_emb, y):
    logits = tf.matmul(x_emb, nce_weight, transpose_b=True)
    logits = tf.add(logits, nce_biase)
    y_ont_hot =  tf.squeeze(tf.one_hot(y, vocabulary_size), axis=1)
    print('logits', logits.shape, 'y', y_ont_hot.shape)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=y_ont_hot,
    logits=logits)
    loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))
    
    #cosine
    x_embed = tf.cast(x_emb, tf.float32)
    x_embed_norm = x_embed/tf.sqrt(tf.reduce_sum(tf.square(x_embed)))
    embedding_norm = embedding/tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True), tf.float32)
    cosine = tf.matmul(x_embed, embedding_norm, transpose_b=True)
    return loss, cosine

#optimizer
optimizer=tf.optimizers.SGD(0.01)

In [155]:
def train_step(x, y):
    with tf.GradientTape() as tape:
        x_emb = lookup_embedding(x)
        loss = nce_loss(x_emb, y)
        print('losss', loss)
        
    grads = tape.gradient(loss, [embedding, nce_weight, nce_biase])
    optimizer.apply_gradients(zip(grads, [embedding, nce_weight, nce_biase]))
    return loss

In [156]:
batch_size = 64
num_skips = 2
skip_windows = 2

In [163]:
for i in range(10):
    (x_train, y_train) = next_batch(batch_size, num_skips, skip_windows)
    loss = train_step(x_train, y_train)
    print('train loss:', loss)
    
    (x_test, y_test) = next_batch(batch_size, num_skips, skip_windows)
    x_emb = lookup_embedding(x_test)
    loss = nce_loss(x_emb, y_test)
    print('test loss:', loss)
    
    loss, cos = evaluate(x_emb, y_test)
    print('loss', loss, 'cos', cos)

losss tf.Tensor(19.069523, shape=(), dtype=float32)
train loss: tf.Tensor(19.069523, shape=(), dtype=float32)
test loss: tf.Tensor(19.020855, shape=(), dtype=float32)
logits (64, 1000) y (64, 1000)
loss tf.Tensor(1792.5254, shape=(), dtype=float32) cos tf.Tensor(
[[ 4.2837744e+00  1.2748723e+00 -1.0769682e+00 ...  4.0832368e-01
  -6.1532879e-01 -2.4855095e-01]
 [ 4.2837744e+00  1.2748723e+00 -1.0769682e+00 ...  4.0832368e-01
  -6.1532879e-01 -2.4855095e-01]
 [-1.8320662e+00 -1.7449287e+00 -7.1548325e-01 ...  1.4244804e-01
   5.1303184e-01 -5.6311756e-01]
 ...
 [-1.1685427e+00 -1.6191188e+00 -3.5412398e-01 ...  1.2623150e+00
  -4.4060358e-01 -8.9376330e-02]
 [ 4.1598555e-01  1.9969952e-01  4.8482502e-01 ... -9.2368358e-01
  -1.5941393e-01 -2.8106570e-03]
 [ 4.1598555e-01  1.9969952e-01  4.8482502e-01 ... -9.2368358e-01
  -1.5941393e-01 -2.8106570e-03]], shape=(64, 1000), dtype=float32)
losss tf.Tensor(24.662834, shape=(), dtype=float32)
train loss: tf.Tensor(24.662834, shape=(), dtype=f