예제

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py

tensorflow 문서를 보면

이후에 튜토리얼에서는 코드를 보여줄 것이나, 좀 더 자세히 알고 싶다면 tensorflow/examples/tutorials/word2vec/word2vec_basic.py 의 최소화된 구현을 참고하자. 이 기본 예제는 특정 데이터를 다운로드 하기 위해 필요한 코드, 이것을 약간 학습하기 위한 코드, 그리고 결과를 시각화하기 위한 코드를 포함한다. 기본 버전을 읽고 실행하는 데 익숙해지면, 쓰레드를 이용하여 어떻게 효율적으로 데이터를 텍스트 모델로 이동시키는지, 학습하는 동안 어떻게 체크하는지 등에 대한 좀 더 심화된 TensorFlow 원리들을 보여주는 심화 구현된 tensorflow_models/tutorials/embedding/word2vec.py 을 시작할 수 있다.


그래서 위 예제를 먼저 시작했다.

모르는 것을 검색하며 보다가 좋은 자료를 많이 찾았는데

> 좀 더 상세하고 자세히 코드리뷰를 한 http://pythonkim.tistory.com/93 블로그.

> Word Embedding이나 기본적인 이론에 관한 부분이 잘 정리된 기본 텐서플로우 문서.  
> https://tensorflowkorea.gitbooks.io/tensorflow-kr/g3doc/tutorials/word2vec/

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
# Base URL; maybe_download() appends the requested file name to it.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
  """Fetch `filename` unless it already exists locally, then check its size.

  Args:
    filename: Local path of the file; also the suffix appended to the
      module-level `url` when the file has to be downloaded.
    expected_bytes: Exact size in bytes the file on disk must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: If the size on disk differs from `expected_bytes`.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urllib.request.urlretrieve(url + filename, filename)

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size that was actually found to help diagnose a bad download.
    print(actual_bytes)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified', filename)
  return filename

# Fetch text8.zip if it is not on disk yet and require it to be exactly
# 31344016 bytes long.
filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
def read_data(filename):
  """Return the whitespace-separated words of the first member of a zip file.

  Args:
    filename: Path of the zip archive to open.

  Returns:
    A list of strings, one per whitespace-delimited token.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  return tf.compat.as_str(raw_bytes).split()

# `vocabulary` is the whole corpus as an ordered list of word tokens
# (duplicates included), not a set of distinct words.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Number of ids to keep, counting id 0 which is reserved for 'UNK'.
vocabulary_size = 50000


def build_dataset(words, n_words):
  """Encode a token sequence as integer ids over a frequency-capped vocabulary.

  Id 0 is reserved for the 'UNK' placeholder. The `n_words - 1` most frequent
  words receive ids 1, 2, ... in order of decreasing frequency; every other
  word is encoded as 0.

  A literal 'UNK' token in `words` is folded into the placeholder. Letting it
  compete for a regular slot would overwrite the placeholder's entry, make two
  words share one id and leave id 0 missing from the reverse mapping.

  Args:
    words: Sequence of word tokens.
    n_words: Vocabulary size, including the 'UNK' placeholder.

  Returns:
    A tuple `(data, count, dictionary, reversed_dictionary)`:
      data: list with the id of every element of `words`, in order.
      count: `['UNK', n_unknown]` followed by `(word, frequency)` tuples.
      dictionary: dict mapping word -> id.
      reversed_dictionary: dict mapping id -> word.
  """
  frequencies = collections.Counter(words)
  # Keep the placeholder unique: a literal 'UNK' must not get a second slot.
  frequencies.pop('UNK', None)

  count = [['UNK', -1]]
  count.extend(frequencies.most_common(n_words - 1))

  # Position in `count` is the id, so 'UNK' is 0 and rarer words get larger ids.
  dictionary = {word: index for index, (word, _) in enumerate(count)}

  # Out-of-vocabulary words fall back to 0, i.e. dictionary['UNK'].
  data = [dictionary.get(word, 0) for word in words]

  # Only unknown words (and literal 'UNK' tokens) are encoded as 0.
  count[0][1] = data.count(0)

  reversed_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reversed_dictionary

# data: the corpus as ids; count: word frequencies with UNK first;
# dictionary: word -> id; reverse_dictionary: id -> word.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Read cursor into `data`; generate_batch() advances it across calls.
data_index = 0


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Produce one skip-gram minibatch from the module-level `data` sequence.

  A window of `2 * skip_window + 1` consecutive ids slides over `data`,
  starting at the module-level cursor `data_index`. For each window position
  the centre id is paired with `num_skips` distinct, randomly chosen other
  positions of the window.

  Args:
    batch_size: Number of (input, label) pairs; must be a multiple of
      `num_skips`.
    num_skips: Pairs generated per centre word; at most `2 * skip_window`.
    skip_window: Number of words considered on each side of the centre.

  Returns:
    A tuple `(batch, labels)` of int32 arrays with shapes `(batch_size,)` and
    `(batch_size, 1)` holding the centre ids and their context ids.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window

  center = skip_window
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

  # Bounded deque: appending to a full window drops its oldest id.
  window = collections.deque(maxlen=span)
  for _ in range(span):
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  for center_no in range(batch_size // num_skips):
    used = [center]  # positions that may not be drawn (again)
    pick = center    # starts "used" so the first draw is forced
    first_row = center_no * num_skips
    for row in range(first_row, first_row + num_skips):
      while pick in used:
        pick = random.randint(0, span - 1)
      used.append(pick)
      batch[row] = window[center]
      labels[row, 0] = window[pick]
    # Slide the window one word to the right, wrapping at the end of data.
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Smoke test: draw a tiny batch and print each (centre word -> context word)
# pair as both ids and words.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.

# Number of (input, label) pairs per training step; generate_batch() requires
# it to be a multiple of num_skips.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# valid_size distinct ids drawn from range(valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

# All ops and variables of the model are created inside this dedicated graph.
graph = tf.Graph()

with graph.as_default():

  # Input data.
  # train_inputs holds one centre-word id per example and train_labels the
  # matching context-word id; both are fed from generate_batch().
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  # Fixed ids whose nearest neighbours are inspected during training.
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # One embedding_size-dimensional row per vocabulary id.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    # One weight row and one bias per vocabulary id.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between the validation examples and all
  # embeddings: every row is scaled to unit length, so the matmul below yields
  # cosines with shape [valid_size, vocabulary_size].
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.global_variables_initializer()

# Step 5: Begin training.
# One more than 100000 so that the last step (100000) also triggers both the
# loss report and the nearest-neighbour report below.
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  # Running sum of batch losses since the last report.
  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      # At step 0 the sum holds a single batch loss and is printed undivided.
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # Negating sorts by descending similarity; position 0 is skipped
        # because it is normally the query word itself (cosine 1).
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  # Snapshot the unit-length embeddings while the session is still open.
  final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  """Draw labelled 2-D points and save the figure to `filename`.

  Relies on `plt` (matplotlib.pyplot) being bound at module level when called.

  Args:
    low_dim_embs: Array indexed as `[row, :]` that yields an (x, y) pair per
      row; needs at least `len(labels)` rows.
    labels: Texts to annotate the first `len(labels)` rows with.
    filename: Path the figure is written to.
  """
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches

  # Same text placement for every point: slightly offset, right/bottom aligned.
  text_style = dict(xytext=(5, 2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
  for row, text in enumerate(labels):
    x, y = low_dim_embs[row, :]
    plt.scatter(x, y)
    plt.annotate(text, xy=(x, y), **text_style)

  plt.savefig(filename)

# The plotting dependencies are optional: they are imported inside the try so
# that a missing package only prints a hint instead of aborting the script.
try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  # plot_with_labels() picks up this module-level `plt` binding.
  import matplotlib.pyplot as plt

  # NOTE(review): recent scikit-learn releases appear to rename `n_iter` to
  # `max_iter` — confirm against the installed version.
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  # Project and plot only ids 0..499, i.e. UNK and the most frequent words.
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')

Found and verified text8.zip
Data size 17005207
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5244, 3083, 12, 6, 195, 2, 3136, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
3083 originated -> 5244 anarchism
3083 originated -> 12 as
12 as -> 3083 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of
Initialized
Average loss at step  0 :  274.317626953
Nearest to with: saito, moustache, webpages, pipelined, babbitt, occultism, theosophical, bryozoa,
Nearest to which: widows, adnan, macroeconomics, cricetulus, ran, emphasises, effluents, alpher,
Nearest to who: wronged, blocked, relena, legislative, vogue, lasker, doppelbock, diss,
Nearest to at: brinkley, receiver, veracruz, carolingian, sputtering, discounted, kola, triune,
Nearest to been: gallipoli, salon, quentin, widow, pricing, extremes, visionaries, adb,
Nearest to w

기네요...

Step 별로 해석을 해보겠습니다...

### Step 1: Download the data.

다운로드는 패스하고 자료 형태 확인

In [2]:
len(vocabulary)

NameError: name 'vocabulary' is not defined

위 소스에서 메모리 절약을 위해 `del vocabulary`를 했기 때문에 not defined가 뜸. 이런 팁은 배워둘 만한 것 같음

In [3]:
vocabulary = read_data(filename)

In [4]:
len(vocabulary)

17005207

In [8]:
vocabulary[0]

'anarchism'

In [10]:
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

아, 엄청 큰 글을 그냥 단어로 나눠버린거구나.. 사실 원한 그림은 paragraph2vec과 같은 거였는데 지금은 패스

### Step 2: Build the dictionary and replace rare words with UNK token.

UNK는 unknown의 약자란다.

In [14]:
words = vocabulary
n_words = 50000

In [15]:
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(n_words - 1))

In [16]:
len(count)

50000

In [18]:
count[:5]

[['UNK', -1],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764)]

단어의 빈도수(등장 횟수)겠지. 굳이 정말인가 확인해보려면...

In [23]:
# Manual cross-check of the Counter result: count how many tokens equal 'the'.
i = 0
for w in words:
    if w == 'the':
        i += 1

In [24]:
i

1061396

네 맞습니다...

In [25]:
# Assign ids in the order of `count`: the current dictionary size is the next
# free id, so 'UNK' gets 0 and more frequent words get smaller ids.
dictionary = dict()
for word, _ in count:
    dictionary[word] = len(dictionary)

In [32]:
print (dictionary['UNK'], dictionary['the'], dictionary['of'], dictionary['and'])

0 1 2 3


단어에 차례대로 숫자를 부여하는 부분이고

In [38]:
# Encode the corpus: every word becomes its id, and words outside the
# dictionary become 0 while being tallied as unknown.
data = list()
unk_count = 0
for word in words:
    if word in dictionary:
        index = dictionary[word]
    else:
        index = 0  # dictionary['UNK']
        unk_count += 1
    data.append(index)
# Replace the -1 placeholder frequency of 'UNK' with the real tally.
count[0][1] = unk_count

In [47]:
words[:3]

['anarchism', 'originated', 'as']

In [48]:
data[:3]

[5244, 3083, 12]

In [49]:
print (dictionary['anarchism'], dictionary['originated'], dictionary['as'])

5244 3083 12


저 부분은 words (vocabulary)의 각 단어에다가 위에서 부여한 단어의 번호(인덱스)를 부여하는 부분이다.


이게 어디서 쓰이는지는 밑에서 봐야겠다.

In [50]:
# Invert the word -> id mapping so ids can be turned back into words.
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
# Every other cell reads the mapping under the name `reverse_dictionary`, so
# bind that name as well; otherwise this cell's result is never the one shown.
reverse_dictionary = reversed_dictionary

In [57]:
reverse_dictionary

{0: 'UNK',
 1: 'the',
 2: 'of',
 3: 'and',
 4: 'one',
 5: 'in',
 6: 'a',
 7: 'to',
 8: 'zero',
 9: 'nine',
 10: 'two',
 11: 'is',
 12: 'as',
 13: 'eight',
 14: 'for',
 15: 's',
 16: 'five',
 17: 'three',
 18: 'was',
 19: 'by',
 20: 'that',
 21: 'four',
 22: 'six',
 23: 'seven',
 24: 'with',
 25: 'on',
 26: 'are',
 27: 'it',
 28: 'from',
 29: 'or',
 30: 'his',
 31: 'an',
 32: 'be',
 33: 'this',
 34: 'which',
 35: 'at',
 36: 'he',
 37: 'also',
 38: 'not',
 39: 'have',
 40: 'were',
 41: 'has',
 42: 'but',
 43: 'other',
 44: 'their',
 45: 'its',
 46: 'first',
 47: 'they',
 48: 'some',
 49: 'had',
 50: 'all',
 51: 'more',
 52: 'most',
 53: 'can',
 54: 'been',
 55: 'such',
 56: 'many',
 57: 'who',
 58: 'new',
 59: 'used',
 60: 'there',
 61: 'after',
 62: 'when',
 63: 'into',
 64: 'american',
 65: 'time',
 66: 'these',
 67: 'only',
 68: 'see',
 69: 'may',
 70: 'than',
 71: 'world',
 72: 'i',
 73: 'b',
 74: 'would',
 75: 'd',
 76: 'no',
 77: 'however',
 78: 'between',
 79: 'about',
 80: 'over'

### Step 3: Function to generate a training batch for the skip-gram model.

In [58]:
batch_size=8
num_skips=2
skip_window=1

In [59]:
# At module level `global` has no effect; the statement only matters inside a
# function body.
global data_index
# Each centre word contributes num_skips pairs, so the batch must divide evenly.
assert batch_size % num_skips == 0
# Only 2 * skip_window context positions exist around a centre word.
assert num_skips <= 2 * skip_window

assert는 조건이 참인지 검사하고, 거짓이면 AssertionError 예외를 발생시키는 구문이라고 생각하면 된단다.  
http://hashcode.co.kr/questions/958/assert%EB%8A%94-%EC%96%B8%EC%A0%9C-%EC%93%B0%EB%82%98%EC%9A%94

In [72]:
# np.ndarray() only allocates: the contents are uninitialised, so any values
# shown before the arrays are filled are arbitrary.
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1  # [ skip_window target skip_window ]
# Bounded deque: once it holds `span` ids, each append drops the oldest one.
buffer = collections.deque(maxlen=span)

In [73]:
batch, labels

(array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), array([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]], dtype=int32))

np.ndarray는 numpy에서 쓰는 n차원 배열객체.  
참조 http://yujuwon.tistory.com/entry/NumPy

In [173]:
# Restart from the beginning of the corpus and pre-fill the window with the
# first `span` ids; the modulo makes the cursor wrap at the end of `data`.
data_index = 0
for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

In [158]:
span, buffer, data_index

(3, deque([5244, 3083, 12]), 3)

In [159]:
data[:6]

[5244, 3083, 12, 6, 195, 2]

저 for문을 계속 돌리면 buffer에 3개씩 data의 값들이 순차적으로 들어가게 된다.

실제로 저 for문이 있는 generate_batch 함수가 Step5에서 사용되고 있다.

In [175]:
# for i in range(batch_size // num_skips):
#     target = skip_window  # target label at the center of the buffer
#     targets_to_avoid = [skip_window]
#     for j in range(num_skips):
#         while target in targets_to_avoid:
#             target = random.randint(0, span - 1)
#         targets_to_avoid.append(target)
#         batch[i * num_skips + j] = buffer[skip_window]
#         labels[i * num_skips + j, 0] = buffer[target]
#     buffer.append(data[data_index])
#     data_index = (data_index + 1) % len(data)

# Instrumented copy of the batch-filling loop: the print calls trace which
# window positions are drawn and which id enters the window next.
for i in range(batch_size // num_skips):
    print(i)
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    print ("targets_to_avoid : {}".format(targets_to_avoid))
    for j in range(num_skips):
        print (i, j)
        # Redraw until a window position is found that was not used yet.
        while target in targets_to_avoid:
            target = random.randint(0, span - 1)
        print ("target : {}".format(target))
        targets_to_avoid.append(target)
        print ("targets_to_avoid : {}".format(targets_to_avoid))
        # Input is always the centre of the window, label the drawn position.
        batch[i * num_skips + j] = buffer[skip_window]
        labels[i * num_skips + j, 0] = buffer[target]
    print("data_index : {}".format(data_index))
    print("data[data_index] : {}".format(data[data_index]))
    # Slide the window one id to the right.
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

0
targets_to_avoid : [1]
0 0
target : 0
targets_to_avoid : [1, 0]
0 1
target : 2
targets_to_avoid : [1, 0, 2]
data_index : 3
data[data_index] : 6
1
targets_to_avoid : [1]
1 0
target : 2
targets_to_avoid : [1, 2]
1 1
target : 0
targets_to_avoid : [1, 2, 0]
data_index : 4
data[data_index] : 195
2
targets_to_avoid : [1]
2 0
target : 0
targets_to_avoid : [1, 0]
2 1
target : 2
targets_to_avoid : [1, 0, 2]
data_index : 5
data[data_index] : 2
3
targets_to_avoid : [1]
3 0
target : 2
targets_to_avoid : [1, 2]
3 1
target : 0
targets_to_avoid : [1, 2, 0]
data_index : 6
data[data_index] : 3136


In [176]:
data[3], data[4], data[5], data[6]

(6, 195, 2, 3136)

In [177]:
buffer

deque([195, 2, 3136])

In [178]:
# Backtrack a little bit to avoid skipping words in the end of a batch
data_index = (data_index + len(data) - span) % len(data)

In [179]:
data_index

4

In [180]:
batch

array([3083, 3083,   12,   12,    6,    6,  195,  195], dtype=int32)

In [181]:
labels

array([[5244],
       [  12],
       [   6],
       [3083],
       [  12],
       [ 195],
       [   2],
       [   6]], dtype=int32)

결국 마지막에 generate_batch 함수는 batch, labels를 return 한다.

와, 하나도 이해가 안되는데...

In [183]:
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],'->', labels[i, 0], reverse_dictionary[labels[i, 0]])

3083 originated -> 5244 anarchism
3083 originated -> 12 as
12 as -> 6 a
12 as -> 3083 originated
6 a -> 12 as
6 a -> 195 term
195 term -> 2 of
195 term -> 6 a


In [185]:
reverse_dictionary[3083], reverse_dictionary[5244]

('originated', 'anarchism')

In [186]:
data[:7]

[5244, 3083, 12, 6, 195, 2, 3136]

아아!!  
batch, labels에는 batch_size만큼 크기가 할당되어 있고,  
for문 출력한걸 보자하니 skip_window만큼 앞 뒤 단어를 가져와서 일종의 쌍을 만들어주는 함수다.

[5244, 3083, 12] 가 연결되어 있으니 3083 -> 5244, 3083 -> 12가 출력되어 있고 계속 그런식...

In [187]:
# for i in range(batch_size // num_skips):
#     target = skip_window  # target label at the center of the buffer
#     targets_to_avoid = [skip_window]
#     for j in range(num_skips):
#         while target in targets_to_avoid:
#             target = random.randint(0, span - 1)
#         targets_to_avoid.append(target)
#         batch[i * num_skips + j] = buffer[skip_window]
#         labels[i * num_skips + j, 0] = buffer[target]
#     buffer.append(data[data_index])
#     data_index = (data_index + 1) % len(data)

# 결국 밑에 2줄은 위에서 In[173] 과 같은 부분이며 역할은 일종의 초기화 작업을 해 준 것이다.
# 파라미터는 batch_size = 8, num_skips = 2, skip_window = 1 이다.
# targets_to_avoid에는 skip_window. 즉 1이 들어가있고 그 1은 자기 자신을 뜻한다.
# j를 통한 for문을 통해 num_skips만큼 앞뒤의 단어를 가져와 batch에는 자신의 단어, labels에는 앞, 뒤 단어를 넣게 된다.

### Step 4: Build and train a skip-gram model.

In [188]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

In [193]:
# Everything created in this block is added to `graph`.
with graph.as_default():

    # Input data.
    # One centre-word id per example, and the matching context-word id.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Fixed ids whose nearest neighbours are inspected during training.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        # One embedding_size-dimensional row per vocabulary id.
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        # One weight row and one bias per vocabulary id.
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation examples and all
    # embeddings: rows are scaled to unit length, so the matmul yields cosines
    # with shape [valid_size, vocabulary_size].
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

tf.placeholder는 텐서플로우 자료형의 하나로 여러 학습 데이터를 담는 공간이다.

조대협의 텐서플로우, 자료형의 이해 - http://bcho.tistory.com/1150

In [194]:
vocabulary_size, embedding_size

(50000, 128)

random_uniform는 난수 생성기.  
tf.nn.embedding_lookup의 nn은 뉴럴네트웍스인 것 같고 embedding_lookup은 임베딩 계산을 위한 부분으로 보인다.  
문서 (https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) 참조.

nce_weights, nce_biases 둘 다 nce loss계산을 위한 변수인데  
전부 reduce_mean, nce_loss에서 쓰인다. nce 손실함수를 사용하는 것이다.

이에 대한 개념은 위에서도 언급했지만 http://solarisailab.com/archives/374 이 블로그가 도움이 되었다.

SGD optimizer를 사용하는 부분은 결국 gradient descent에 관련된 부분인데, optimization에 대한 이론은  
http://shuuki4.github.io/deep%20learning/2016/05/20/Gradient-Descent-Algorithm-Overview.html 여기에 잘 나와있다.

마지막은 minibatch 데이터와 모든 임베딩간의 코사인 유사도를 구하는 부분이다.

**Q. 코사인 유사도가 아닌 다른 방법을 쓸 수 있을까? 유클리드나 ts-ss...**

### Step 5: Begin training.

In [7]:
num_steps = 100001

In [8]:
# NOTE(review): this session is never closed in the visible cells; the `with`
# blocks further down open their own session and rebind the name `session`.
session = tf.Session(graph=graph)

In [11]:
with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')

Initialized


In [19]:
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    # Running sum of batch losses since the last report.
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()).
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 the sum holds a single batch loss and is printed undivided.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Negating sorts by descending similarity; position 0 is skipped
                # because it is normally the query word itself (cosine 1).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    
    # Snapshot the unit-length embeddings while the session is still open.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  251.43145752
Nearest to after: nonce, vintage, four, smile, encoding, powerplant, helga, silos,
Nearest to first: hank, pearse, ut, kevin, sentries, ishaq, advisers, triads,
Nearest to such: gomes, bertha, recounts, daytona, fishermen, clemens, jacobus, tested,
Nearest to had: red, regicide, elo, bfi, kluwer, homelessness, nada, rob,
Nearest to one: nuanced, homographs, dacia, rewriting, scarp, dialectical, medici, cider,
Nearest to used: och, electromagnetic, replicated, gaming, manual, universiteit, reverence, wied,
Nearest to which: klm, wynn, graziani, maidens, cmt, copernicus, diarrhoea, appellant,
Nearest to world: chiles, magee, land, customers, kwahu, osaka, poudre, oui,
Nearest to may: ascribes, clip, cheated, incurred, inductors, swordsmen, yoshi, intrepid,
Nearest to no: redundantly, kilobit, ligature, gottfried, lexis, pfa, lowercase, brahmic,
Nearest to american: goryeo, overseas, sara, leif, pesos, adad, vicious, saya,
Nearest to abo

### Step 6: Visualize the embeddings.

6은 패스...  
아직 내가 당장 필요하지는 않다.

### 후기

솔직히 어렵다... 아무것도 없는 상태에서 이런걸 짜라고 하면 짤 수 있을까?...  
자신이 없다