In [1]:
import helper as utils
import tensorflow as tf
import numpy as np
import collections
import random
import math
import re
import os

from nltk.corpus import stopwords

from tensorflow.contrib.tensorboard.plugins import projector

# 1.
Load data

In [2]:
import pandas as pd
import pickle

In [3]:
trainX, _, testX, _ = utils.load_data('reviews_rus.pkl')

In [4]:
corpus = trainX + testX

In [5]:
corpus = [list(filter(lambda x: x != '' and len(x) > 1, re.split('[^а-я]', line.lower()))  ) for line in corpus]

In [6]:
corpus = [word for sent in corpus for word in sent]

In [7]:
print(corpus[:10])
print(len(corpus))

['дизайн', 'приятные', 'габариты', 'удобно', 'лежит', 'руке', 'бесплатные', 'приложения', 'игры', 'хотя']
1527876


In [8]:
rus_stops = stopwords.words('russian')

In [9]:
corpus = [word for word in corpus if word not in rus_stops]

In [10]:
from collections import Counter

In [11]:
vocab = Counter(corpus)

In [12]:
len(vocab)

51809

# 2.
Build the dictionary and replace rare words with `UNK`token

In [13]:
vocab_size = 1000

In [14]:
data, i2w = utils.build_dataset(corpus, vocab_size)

In [15]:
print('Sample data', data[:10], [i2w[i] for i in data[:10]])

Sample data [23, 0, 0, 27, 95, 65, 0, 29, 59, 21] ['дизайн', 'UNK', 'UNK', 'удобно', 'лежит', 'руке', 'UNK', 'приложения', 'игры', 'хотя']


In [16]:
def gen_batch(batch_size, num_skips, wing):
    x, y = [],[]
    assert batch_size <= len(data) - 2*wing
    assert batch_size % num_skips == 0

    target_idcs = random.sample(range(wing,len(data)-wing), batch_size//num_skips)
    for t_idx in target_idcs:
        for _ in range(num_skips):
            c_idx = random.sample( list(range (t_idx - wing, t_idx) ) + list( range (t_idx+1, t_idx+wing+1) ),1 )
            x.append(data[t_idx])
            y.append(data[c_idx[0]])

    x = np.array(x, dtype=np.int32)
    y = np.array(y, dtype=np.int32).reshape(-1,1)

    return x, y

In [17]:
batch, labels = gen_batch(batch_size=8, num_skips=2, wing=2)
for b, l in zip(batch,labels): # range(8):
    print(b, i2w[b], '->', l[0],
          i2w[l[0]])

129 кнопка -> 81 тормозит
129 кнопка -> 0 UNK
46 пользуюсь -> 512 играть
46 пользуюсь -> 512 играть
0 UNK -> 60 крышка
0 UNK -> 24 быстро
400 свой -> 0 UNK
400 свой -> 1 телефон


In [18]:
log_dir = 'reviews_ru_log'
if not os.path.exists(log_dir):
    print('making the directory.')
    os.makedirs(log_dir)

In [19]:
batch_size = 64
embedding_size = 50  # Dimension of the embedding vector.
skip_window = 2  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 32  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 4  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

tf.reset_default_graph()

graph = tf.Graph()

with graph.as_default():

    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size],
                                                          stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocab_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                             biases=nce_biases,
                                             labels=train_labels,
                                             inputs=embed,
                                             num_sampled=num_sampled,
                                             num_classes=vocab_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
#         optimizer = tf.train.AdamOptimizer().minimize(loss)

    # Compute the cosine similarity between minibatch examples and all
    # embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [20]:
num_steps = 300000

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(logdir=log_dir, graph=session.graph)
    

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = gen_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned
        # "summary" variable. Feed metadata variable to session for visualizing
        # the graph in TensorBoard.
        _, summary, loss_val = session.run([optimizer, merged, loss], 
                                           feed_dict=feed_dict,
                                           run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 20000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = i2w[valid_examples[i]]
                top_k = 4  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = i2w[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    with open(os.path.join(log_dir, log_dir, 'metadata.tsv'), 'w') as f:
        for i in range(vocab_size):
            f.write(i2w[i] + '\n')

    # Save the model for checkpoints.
    save_path = os.path.join(log_dir, 'model.ckpt')
#     print(save_path)
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in
    # TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    projector_path = os.path.join(log_dir, 'metadata.tsv')
#     print(projector_path)
    embedding_conf.metadata_path = projector_path
    projector.visualize_embeddings(writer, config)

writer.close()

Initialized
Average loss at step  0 :  95.57229614257812
Nearest to нужно: шустро, жутко, обратно, сборка,
Nearest to года: столько, новые, чехле, стоит,
Nearest to работы: лежит, плохо, слышно, возможности,
Nearest to мало: некоторых, жизни, идет, пару,
Average loss at step  2000 :  6.79695642375946
Average loss at step  4000 :  3.7111902314424516
Average loss at step  6000 :  3.6890621777772905
Average loss at step  8000 :  3.6613554170131684
Average loss at step  10000 :  3.6440309468507768
Average loss at step  12000 :  3.6376683564186094
Average loss at step  14000 :  3.6353981994390487
Average loss at step  16000 :  3.6256597045660017
Average loss at step  18000 :  3.611668218612671
Average loss at step  20000 :  3.605453843832016
Nearest to нужно: UNK, телефоны, начинает, это,
Nearest to года: год, стал, двух, телефоном,
Nearest to работы: это, выше, использую, лежит,
Nearest to мало: внутренней, слабый, зарядку, UNK,
Average loss at step  22000 :  3.603331997990608
Average loss

Average loss at step  224000 :  3.481088665604591
Average loss at step  226000 :  3.479821900486946
Average loss at step  228000 :  3.482370329380035
Average loss at step  230000 :  3.4886057900190353
Average loss at step  232000 :  3.4772582433223724
Average loss at step  234000 :  3.4817662984132767
Average loss at step  236000 :  3.4746250557899474
Average loss at step  238000 :  3.481174899339676
Average loss at step  240000 :  3.4814167448282243
Nearest to нужно: приходится, возможность, иначе, понятно,
Nearest to года: месяца, год, полгода, лет,
Nearest to работы: автономность, скорость, работе, аккумулятора,
Nearest to мало: встроенной, нету, внутренней, количество,
Average loss at step  242000 :  3.477795501470566
Average loss at step  244000 :  3.4717944071292877
Average loss at step  246000 :  3.479409617304802
Average loss at step  248000 :  3.474868056178093
Average loss at step  250000 :  3.4774734518527985
Average loss at step  252000 :  3.4704763680696487
Average loss at

In [None]:
with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint(log_dir))
    embed_mat = sess.run(embeddings)

In [None]:
type(embed_mat)

In [None]:
print(embed_mat.shape)

In [None]:
embed_mat[0]