In [1]:
import numpy as np

import tensorflow as tf
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Toy corpus: two short sentences made of Russian number words.
data = [
    'раз два три четыре пять шесть семь',
    'раз два пять восемь',
]

In [3]:
# Tokenizer created with default arguments; it is fitted on the corpus in the next cell.
tokenizer = Tokenizer()

In [4]:
# Build the word -> index vocabulary from the toy corpus.
tokenizer.fit_on_texts(data)

In [5]:
# Reserve index 0 for the `NULL` padding token; the fitted words are numbered from 1.
tokenizer.word_index['NULL'] = 0
tokenizer.word_index

{'раз': 1,
 'два': 2,
 'пять': 3,
 'три': 4,
 'четыре': 5,
 'шесть': 6,
 'семь': 7,
 'восемь': 8,
 'NULL': 0}

In [6]:
# Random stand-in for pre-trained vectors: one 8-dimensional vector per word index.
# A dedicated, seeded RandomState keeps the vectors identical across kernel
# restarts without touching NumPy's global random state.
embedding_rng = np.random.RandomState(42)
embeddings = dict()

for idx in tokenizer.word_index.values():
    if idx not in embeddings:  # draw exactly one vector per index
        embeddings[idx] = embedding_rng.normal(size=(8,))

In [7]:
# Stack the vectors into one matrix whose row i is the vector of word index i.
# Relies on the indices being exactly 0..len(embeddings)-1 (a gap would raise KeyError).
pre_embedding = np.array([embeddings[i] for i in range(len(embeddings))])

In [8]:
def window_generator(text, le, ri):
    """
    Yield (center, context) pairs for every position of a tokenized text.

    text: sequence of <int> word indices (list, tuple or 1-D np.array)
    le: <int>: left window (number of context words before the center)
    ri: <int>: right window (number of context words after the center)

    Yields (center_word_idx, context_word_idxs), where the context is a tuple
    of le + ri indices. Positions that fall outside the text are filled with 0,
    the index used for `NULL`.

    Raises ValueError (on first iteration) if `le` or `ri` is negative.
    """

    if le < 0 or ri < 0:
        raise ValueError("window sizes must be non-negative")

    # `NULL` SIDE-PADDING
    # word_index['NULL'] = 0
    # Copy into a plain list first: `+` only concatenates lists. For an np.array
    # it is element-wise arithmetic, so the padding would be lost or the call
    # would fail.
    padded = [0] * le + list(text) + [0] * ri

    # STREAMING BATCHES
    # AS (center_word_idx, context_word_idxs) PAIRS

    for i in range(le, len(padded) - ri):
        yield (padded[i], tuple(padded[i - le:i] + padded[i + 1:i + 1 + ri]))
        
        

def batch_generator(data, tokenizer, embeddings, window_size=(1, 1)):
    """
    Stream (center_word_idx, context_word_idxs) pairs for a whole corpus.

    data: texts handed to `tokenizer.texts_to_sequences_generator`
    tokenizer: object providing `texts_to_sequences_generator`
    embeddings: not used by this function
    window_size: (left, right) window sizes forwarded to `window_generator`
    """
    sequences = tokenizer.texts_to_sequences_generator(data)
    for sequence in sequences:
        yield from window_generator(sequence, *window_size)
        
    

In [8]:
# Encode each text of the toy corpus as a list of word indices.
tokenizer.texts_to_sequences(data)

[[1, 2, 4, 5, 3, 6, 7], [1, 2, 3, 8]]

In [55]:
# Show every (center_word_idx, context_word_idxs) pair produced for the toy corpus.
for x in batch_generator(data, tokenizer, embeddings, window_size=(1, 1)):
    print(x)

(1, (0, 2))
(2, (1, 4))
(4, (2, 5))
(5, (4, 3))
(3, (5, 6))
(6, (3, 7))
(7, (6, 0))
(1, (0, 2))
(2, (1, 3))
(3, (2, 8))
(8, (3, 0))


In [101]:
# Release the session left over from a previous run of the notebook, if any.
# On a fresh kernel `tf_sess` is not defined yet, and an unguarded
# `tf_sess.close()` raises NameError.
if 'tf_sess' in globals():
    tf_sess.close()

# Only then discard the old graph, so the graph-building cell starts from an
# empty default graph when it is re-run.
tf.reset_default_graph()

In [102]:
# HYPER-PARAMETERS
# Vocabulary size = number of rows of the pre-trained matrix (one per word
# index, `NULL` included), so it cannot drift from the tokenizer's vocabulary.
tf_vocab_size = pre_embedding.shape[0]
tf_fitting_embed_size = 8
tf_batch_size = 1
tf_window_size = 2  # window_left + window_right
tf_rnn_state_size = 16
tf_embedding_dense_size = 32  # not referenced by the graph below
tf_pre_embedding = tf.constant(pre_embedding, dtype=tf.float32)

# INPUTS: context word indices and the index of the center word
tf_input_context = tf.placeholder(dtype=tf.int32, shape=(tf_batch_size, tf_window_size))
tf_input_labels = tf.placeholder(dtype=tf.int32, shape=(tf_batch_size,))

# TRAINABLE PARAMETERS
tf_fitting_embedding = tf.Variable(tf.truncated_normal(shape=(tf_vocab_size, tf_fitting_embed_size), stddev=0.1))
tf_W_out = tf.Variable(tf.truncated_normal(shape=(tf_rnn_state_size * 2 + tf_fitting_embed_size, tf_vocab_size), stddev=0.1))
tf_b_out = tf.Variable(tf.zeros(shape=(tf_vocab_size,)))


# EMBEDDING TENSOR [batch_size x window_size x pre_embed_size]

with tf.device("/cpu:0"):
    tf_context_pre_embed = tf.nn.embedding_lookup(tf_pre_embedding, tf_input_context)

# BIDIRECTIONAL LSTM OVER THE CONTEXT WINDOW
# No initial state is built by hand: given `dtype` and no explicit initial
# states, bidirectional_dynamic_rnn starts both directions from zero states.
tf_rnn_cell_fw = tf.nn.rnn_cell.LSTMCell(tf_rnn_state_size)
tf_rnn_cell_bw = tf.nn.rnn_cell.LSTMCell(tf_rnn_state_size)

tf_rnn_outputs, tf_rnn_states = tf.nn.bidirectional_dynamic_rnn(
    dtype=tf.float32,
    cell_fw=tf_rnn_cell_fw,
    cell_bw=tf_rnn_cell_bw,
    inputs=tf_context_pre_embed
)

with tf.device("/cpu:0"):
    tf_context_fitting_embed = tf.nn.embedding_lookup(tf_fitting_embedding, tf_input_labels)

# CONCAT FW AND BW FINAL LSTM CELL STATES (`.c`) AND FITTING EMBED OF TARGETS
# [batch_size x RNN_STATE_SIZE * 2 + FITTING_EMBED_SIZE]
# NOTE(review): `.c` is the LSTM cell state; if the hidden output was intended, use `.h`.
# NOTE(review): `tf_input_labels` is both embedded as an input feature here and
# used as the target of the loss below -- confirm this is intended.

tf_final_state = tf.concat(
    [tf_rnn_states[0].c, tf_rnn_states[1].c, tf_context_fitting_embed],
    axis=1
)

# LOGITS OVER THE VOCABULARY [batch_size x vocab_size]
tf_logits = tf.matmul(tf_final_state, tf_W_out) + tf_b_out

# PER-EXAMPLE CROSS-ENTROPY [batch_size] (not reduced to a scalar here)
tf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf_input_labels,
    logits=tf_logits
)

In [103]:
# Create the initializer op for the graph's variables, open a session and run it.
tf_init_op = tf.global_variables_initializer()
tf_sess = tf.Session()
tf_sess.run(tf_init_op)

In [104]:
# Smoke-test the graph on a single example: context indices [0, 2], center index 1.
r = tf_sess.run(
    tf_final_state,
    feed_dict={
        tf_input_context: np.array([[0, 2]]),
        tf_input_labels: np.array([1]),
    },
)

In [105]:
# Expected (1, 40): batch of 1, and 40 = 2 * 16 (fw + bw cell states) + 8 (fitting embedding).
r.shape

(1, 40)

In [40]:
# `r` is a plain np.array (the fetched `tf_final_state`), which has no `.c`
# attribute. Its first `tf_rnn_state_size` columns are the forward LSTM cell state.
r[:, :tf_rnn_state_size]

array([[ 0.05075238, -0.2744412 , -0.23570031, -0.01703531, -0.00418064,
         0.34944135,  0.30708185, -0.15840204,  0.632811  , -0.20224315,
         0.20672935, -0.3125187 ,  0.16155767,  0.38695666, -0.1943923 ,
         0.20426655]], dtype=float32)

In [47]:
# Number of embedding vectors: the 8 corpus words plus `NULL`.
len(embeddings)

9

In [72]:
# Scratch check: concatenating two [1 x 1] constants along axis 1 gives one [1 x 2] row.
tf.concat(
    [
        tf.constant(np.array([[1]])),
        tf.constant(np.array([[2]])),
    ],
    axis=1,
).eval(session=tf_sess)

array([[1, 2]])