In [169]:
import numpy as np
import re

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Input, Embedding, Dense, LSTM, Bidirectional, Concatenate, Reshape, Lambda
from keras import backend as K

In [100]:
from sklearn.datasets import fetch_20newsgroups

In [110]:
def text_normalize(text):
    """
    Reduce raw text to lower-case word characters separated by single spaces.

    text: <str>: raw document

    return: <str>: lower-cased text in which newlines/tabs/carriage returns
            act as spaces, every other non-word character (anything outside
            `\\w`) is deleted, runs of spaces are collapsed to one space and
            leading/trailing spaces are stripped.
    """
    text = text.lower()
    text = re.sub(r'[\n\t\r]', ' ', text)
    # Delete punctuation BEFORE collapsing spaces; otherwise "a , b" would
    # keep the two spaces that surrounded the removed comma.
    text = re.sub(r'[^\w ]', '', text)
    text = re.sub(r' +', ' ', text)

    return text.strip()

In [104]:
# Load the 20 Newsgroups corpus through scikit-learn and keep only its 'data'
# field (the items are treated as text strings by the normalisation cell).
data = fetch_20newsgroups()['data']

In [113]:
# Keep the first 1000 documents only and normalise each of them.
data = list(map(text_normalize, data[:1000]))

In [2]:
# Tiny two-sentence toy corpus (space-separated words, the second sentence
# reuses words of the first).
# NOTE(review): this rebinds `data` and therefore replaces whatever corpus was
# loaded before it; which corpus the later cells see depends on run order.
data = [
    'раз два три четыре пять шесть семь',
    'раз два пять восемь'
]

In [115]:
# Keras tokenizer with default settings; it is fitted on `data` in the next cell.
tokenizer = Tokenizer()

In [116]:
# Build the word -> integer index vocabulary from the corpus.
tokenizer.fit_on_texts(data)

In [118]:
# Register a 'NULL' pseudo-word at index 0; window_generator pads the borders
# of every text with this index.
tokenizer.word_index['NULL'] = 0
#tokenizer.word_index

In [119]:
# Map every vocabulary index to its own random 8-dimensional vector
# (drawn from a standard normal distribution).
embeddings = {}

for idx in tokenizer.word_index.values():
    if idx not in embeddings:
        embeddings[idx] = np.random.normal(size=(8,))

In [120]:
# Stack the per-index vectors into one matrix whose row i is the vector of
# index i. This lookup requires the keys of `embeddings` to be exactly
# 0 .. len(embeddings) - 1, otherwise it raises KeyError.
pre_embedding = np.array([embeddings[i] for i in range(len(embeddings))])

In [198]:
def window_generator(text, le, ri):
    """
    Stream (context, center) pairs from one tokenized text.

    text: sequence of <int> word indices (list, tuple or 1-D array)
    le: <int>: left window (number of context words before the center)
    ri: <int>: right window (number of context words after the center)

    yields: (context, center) where `context` is a tuple of `le + ri` word
            indices and `center` is the index of the word they surround.
            One pair is produced per word of `text`; an empty text yields
            nothing.
    """

    # `NULL` SIDE-PADDING
    # word_index['NULL'] = 0, so border words still get a full-width context.
    # list(text) guarantees a real concatenation for every sequence type
    # (list + tuple raises TypeError, list + ndarray adds elementwise).

    padded = [0] * le + list(text) + [0] * ri

    # STREAMING BATCHES
    # AS (context_word_idxs, center_word_idx) PAIRS

    for i in range(le, len(padded) - ri):
        yield (tuple(padded[i-le:i] + padded[i+1:i+1+ri]), padded[i])
        

def sample_negative(true_context, hi):
    """
    Draw a random "negative" context that shares no index with the true one.

    true_context: sequence of <int> word indices forming the real context
    hi: <int>: exclusive upper bound; samples are drawn uniformly from [0, hi)

    return: list of len(true_context) ints, none of which occurs in
            `true_context` (repeats among the samples are possible)

    raises: ValueError when at least one sample is needed but every index in
            [0, hi) already occurs in `true_context`.
    """
    forbidden = set(true_context)

    # Without this guard the rejection loop below would never terminate
    # (or np.random.randint would fail for hi <= 0).
    n_allowed = hi - sum(1 for idx in forbidden if 0 <= idx < hi)
    if len(true_context) > 0 and n_allowed <= 0:
        raise ValueError(
            'cannot sample a negative context: every index in [0, %d) '
            'is part of the true context' % hi
        )

    sampled_context = list()

    # Rejection sampling: redraw until the candidate is not a true context word.
    while len(sampled_context) < len(true_context):
        sample = np.random.randint(0, hi)

        if sample not in forbidden:
            sampled_context.append(sample)

    return sampled_context
        
def build_data(data,
               tokenizer,
               window_size=(1, 1),
               neg_sampling_prob=1,
               n_neg_samples=1
              ):
    """
    Turn raw texts into (context, target, label) training arrays.

    data: iterable of <str> texts
    tokenizer: fitted tokenizer providing `texts_to_sequences_generator`
               and `word_index`
    window_size: (left, right) number of context words around each target
    neg_sampling_prob: probability that a positive pair is accompanied by
                       negative pairs
    n_neg_samples: number of negative pairs added per selected positive pair

    return: contexts: np.array int32: [n_pairs x sum(window_size)]
            targets: np.array int32: [n_pairs,]
            labels: np.array int32: [n_pairs,] (1 = true context,
                    0 = randomly sampled context)
    """

    contexts = list()
    targets = list()
    labels = list()

    # Exclusive upper bound for negative samples.
    # NOTE(review): this covers every index only if word_index holds the extra
    # index-0 entry ('NULL') in addition to the tokenizer's own words.
    vocab_size = len(tokenizer.word_index)

    for text in tokenizer.texts_to_sequences_generator(data):
        for context, target in window_generator(text, *window_size):
            contexts.append(context)
            targets.append(target)
            labels.append(1)

            if np.random.random() < neg_sampling_prob:
                # Same target, corrupted context -> label 0.
                for _ in range(n_neg_samples):
                    contexts.append(sample_negative(context, vocab_size))
                    targets.append(target)
                    labels.append(0)

    # Explicit reshape keeps `contexts` 2-D even when no pair was produced.
    contexts = np.array(contexts, dtype=np.int32).reshape(
        len(labels), sum(window_size)
    )
    targets = np.array(targets, dtype=np.int32)
    labels = np.array(labels, dtype=np.int32)

    return contexts, targets, labels
    
        
def build_data_generator(data, tokenizer, embeddings, window_size=(1, 1)):
    """
    Lazily stream (context, center) pairs for every text in `data`.

    data: iterable of <str> texts
    tokenizer: fitted tokenizer providing `texts_to_sequences_generator`
    embeddings: accepted but not used by this function
    window_size: (left, right) number of context words around each center
    """
    for sequence in tokenizer.texts_to_sequences_generator(data):
        for pair in window_generator(sequence, *window_size):
            yield pair

In [144]:
# Rows of different length cannot form a rectangular array; dtype=object must
# be requested explicitly (recent NumPy raises ValueError otherwise) and the
# result is a 1-D array of Python lists.
np.array([[1, 2], [3]], dtype=object)

array([list([1, 2]), list([3])], dtype=object)

In [8]:
# Show the integer index sequence the tokenizer produces for each text.
tokenizer.texts_to_sequences(data)

[[1, 2, 4, 5, 3, 6, 7], [1, 2, 3, 8]]

In [200]:
# Build the training arrays from the first two texts only, using one context
# word on each side of the target.
CONTEXTS_DATA, TARGETS_DATA, LABELS_DATA = build_data(data[:2], tokenizer, window_size=(1, 1))

In [208]:
# Peek at the first two rows: a true pair (label 1) followed by its negative
# sample (label 0) for the same target.
CONTEXTS_DATA[:2], TARGETS_DATA[:2], LABELS_DATA[:2]

(array([[   0, 8809],
        [3615, 9688]], dtype=int32),
 array([13, 13], dtype=int32),
 array([1, 0], dtype=int32))

In [129]:
#for x in build_data_generator(data, tokenizer, embeddings, window_size=(1, 1)): print(x)

In [164]:
# Number of entries in the tokenizer's word_index (includes any entry added
# by hand, such as 'NULL'); used as the row count of both embedding tables.
VOCAB_SIZE = len(tokenizer.word_index)
# Width of the frozen embedding vectors; must match pre_embedding's columns.
PRE_EMBED_SIZE = 8
# Width of the trainable embedding vectors.
TR_EMBED_SIZE = 8
# NOTE(review): not referenced by the Keras cells visible in this notebook.
BATCH_SIZE = 1
# Number of context indices per sample; must equal sum(window_size) passed to
# build_data.
WINDOW_SIZE = 2
# Units of each LSTM direction in the encoder.
RNN_STATE_SIZE = 16

In [189]:
# Keras graph: a bidirectional LSTM encodes the context words (each context
# embedding concatenated with the embedding of the second input), and the
# encoding is merged with that embedding again before a softmax layer.

# Context word indices: [batch_size x WINDOW_SIZE]
input_context = Input(
    shape=(WINDOW_SIZE,),
    dtype='int32',
    name='INPUT_CONTEXTS'
)

# A single index per sample: [batch_size x 1]. It is looked up in the
# trainable embedding table below.
input_labels = Input(
    shape=(1,),
    dtype='int32',
    name='INPUT_LABELS'
)

# FROZEN EMBEDDINGS OF THE CONTEXT WORDS
# Initialised from `pre_embedding` and excluded from training.
# SHAPE: [batch_size x WINDOW_SIZE x PRE_EMBED_SIZE]

pre_embed = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=PRE_EMBED_SIZE,
    weights=[pre_embedding],
    trainable=False,
    name='PRE_EMBEDDINGS'
)(input_context)

# TRAINABLE EMBEDDINGS OF THE SECOND INPUT
# SHAPE: [batch_size x 1 x TR_EMBED_SIZE]

tr_embed = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=TR_EMBED_SIZE,
    name='TRAINABLE_EMBEDDINGS'
)(input_labels)

# COLLAPSE TRAINABLE EMBEDDINGS TO SHAPE: [batch_size x TR_EMBED_SIZE]

tr_embed = Reshape(
    target_shape=(TR_EMBED_SIZE,),
    name='COLLAPSED_TRAINABLE_EMBEDDINGS'
)(tr_embed)

# CONCAT TRAINABLE AND FROZEN EMBEDDINGS WITHIN ENCODER
# The trainable vector is repeated WINDOW_SIZE times so that it can be
# concatenated with every context position.

tr_embed_to_encoder = Lambda(
    lambda x: K.repeat(x, WINDOW_SIZE),
    name='TRAINABLE_EMBEDDINGS_LAMBDA'
)(tr_embed) 

encoder_input = Concatenate(name='ENCODER_INPUT')([pre_embed, tr_embed_to_encoder])

# ENCODING CONTEXT

encoder = Bidirectional(
    layer=LSTM(RNN_STATE_SIZE),
    name='ENCODER'
)(encoder_input)

# MERGING ENCODED CONTEXT WITH TARGET EMBEDDINGS
# SHAPE: [batch_size x RNN_STATE_SIZE * 2 + TR_EMBED_SIZE]

comparator = Concatenate(name='COMPARATOR')([encoder, tr_embed])

# OUTPUT PREDICTIONS
# NOTE(review): the output is a VOCAB_SIZE-way softmax, while build_data
# produces binary 0/1 labels - confirm which target this head is meant for.

dense = Dense(
    units=VOCAB_SIZE,
    activation='softmax',
    name='DENSE_OUT'
)(comparator)

In [190]:
# Two-input model: context indices plus the single index fed to the trainable
# embedding; the output is the softmax layer.
model = Model(inputs=[input_context, input_labels], outputs=dense)

In [191]:
# Integer class ids are used as targets (sparse categorical cross-entropy).
# NOTE(review): with the 0/1 labels from build_data only classes 0 and 1 of
# the softmax are ever targeted - confirm a binary loss is not intended.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [175]:
# Inspect the symbolic shape of the frozen-embedding tensor.
pre_embed.shape

TensorShape([Dimension(None), Dimension(2), Dimension(8)])

In [163]:
# The second model input feeds the TRAINABLE_EMBEDDINGS lookup, so it must
# carry target word indices (TARGETS_DATA). The 0/1 LABELS_DATA is only the
# training target `y`; passing it as an input as well would leak `y` into `x`.
model.fit(
    x=[CONTEXTS_DATA[:100], TARGETS_DATA[:100]],
    y=LABELS_DATA[:100],
    epochs=10,
    batch_size=8,
    verbose=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f30cf116eb8>

In [160]:
# Predict for the first five pairs; the second input is the target word index
# (TARGETS_DATA), not the 0/1 label.
r = model.predict([CONTEXTS_DATA[:5], TARGETS_DATA[:5]])

In [161]:
# Display the predicted distributions (one row per sample).
r

array([[3.2178265e-05, 3.1872823e-05, 3.1878761e-05, ..., 3.2254604e-05,
        3.2435961e-05, 3.2233638e-05],
       [3.1995096e-05, 3.1848977e-05, 3.2101099e-05, ..., 3.2322718e-05,
        3.2207776e-05, 3.2370219e-05],
       [3.2093802e-05, 3.2129152e-05, 3.1940173e-05, ..., 3.2114731e-05,
        3.2211134e-05, 3.1697680e-05],
       [3.1949268e-05, 3.1947879e-05, 3.1997937e-05, ..., 3.2049498e-05,
        3.2346343e-05, 3.2284624e-05],
       [3.2119133e-05, 3.2424490e-05, 3.1950032e-05, ..., 3.1951193e-05,
        3.2050219e-05, 3.1822066e-05]], dtype=float32)

In [192]:
# Print the layer-by-layer summary (output shapes, parameter counts, wiring).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
INPUT_LABELS (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
TRAINABLE_EMBEDDINGS (Embedding (None, 1, 8)         249600      INPUT_LABELS[0][0]               
__________________________________________________________________________________________________
INPUT_CONTEXTS (InputLayer)     (None, 2)            0                                            
__________________________________________________________________________________________________
COLLAPSED_TRAINABLE_EMBEDDINGS  (None, 8)            0           TRAINABLE_EMBEDDINGS[0][0]       
__________________________________________________________________________________________________
PRE_EMBEDD

In [101]:
# Start from an empty default graph so the graph-building cell can be re-run.
tf.reset_default_graph()
# `tf_sess` is only created further down; on a fresh kernel it does not exist
# yet, so close it only when a previous run left one behind.
if 'tf_sess' in globals():
    tf_sess.close()

In [102]:
# Low-level TensorFlow version of the graph: a bidirectional LSTM over the
# frozen context embeddings, concatenated with a trainable embedding of the
# label index, followed by a linear layer and a softmax cross-entropy loss.

# One row per embedded index, so both embedding tables cover the same ids.
tf_vocab_size = pre_embedding.shape[0]
tf_fitting_embed_size = 8
tf_batch_size = 1
# window_left + window_right
tf_window_size = 2
tf_rnn_state_size = 16
# NOTE(review): not referenced anywhere in this cell.
tf_embedding_dense_size = 32
tf_pre_embedding = tf.constant(pre_embedding, dtype=tf.float32)

tf_input_context = tf.placeholder(dtype=tf.int32, shape=(tf_batch_size, tf_window_size))
tf_input_labels = tf.placeholder(dtype=tf.int32, shape=(tf_batch_size,))

tf_fitting_embedding = tf.Variable(tf.truncated_normal(shape=(tf_vocab_size, tf_fitting_embed_size), stddev=0.1))
tf_W_out = tf.Variable(tf.truncated_normal(shape=(tf_rnn_state_size * 2 + tf_fitting_embed_size, tf_vocab_size), stddev=0.1))
tf_b_out = tf.Variable(tf.zeros(shape=(tf_vocab_size,)))


# EMBEDDING TENSOR [batch_size x window_size x pre_embed_size]

with tf.device("/cpu:0"):
    tf_context_pre_embed = tf.nn.embedding_lookup(tf_pre_embedding, tf_input_context)

# Cell sizes follow tf_rnn_state_size so they stay consistent with tf_W_out.
tf_rnn_cell_fw = tf.nn.rnn_cell.LSTMCell(tf_rnn_state_size)
tf_rnn_cell_bw = tf.nn.rnn_cell.LSTMCell(tf_rnn_state_size)
# Zero state of the forward cell (the former `tf_rnn_cell` name is not defined
# anywhere). It is not passed to the RNN call below.
tf_rnn_state = tf_rnn_cell_fw.zero_state(tf_batch_size, dtype=tf.float32)

tf_rnn_outputs, tf_rnn_states = tf.nn.bidirectional_dynamic_rnn(
    dtype=tf.float32,
    cell_fw=tf_rnn_cell_fw,
    cell_bw=tf_rnn_cell_bw,
    inputs=tf_context_pre_embed
)

with tf.device("/cpu:0"):
    tf_context_fitting_embed = tf.nn.embedding_lookup(tf_fitting_embedding, tf_input_labels)

# CONCAT FW AND BW FINAL STATES AND FITTING EMBED OF TARGETS [batch_size x HIDDEN_STATE_SIZE * 2 + FITTING_EMBED_SIZE]
# NOTE(review): the `.c` (cell state) members are used here, not `.h` -
# confirm this is intended.

tf_final_state = tf.concat(
    [tf_rnn_states[0].c, tf_rnn_states[1].c, tf_context_fitting_embed],
    axis=1
)

tf_logits = tf.matmul(tf_final_state, tf_W_out) + tf_b_out

# Per-example loss (not reduced).
# NOTE(review): tf_input_labels is both embedded into the features above and
# used as the loss label here - confirm this is intended.
tf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf_input_labels,
    logits=tf_logits
)

In [103]:
# Open a session and initialise every tf.Variable of the graph.
tf_init_op = tf.global_variables_initializer()
tf_sess = tf.Session()
tf_sess.run(tf_init_op)

In [104]:
# Evaluate the concatenated final state for one sample:
# context indices [0, 2] and label index 1.
r = tf_sess.run(
        tf_final_state,
        {tf_input_context: np.array([[0, 2]]), tf_input_labels: np.array([1])}
)

In [105]:
# Shape check: batch of 1, two 16-unit LSTM states plus the 8-wide embedding.
r.shape

(1, 40)

In [40]:
# `r` is the ndarray fetched from tf_final_state and has no `.c` attribute.
# Its first tf_rnn_state_size columns are the forward LSTM cell state.
r[:, :tf_rnn_state_size]

array([[ 0.05075238, -0.2744412 , -0.23570031, -0.01703531, -0.00418064,
         0.34944135,  0.30708185, -0.15840204,  0.632811  , -0.20224315,
         0.20672935, -0.3125187 ,  0.16155767,  0.38695666, -0.1943923 ,
         0.20426655]], dtype=float32)

In [47]:
# Number of distinct indices that received an embedding vector.
len(embeddings)

9

In [72]:
# Sanity check: concatenating two [1 x 1] constants along axis 1 gives a
# [1 x 2] tensor.
tf.concat(
    [tf.constant(np.array([[1]])), tf.constant(np.array([[2]]))], 1
).eval(session=tf_sess)

array([[1, 2]])