In [1]:
# boilerplate
import os

import numpy as np
import tensorflow as tf

sess = tf.InteractiveSession()
import tensorflow_fold as td

# Character level text analysis

In [2]:
def chr_enum(c):
    """Return a small integer code for the single character ``c``.

    The code is the offset of ``c`` from the space character. Offsets in
    1..94 (the characters '!' through '~') are returned as-is; every other
    character, including the space itself, is mapped to 0.
    """
    offset = ord(c) - ord(' ')
    # Anything outside the 1..94 window collapses onto the shared "other" code.
    if offset < 1 or offset > 94:
        return 0
    return offset

In [3]:
[chr_enum(c) for c in 'word']

[87, 79, 82, 68]

# Define Hierarchical LSTM NNet

## CharsLSTM word string -> word vector

In [4]:
# Sizes used at each level of the hierarchy.
# CHR_EMBED_SIZE: width of each character embedding.
# WORD_EMBED_SIZE: number of units in the character-level LSTM (= word vector size).
# SENTENCE_EMBED_SIZE: number of units in the word-level LSTM (= sentence vector size).
# The trailing numbers are presumably the sizes intended for a full-size run -- TODO confirm.
CHR_EMBED_SIZE = 8
WORD_EMBED_SIZE = 4  # 64
SENTENCE_EMBED_SIZE = 8  # 128

In [5]:
# Character-level LSTM cell from the standard TensorFlow implementation;
# its number of units is the word-vector size.
char_cell = td.ScopedLayer(
    tf.contrib.rnn.BasicLSTMCell(num_units=WORD_EMBED_SIZE),
    'char_cell'
)

# Per-character pipeline: int32 character code -> CHR_EMBED_SIZE-wide embedding.
# The table has 95 rows, one for each value chr_enum can return (0..94).
char_embedding = td.Scalar('int32') >> td.Embedding(95, CHR_EMBED_SIZE, mod_inputs=False)

# Word pipeline: string -> list of character codes -> embedded characters -> LSTM.
chars_lstm = (
    td.InputTransform(lambda word: [chr_enum(ch) for ch in word])
    >> td.Map(char_embedding)
    >> td.RNN(char_cell)
)

In [6]:
# The RNN block evaluates to a per-character sequence plus a final
# (last_cell, last_state) pair. In the outputs shown below, last_state is
# identical to the final entry of state_sequence.
state_sequence, (last_cell, last_state) = chars_lstm.eval('word')

In [7]:
state_sequence

[array([-0.01409699,  0.01019543, -0.0115759 ,  0.03520417], dtype=float32),
 array([-0.00829704,  0.01285476,  0.0118047 ,  0.01146564], dtype=float32),
 array([-0.01372081, -0.00901716,  0.00088371,  0.01629999], dtype=float32),
 array([ -1.19660972e-05,  -4.45432309e-03,  -1.42430533e-02,
          9.92753170e-03], dtype=float32)]

In [8]:
(last_cell, last_state)

(array([ -2.32513994e-05,  -8.63248948e-03,  -2.84103211e-02,
          1.92340650e-02], dtype=float32),
 array([ -1.19660972e-05,  -4.45432309e-03,  -1.42430533e-02,
          9.92753170e-03], dtype=float32))

In [3]:
def GetLastState():
    """Build a block that extracts the final state vector from an LSTM RNN result.

    The RNN blocks in this notebook evaluate to
    ``(state_sequence, (last_cell, last_state))``. The returned composition
    takes element 1 of that outer pair and then element 1 of the inner pair,
    i.e. ``last_state``.
    """
    take_final_pair = td.GetItem(1)
    take_state = td.GetItem(1)
    return take_final_pair >> take_state

In [10]:
# A word vector is the last state of the character-level LSTM after it has
# consumed every character of the word.
word_vector = chars_lstm >> GetLastState()

In [11]:
word_vector.eval('test-word')

array([ 0.00974098,  0.00174401, -0.00218897, -0.01141942], dtype=float32)

## WordsLSTM sequence of word vectors -> sentence vector

In [12]:
# Word-level LSTM cell from the standard TensorFlow implementation;
# its number of units is the sentence-vector size.
word_cell = td.ScopedLayer(
    tf.contrib.rnn.BasicLSTMCell(num_units=SENTENCE_EMBED_SIZE),
    'word_cell'
)

# Sentence pipeline: split the text on single spaces, turn each piece into a
# word vector with the character-level model, then run the word LSTM over them.
split_into_words = td.InputTransform(lambda text: text.split(' '))
words_lstm = split_into_words >> td.Map(word_vector) >> td.RNN(word_cell)

In [13]:
# A sentence vector is the last state of the word-level LSTM after it has
# consumed the word vectors of the whole text.
sentence_vector = words_lstm >> GetLastState()

In [14]:
sentence_vector.eval('try this one!')

array([-0.0004182 ,  0.00319472, -0.00135735,  0.00019724,  0.00037043,
        0.00182619, -0.00315728, -0.00205725], dtype=float32)

# Let's try this model for sentiment analysis on the IMDB Dataset

In [15]:
# Read-out layer: a fully connected layer with a single output and no
# activation, so each text yields one raw logit.
text_to_logits = sentence_vector >> td.FC(1, activation=None)

In [16]:
text_to_logits.eval('Another sentence.')

array([ 0.00207674], dtype=float32)

## Define cost and train op

In [17]:
# Block for the training target: a length-1 vector, matching the single logit
# produced by text_to_logits (the evaluation below shows [0] -> array([ 0.])).
target = td.Vector(1)

In [18]:
target.eval([0])

array([ 0.], dtype=float32)

In [19]:
# Compile the (model, target) pair of blocks into one TensorFlow graph; the
# compiler exposes one output tensor per block, unpacked here in that order.
compiler = td.Compiler.create((text_to_logits, target))
logits, labels = compiler.output_tensors

# Sigmoid cross-entropy between the targets and the raw logits, minimised
# with Adam using its default hyperparameters.
loss = tf.losses.sigmoid_cross_entropy(labels, logits)
optimizer = tf.train.AdamOptimizer()
train_op = optimizer.minimize(loss)

# Initialise all variables only after the optimizer has been built, so the
# variables it creates are covered too.
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Get some IMDB data to play with

In [20]:
from tffold_additions.imdb.dataloader import IMDBDataLoader

In [21]:
# Directory holding the IMDB training split. Set the IMDB_TRAIN_DIR
# environment variable to point at a local copy of the dataset; the original
# author's path is kept as the default so existing setups keep working.
IMDB_TRAIN_DIR = os.environ.get('IMDB_TRAIN_DIR', '/home/andershuss/Data/aclImdb/train/')
train_dl = IMDBDataLoader(IMDB_TRAIN_DIR)

In [22]:
# Iterator over batches of 16 items -- presumably one full pass over the
# training data, going by the method name; TODO confirm against IMDBDataLoader.
epoch = train_dl.get_epoch_iterator(batch_size=16)

In [23]:
# Pull the first batch off the iterator to inspect what the loader yields.
input_target_batch = next(epoch)

In [24]:
# Each batch element unpacks into a pair: the review text and its label.
input_, label = input_target_batch[0]

In [25]:
(input_, label)

('I am an avid fan of Lucio Fulci, and yet I must say that "Zombi 3" (aka. "Zombie Flesh Eaters 2") of 1988, which he made with two other directors, Bruno Mattei and Claudi Fragasso, was quite a disappointment. Especially compared to its great predecessor, Fulci\'s very own Gore classic "Zombi 2" (aka. "Zombie Felsh Eaters"/"Zombie") of 1979, this is vastly disappointing. Sure, the low rating of 4.5 already suggests that it\'s not a good film, but, these low ratings usually come from people who are not into Italian Zombie flicks, and as enthusiastic fan of Italian Horror films and low-budget Exploitation cinema, I love many films that have only been rewarded with much lower ratings. Also, many of my fellow Italian Horror buffs seem to think of this film as underrated, which I sadly cannot agree with. Not that the film was a complete disaster. It has some redeeming qualities, above all Fulci\'s nauseating gore effects, that are always a pleasure to watch for an Italian Horror/Gore buff.

In [26]:
text_to_logits.eval(input_)

array([-0.00593366], dtype=float32)

In [27]:
# Cross-check: feeding a one-element batch through the compiled graph gives
# the same logit as the direct text_to_logits.eval call on the same text.
sess.run(logits, compiler.build_feed_dict(input_target_batch[:1]))

array([[-0.00593366]], dtype=float32)

## Train Loop

In [29]:
n_epochs = 10
batch_size = 8
print_every_n_batch = 10

for e in range(1, n_epochs + 1):
    # A fresh batch iterator is requested at the start of every epoch.
    epoch = train_dl.get_epoch_iterator(batch_size=batch_size)
    batch_losses = []
    for b, batch in enumerate(epoch, 1):
        # One optimisation step; keep the loss that was computed for this batch.
        feed_dict = compiler.build_feed_dict(batch)
        _, batch_loss = sess.run([train_op, loss], feed_dict)
        batch_losses.append(batch_loss)
        if b % print_every_n_batch:
            continue
        # Report the mean loss over the most recent print_every_n_batch batches.
        recent_losses = batch_losses[-print_every_n_batch:]
        print(
            'epoch: {}, batch: {}, loss: {}'.format(
                e, b, sum(recent_losses) / print_every_n_batch
            )
        )


epoch: 1, batch: 10, loss: 0.692231541872
epoch: 1, batch: 20, loss: 0.694338995218
epoch: 1, batch: 30, loss: 0.693742513657


KeyboardInterrupt: 

# Size it up and test it...

In [6]:
#...See IMBDHierachicalLSTMBigger notebook