# 1. READ DATA

In [1]:
import codecs
import tensorflow as tf
import numpy as np 

In [2]:
''' 
    1. Read from 'movie-lines.txt'
    2. Create a dictionary with ( key = line_id, value = text )
'''
def get_id2line():
    with codecs.open('./updated_movie_lines.txt', 
                     'r', encoding='utf-8', errors='ignore') as fdata:
        lines= fdata.read().split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
        else:
            id2line[_line[0]] = ' '
    return id2line

In [3]:
'''
    1. Read from 'movie_titles_metadata.txt'
    2. Create a dictionary with ( key = movie_id, value = [genre1, genre2, ..])
'''
def get_id2genre():
    with codecs.open('./corpus/movie_titles_metadata.txt', 
                     'r', encoding='utf-8', errors='ignore') as fdata:
        lines= fdata.read().split('\n')
        
    id2genre = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 6:
            id2genre[_line[0]] = _line[-1][1:-1].replace("'","")
    return id2genre        


In [4]:
'''
    1. Read from 'movie_conversations.txt'
    2. Create a list of [list of line_id's]
    3. Create a list of corresponding movie_id
'''
def get_conversations_with_movie_id():
    conv_lines = open('./corpus/movie_conversations.txt').read().split('\n')
    convs = [ ]
    movie_id = []
    for line in conv_lines[:-1]:
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("\\'", "'").replace("'","").replace(" ","")
        convs.append(_line.split(','))
        _line = line.split(' +++$+++ ')[2]
        movie_id.append(_line)
    return convs, movie_id


In [5]:
'''
    MODIFIED!

    A1: blah
    B1: blah
    A2: blah
    B2: blah 

    are two pairs, but really two data samples for encoder and decoder 
    (Context: A1, response: B1), (Context: A1, B1, A2, response: B2). 
'''
def gather_dataset_with_genres(convs, id2line, movie_id, id2genre, stride=2):
    contexts = []
    responses = []
    genres = []
    for conv, mid in zip(convs, movie_id):
        # in each conversation
        # 1, 2, ... i-1 lines are the context
        # ith utterance will be the response
        i = 1
        while i < len(conv):
            context = ''
            for j in range(i):
                context += ' ' + id2line[conv[j]]
            contexts.append(context)    
            responses.append(id2line[conv[i]])
            genres.append(id2genre[mid])    
            i += stride
        
    return contexts, responses, genres

In [6]:
def read_cornell():
    convs, movie_id = get_conversations_with_movie_id()
    id2line = get_id2line()
    id2genre = get_id2genre()
    c, r, g = gather_dataset_with_genres(convs, id2line, movie_id, id2genre)
    
    return c, r, g


In [7]:
contexts, responses, genres = read_cornell()

In [8]:
def write2file(fname, data):
    with open(fname, 'w') as f:
        for d in data:
            f.write(d + '\n')

write2file('context.txt', contexts)            
write2file('response.txt', responses)
write2file('genre.txt', genres)

# 2. TOKENIZATION

simply separating by space, and lowering the string

In [9]:
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    global line_counter
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data(data_fname):
    """
    Decoder data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    examples = list(open(data_fname, "r").readlines())
    
    examples = [s.strip() for s in examples]
    
    # Split by words
    tokenized_text = [clean_str(sent) for sent in examples]
    
    return tokenized_text

In [10]:
tokenized_context = load_data('context.txt')

In [11]:

import matplotlib.pyplot as plt

%matplotlib inline

In [12]:
MIN_LINE_LEN = 3
MAX_LINE_LEN = 50

lens = [len(line.split()) for line in tokenized_context]
normlen_context = []
normlen_genres = []
lens = [len(line.split()) for line in tokenized_context]
for i in range(len(tokenized_context)):
    if lens[i] > MIN_LINE_LEN and lens[i] < MAX_LINE_LEN:
        normlen_context.append(tokenized_context[i])
        normlen_genres.append(genres[i])

In [13]:
len(normlen_context), len(tokenized_context)

(98660, 138135)

# 3. VOCABULARIZE

In [14]:

learn = tf.contrib.learn
def vocabularize(text):
    max_document_length = max([len(x.split(" ")) for x in text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, min_frequency=3)
    x = np.array(list(vocab_processor.fit_transform(text)))
    
    return x, vocab_processor

In [15]:
cont_id, cont_vocab = vocabularize(normlen_context)
genr_id, genr_vocab = vocabularize(normlen_genres)

# 4. CREATE GENRE LABELS

In [16]:
genr_labels = np.zeros((len(genr_id), genr_id.max()+1), dtype=bool)
for i, gid in enumerate(genr_id):
    # first row is UNK token
    genr_labels[i, gid] = True

genr_labels = genr_labels[:, 1:]

In [18]:
genr_id[0], genr_labels[0]

(array([3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([False, False,  True,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False], dtype=bool))

# 5. SAVE DATA

In [19]:
cont_vocab.save('context.vocab')
np.save('context', cont_id)
np.save('genres', genr_labels)

In [53]:
cont_id = np.load('context.npy')
genr_labels = np.load('genres.npy')

In [67]:
"""
    Origin:
    https://github.com/dennybritz/tf-rnn/blob/master/sequence_example.ipynb
"""

def make_example(sequence, label):
    # The object we return
    ex = tf.train.SequenceExample()
    # A non-sequential feature of our example
    sequence_length = len(sequence)
    ex.context.feature['length'].int64_list.value.append(sequence_length)
    ex.context.feature['labels'].int64_list.value.extend(label)
    # This part of TF is not so verbose
    # and tutorials are rare, also serialized labels were serialized with different length

    # Reshaped a bit WildML-s tips and tricks
    # http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/
    '''
    ex.feature_lists\
        .feature_list['tokens']\
        .feature.add()\
        .int64_list.value.extend(sequence)
    '''
    fl_tokens = ex.feature_lists.feature_list["tokens"]
    for token in sequence:
        fl_tokens.feature.add().int64_list.value.append(token)
    
    return ex

def write_TFRecord(fname, sequences, labels):
    with open(fname + '.TFRecord', 'w') as fp:
        writer = tf.python_io.TFRecordWriter(fp.name)
        print('Sampling...')
        i = 0
        for sequence, label in zip(sequences, labels):
            
            ex = make_example(sequence, label)
            writer.write(ex.SerializeToString())
            
            if i%500 == 0: print('\r%d'%i, end='')
            i+=1
        writer.close()
        print("\nWrote to {}".format(fp.name))        

In [32]:
"""
    Origin:
    https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/
"""

def make_example(sequence, label):
    # The object we return
    
    ex = tf.train.Example(features=tf.train.Features(
    feature={
        'labels':tf.train.Feature(
            int64_list=tf.train.Int64List(value=label)),
        'tokens':tf.train.Feature(
            int64_list=tf.train.Int64List(value=sequence))
    }))
    return ex

def write_TFRecord(fname, sequences, labels):
    with open(fname + '.TFRecord', 'w') as fp:
        writer = tf.python_io.TFRecordWriter(fp.name)
        print('Sampling...')
        i = 0
        for sequence, label in zip(sequences, labels):
            
            ex = make_example(sequence, label)
            writer.write(ex.SerializeToString())
            
            if i%500 == 0: print('\r%d'%i, end='')
            i+=1
        writer.close()
        print("\nWrote to {}".format(fp.name))        

In [55]:
cont_list = len(cont_id) * [None]
for i in range(len(cont_id)):
    cont_list[i] = np.trim_zeros(cont_id[i])
    if i%500 == 0: print('\r%d'%i, end='')
#cont_list

98500

In [70]:

write_TFRecord('cnn', cont_list, genr_labels)

Sampling...
98500
Wrote to cnn.TFRecord


In [30]:
ex = tf.train.Example(features=tf.train.Features(
    feature={
        'labels':tf.train.Feature(
            int64_list=tf.train.Int64List(value=[0, 1, 2, 1, 0])),
        'tokens':tf.train.Feature(
            int64_list=tf.train.Int64List(value=[1, 2, 3, 5,])),
    }))
ex.ListFields()

[(<google.protobuf.pyext._message.FieldDescriptor at 0x7f5b500d7a90>, feature {
    key: "labels"
    value {
      int64_list {
        value: 0
        value: 1
        value: 2
        value: 1
        value: 0
      }
    }
  }
  feature {
    key: "tokens"
    value {
      int64_list {
        value: 1
        value: 2
        value: 3
        value: 5
      }
    }
  })]

Understanding protobufs:

http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/

https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/

At some point serialization is required, but in the example I used it isn't, but after the example is made the whole example is searialized to string.
Very poorly documented, and unstable.
Reading out in automated batch is described in WildML however, when
[[1, 2, 3], [4, 5, 6, 7]] is fed as input it does not work as expected.

However when reading with queues (which should be **built** in graph before being **initialized**, **coordinated** to end the reading session gracefully), the `dequeue` operator cooperates well with the `tf.train.batch` function




In [129]:
ex = tf.train.SequenceExample()
print(ex.ListFields())
ex.context.feature['length'].int64_list.value.extend([1, 2, 3])
# THIS IS STILL SOME BLACK MAGIC THOUGH...
ex.context.feature['length'].int64_list.value
ex.feature_lists.feature_list['tokens'].feature.add().int64_list.value.extend([9, 9, 9, 9])
print(ex.ListFields())

[]
[(<google.protobuf.pyext._message.FieldDescriptor object at 0x7fd1f3f65790>, feature {
  key: "length"
  value {
    int64_list {
      value: 1
      value: 2
      value: 3
    }
  }
}
), (<google.protobuf.pyext._message.FieldDescriptor object at 0x7fd1f3f65190>, feature_list {
  key: "tokens"
  value {
    feature {
      int64_list {
        value: 9
        value: 9
        value: 9
        value: 9
      }
    }
  }
}
)]


In [114]:
np.trim_zeros(cont_id)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [115]:
len(genr_labels), len(cont_list)

(98660, 98660)

In [116]:
genr_labels

array([[False, False,  True, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       ..., 
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False]], dtype=bool)

In [31]:
write_TFRecord('cnn', cont_list, genr_labels)

Sampling...
0features {
  feature {
    key: "labels"
    value {
      int64_list {
        value: 0
        value: 0
        value: 1
        value: 1
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
      }
    }
  }
  feature {
    key: "tokens"
    value {
      int64_list {
        value: 53
        value: 17
        value: 112
        value: 18
        value: 915
        value: 0
        value: 0
        value: 14
        value: 3559
        value: 5763
        value: 21
        value: 366
        value: 87
        value: 3751
        value: 0
        value: 1055
        value: 451
        value: 51
        value: 29
        value: 3
        value: 0
        value: 164
      }
    }
  }
}

500

In [20]:
genr_labels.astype(float).dtype

dtype('float64')

In [33]:
genr_id.shape

(98660, 11)

# Reader

In [44]:
import codecs
import tensorflow as tf
import numpy as np 

In [75]:
def parse_example(filename_queue):
    # Define how to parse the example
    
    reader = tf.TFRecordReader()
    _, example = reader.read(filename_queue)
    
    context_features = {
        'length': tf.FixedLenFeature([1], dtype=tf.int64),
        'labels': tf.FixedLenFeature([24], dtype=tf.int64)
    }
    sequence_features = {
        "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64)
    }
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=example,
        context_features=context_features,
        sequence_features=sequence_features
    )
    return context_parsed, sequence_parsed
    

In [76]:
filename_queue = tf.train.string_input_producer(['cnn.TFRecord'])
cont, seq = parse_example(filename_queue)

with tf.Session() as sess:
    
    coord = tf.train.Coordinator()
    
    sess.run(tf.global_variables_initializer())
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    for i in range(99000):
        if i%10000==0:print(i)
        res = cont['labels'].eval()
        if len(res) != 24:
            print(res)
            break
        #print(res)
    coord.request_stop()
    coord.join(threads)
    sess.close()

0
INFO:tensorflow:Error reported to Coordinator: <class 'RuntimeError'>, Attempted to use a closed Session.


InvalidArgumentError: Name: , Key: labels, Index: 0.  Number of int64 values != expected.  values size: 24 but output shape: []
	 [[Node: ParseSingleSequenceExample_22/ParseSingleSequenceExample = ParseSingleSequenceExample[Ncontext_dense=2, Ncontext_sparse=0, Nfeature_list_dense=1, Nfeature_list_sparse=0, Tcontext_dense=[DT_INT64, DT_INT64], context_dense_shapes=[[], []], context_sparse_types=[], feature_list_dense_shapes=[[]], feature_list_dense_types=[DT_INT64], feature_list_sparse_types=[], _device="/job:localhost/replica:0/task:0/cpu:0"](ReaderRead_27:1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/feature_list_dense_missing_assumed_empty, ParseSingleSequenceExample_22/ParseSingleSequenceExample/context_dense_keys_0, ParseSingleSequenceExample_22/ParseSingleSequenceExample/context_dense_keys_1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/feature_list_dense_keys_0, ParseSingleSequenceExample_22/Const, ParseSingleSequenceExample_22/Const_1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/debug_name)]]

Caused by op 'ParseSingleSequenceExample_22/ParseSingleSequenceExample', defined at:
  File "/usr/lib64/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib64/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/lib64/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/lib64/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/lib64/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/usr/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-76-c7aaac5d9a13>", line 2, in <module>
    cont, seq = parse_example(filename_queue)
  File "<ipython-input-75-e1623355f07f>", line 17, in parse_example
    sequence_features=sequence_features
  File "/usr/lib/python3.5/site-packages/tensorflow/python/ops/parsing_ops.py", line 636, in parse_single_sequence_example
    feature_list_dense_defaults, example_name, name)
  File "/usr/lib/python3.5/site-packages/tensorflow/python/ops/parsing_ops.py", line 833, in _parse_single_sequence_example_raw
    name=name)
  File "/usr/lib/python3.5/site-packages/tensorflow/python/ops/gen_parsing_ops.py", line 287, in _parse_single_sequence_example
    name=name)
  File "/usr/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/usr/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Name: , Key: labels, Index: 0.  Number of int64 values != expected.  values size: 24 but output shape: []
	 [[Node: ParseSingleSequenceExample_22/ParseSingleSequenceExample = ParseSingleSequenceExample[Ncontext_dense=2, Ncontext_sparse=0, Nfeature_list_dense=1, Nfeature_list_sparse=0, Tcontext_dense=[DT_INT64, DT_INT64], context_dense_shapes=[[], []], context_sparse_types=[], feature_list_dense_shapes=[[]], feature_list_dense_types=[DT_INT64], feature_list_sparse_types=[], _device="/job:localhost/replica:0/task:0/cpu:0"](ReaderRead_27:1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/feature_list_dense_missing_assumed_empty, ParseSingleSequenceExample_22/ParseSingleSequenceExample/context_dense_keys_0, ParseSingleSequenceExample_22/ParseSingleSequenceExample/context_dense_keys_1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/feature_list_dense_keys_0, ParseSingleSequenceExample_22/Const, ParseSingleSequenceExample_22/Const_1, ParseSingleSequenceExample_22/ParseSingleSequenceExample/debug_name)]]


In [53]:
def parse_example(filename_queue):
    # Define how to parse the example
    
    reader = tf.TFRecordReader()
    _, ex = reader.read(filename_queue)
    
    features = {
        'tokens': tf.FixedLenFeature([], dtype=tf.int64),
        'labels': tf.FixedLenFeature([24], dtype=tf.int64)
    }
    
    parsed = tf.parse_single_example(
        serialized=ex,
        features=features
    )
    
    tokens = parsed['tokens']
    labels = parsed['labels']
    
    return tokens, labels
    

In [34]:
filename_queue = tf.train.string_input_producer(['cnn.TFRecord'])
parsed = parse_example(filename_queue)
parsed

({'labels': <tf.Tensor 'ParseSingleSequenceExample_10/ParseSingleSequenceExample:0' shape=(24,) dtype=int64>,
  'length': <tf.Tensor 'ParseSingleSequenceExample_10/ParseSingleSequenceExample:1' shape=(1,) dtype=int64>},
 {'tokens': <tf.Tensor 'ParseSingleSequenceExample_10/ParseSingleSequenceExample:2' shape=(?, 1) dtype=int64>})

Reading a single SeqExample

In [35]:
seq

{'tokens': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7f7a40345a90>}

In [39]:
filename_queue = tf.train.string_input_producer(['cnn.TFRecord'])
cont, seq = parse_example(filename_queue)

with tf.Session() as sess:
    
    coord = tf.train.Coordinator()
    
    sess.run(tf.global_variables_initializer())
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    for i in range(99000):
        if i%10000==0:print(i)
        res = cont['labels'].eval()
        if len(res) != 24:
            print(res)
            break
        #print(res)
    coord.request_stop()
    coord.join(threads)
    sess.close()

ValueError: Dimension -1 must be >= 0

This is driving me crazy

```
Name: , Key: tokens, Index: 0.  Number of int64 values != expected.  values size: 22 but output shape: []
```

however triple checked, that every single line in genre_labels are 24 length
still serialized to 22...

How to read batched SeqExamples

In [6]:
filename_queue = tf.train.string_input_producer(['cnn.TFRecord'])
sequence_parsed = parse_example(filename_queue)
batched_data = tf.train.batch(
        tensors=sequence_parsed,
        batch_size=2,
        dynamic_pad=True
    )

x = batched_data['tokens']
y = batched_data['labels']
res = []
with tf.Session() as sess:
    
    coord = tf.train.Coordinator()
    sess.run(tf.global_variables_initializer())
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            #pass
            res.append(x.eval().shape[1])
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()
    #res = sess.run(sequence_parsed['tokens'])
    #res = sess.run(sequence_parsed['tokens'])
    #res = batched_data.eval()
    #res1 = tf.contrib.learn.run_n({'y':batched_data})
    #res2 = tf.contrib.learn.run_n({'y':batched_data}, n=6)
    #print(res[0]['y'])
    coord.join(threads)
    sess.close()

SystemError: <built-in function TF_NewBuffer> returned a result with an error set

In [8]:
len(res)

14644

In [53]:
for run in res2:
    for line in run['y']:
        print(len(line))

22
22
13
13
10
10
7
7
10
10
3
3


In [46]:
print(res)
print(res1)
print(res2)

[  52   13  120   16  798    0    0   19 2999    0   18  349   95 3811    0
 1519  455   56   30    3    0  159]
[{'y': array([[  52,   13,  120,   16,  798,    0,    0,   19, 2999,    0,   18,
         349,   95, 3811,    0, 1519,  455,   56,   30,    3,    0,  159]])}]
[{'y': array([[  52,   13,  120,   16,  798,    0,    0,   19, 2999,    0,   18,
         349,   95, 3811,    0, 1519,  455,   56,   30,    3,    0,  159]])}]


In [32]:
res2

[{'y': array([[  52,   13,  120,   16,  798,    0,    0,   19, 2999,    0,   18,
           349,   95, 3811,    0, 1519,  455,   56,   30,    3,    0,  159]])}]

In [62]:
type(cont_id[:10].tolist()[0][0])

int

In [8]:
filename = "cnn.TFRecord"
for serialized_example in tf.python_io.tf_record_iterator(filename):
    example = serialized_example
    
    context_features = {
        "length" : tf.FixedLenFeature([1], dtype=tf.int64),
    }
    sequence_features = {
        "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64)
    }
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=example,
        context_features=context_features,
        sequence_features=sequence_features
    )
    
    # traverse the Example format to get data
    print(context_parsed, sequence_parsed)
    break

{'length': <tf.Tensor 'ParseSingleSequenceExample_2982/ParseSingleSequenceExample:0' shape=(1,) dtype=int64>} {'tokens': <tf.Tensor 'ParseSingleSequenceExample_2982/ParseSingleSequenceExample:2' shape=(?,) dtype=int64>, 'labels': <tf.Tensor 'ParseSingleSequenceExample_2982/ParseSingleSequenceExample:1' shape=(?,) dtype=int64>}


Sampling...
20000
40000
60000
80000
100000
120000
Wrote to encoder-decoder.TFRecord


Writing ./stride1-qa-16384.tfrecords


In [25]:
for q, a, g in zip(questions[1000:1010], answers[1000:1010], genres[1000:1010]):
    print(q, '<<--- ', g, '--->>', a, end='\n\n\n')

Why should I carry your bag?  I am not a dog. <<---  ['action', 'crime', 'drama', 'thriller'] --->> For five years I paid for your stupidness - you'll carry my bag for the rest of my life if I say so.  Unless you refuse, Oleg.


Turn that off!  Get the bags. <<---  ['action', 'crime', 'drama', 'thriller'] --->> Why should I carry your bag?  I am not a dog.


What? <<---  ['action', 'crime', 'drama', 'thriller'] --->> Smell like chemicals...for smoking drugs.


Turn that fucking thing off! <<---  ['action', 'crime', 'drama', 'thriller'] --->> I'm not filming.  I'm watching Milos die.  It's just like a move but realer.


You said speak Czech! <<---  ['action', 'crime', 'drama', 'thriller'] --->> How you erase this?


Speak English! <<---  ['action', 'crime', 'drama', 'thriller'] --->> You said speak Czech!


How you erase this? <<---  ['action', 'crime', 'drama', 'thriller'] --->> I'll do it.  Don't hurt my camera!


Whore? <<---  ['action', 'crime', 'drama', 'thriller'] --->> I'm homesi