In [1]:
# get the questions and answers data and store it in array
import sqlite3
import pandas as pd

timeframes = ['2015-01']

for timeframe in timeframes:
    connection = sqlite3.connect('{}.db'.format(timeframe))
    c = connection.cursor()
    limit = 5000
    last_unix = 0
    cur_length = limit
    counter = 0
    questions = []
    answers = []
    
    while cur_length == limit:
        df = pd.read_sql("SELECT * FROM parent_reply WHERE unix > {} AND parent NOT NULL AND score > 0 ORDER BY unix ASC LIMIT {}".format(last_unix, limit), connection)
        last_unix = df.tail(1)['unix'].values[0]
        cur_length = len(df)
        
        for content in df['parent'].values:
            questions.append(content)
            
        for content in df['comment'].values:
            answers.append(content)
            
        counter += 1
        if counter % 2 == 0:
            print(counter * limit, 'rows completed so far')
            
    print('all rows completed')

all rows completed


In [2]:
print(len(questions), len(answers))

4182 4182


In [3]:
vocab = {}

for question in questions:
    words = question.split()
    for word in words:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
for answer in answers:
    words = question.split()
    for word in words:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

In [4]:
count = 0
for key, value in vocab.items():
    if value >= 2:
        count += 1
        
print(len(vocab), count)

10864 3832


In [5]:
questions_vocab_to_int, answers_vocab_to_int = {}, {}
word_num = 0
threshold = 2

for word, count in vocab.items():
    if count >= threshold:
        questions_vocab_to_int[word] = word_num
        word_num += 1
    
word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        answers_vocab_to_int[word] = word_num
        word_num += 1

In [6]:
# Add the unique tokens to the vocabulary dictionaries.
codes = ['<PAD>', '<EOS>', '<UNK>', '<GO>']

for code in codes:
    questions_vocab_to_int[code] = len(questions_vocab_to_int) + 1
    
for code in codes:
    answers_vocab_to_int[code] = len(answers_vocab_to_int) + 1

In [7]:
# create dictionaries to map the unique integers to their respective words.
questions_int_to_vocab = {v_i: v for v, v_i in questions_vocab_to_int.items()}
answers_int_to_vocab = {v_i: v for v, v_i in answers_vocab_to_int.items()}

In [8]:
# add the end of sequence token to the end of every answer.
for i in range(len(answers)):
    answers[i] += '<EOS>'

In [9]:
# convert the text to integers
# Replace any words that are not in the respective vocabulary with unknown
questions_int = []

for question in questions:
    ints = []
    for word in question.split():
        if word not in questions_vocab_to_int:
            ints.append(questions_vocab_to_int['<UNK>'])
        else:
            ints.append(questions_vocab_to_int[word])
    questions_int.append(ints)

In [10]:
answers_int = []
for answer in answers:
    ints = []
    for word in answer.split():
        if word not in answers_vocab_to_int:
            ints.append(answers_vocab_to_int['<UNK>'])
        else:
            ints.append(answers_vocab_to_int[word])
    answers_int.append(ints)

In [11]:
print(len(questions_int), len(answers_int))

4182 4182


In [12]:
# calculating the percentage of words replaced by <UNK>
word_count = 0
unk_count = 0

for question in questions_int:
    for word in question:
        if word == questions_vocab_to_int['<UNK>']:
            unk_count += 1
        word_count += 1
        
for answer in answers_int:
    for word in answer:
        if word == answers_vocab_to_int['<UNK>']:
            unk_count += 1
        word_count += 1
        
unk_ratio = round(unk_count/word_count, 4) * 100

In [13]:
print('Total number of words:', word_count)
print('Number of times <UNK> is used:', unk_count)
print('Percent of words that are <UNK>: {}%'.format(round(unk_ratio, 3)))

Total number of words: 126572
Number of times <UNK> is used: 17613
Percent of words that are <UNK>: 13.92%


In [14]:
# get the max word in questions or answers

max_count = 0

for question in questions:
    if len(question.split()) > max_count:
        max_count = len(question.split())
        
for answer in answers:
    if len(answer.split()) > max_count:
        max_count = len(answer.split())
        
print(max_count)
    

50


In [15]:
# sort questions and answers by the length of questions to reduce amount of padding during training

sorted_questions = []
sorted_answers = []

for length in range(1, max_count + 1):
    for i in enumerate(questions_int):
        if len(i[1]) == length:
            sorted_questions.append(questions_int[i[0]])
            sorted_answers.append(answers_int[i[0]])

In [16]:
for i in range(5):
    print(sorted_questions[i])
    print(sorted_answers[i])
    print()

[49]
[3835]

[361]
[3835]

[3835]
[3835]

[566]
[366, 33, 3835, 970, 4, 3835]

[3835]
[3835]



In [17]:
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.int32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return input_data, targets, lr, keep_prob

In [18]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    
    return dec_input

In [19]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    enc_cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    _, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = enc_cell, cell_bw = enc_cell, sequence_length = sequence_length, inputs = rnn_inputs, dtype = tf.float32)
    
    return enc_state

In [20]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope, output_fn, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, dec_cell.output_size])
    
    att_keys, att_vals, att_score_fn, att_construct_fn = tf.contrib.seq2seq.prepare_attention(attention_states,
                                                                                             attention_option = 'bahdanau',
                                                                                             num_units = dec_cell.output_size)
    
    train_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                    att_keys,
                                                                    att_vals,
                                                                    att_score_fn,
                                                                    att_construct_fn,
                                                                    name = 'attn_dec_train')
    
    train_pred, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell,
                                                             train_decoder_fn,
                                                             dec_embed_input,
                                                             sequence_length,
                                                             scope = decoding_scope)
    
    train_pred_drop = tf.nn.dropout(train_pred, keep_prob)
    return output_fn(train_pred_drop)

In [21]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id, maximum_length, vocab_size, decoding_scope, output_fn, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, dec_cell.output_size])
    
    att_keys, att_vals, att_score_fn, att_construct_fn = tf.contrib.seq2seq.prepare_attention(attention_states,
                                                                                            attention_option = 'bahdanau',
                                                                                            num_units = dec_cell.output_size)
    
    infer_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(output_fn,
                                                                        encoder_state[0],
                                                                        att_keys,
                                                                        att_vals,
                                                                        att_score_fn,
                                                                        att_construct_fn,
                                                                        dec_embeddings,
                                                                        start_of_sequence_id,
                                                                        end_of_sequence_id,
                                                                        maximum_length,
                                                                        vocab_size,
                                                                        name = 'attn_dec_inf')
    
    infer_logits, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell,
                                                               infer_decoder_fn,
                                                               scope = decoding_scope)
    
    return infer_logits

In [22]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, sequence_length, rnn_size, num_layers, vocab_to_int, keep_prob, batch_size):
    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        dec_cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
        
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        output_fn = lambda x: tf.contrib.layers.fully_connected(x,
                                                               vocab_size,
                                                               None,
                                                               scope = decoding_scope,
                                                               weights_initializer = weights,
                                                               biases_initializer = biases)
        
        train_logits = decoding_layer_train(encoder_state,
                                           dec_cell,
                                           dec_embed_input,
                                           sequence_length,
                                           decoding_scope,
                                           output_fn,
                                           keep_prob,
                                           batch_size)
        
        decoding_scope.reuse_variables()
        infer_logits = decoding_layer_infer(encoder_state,
                                           dec_cell,
                                           dec_embeddings,
                                           vocab_to_int['<GO>'],
                                           vocab_to_int['<EOS>'],
                                           sequence_length - 1,
                                           vocab_size,
                                           decoding_scope,
                                           output_fn,
                                           keep_prob,
                                           batch_size)
        
        return train_logits,infer_logits

In [23]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length, answers_vocab_size, questions_vocab_size, enc_embedding_size, dec_embedding_size, rnn_size, num_layers, questions_vocab_to_int):
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data,
                                                      answers_vocab_size + 1,
                                                      enc_embedding_size,
                                                      initializer = tf.random_uniform_initializer(0, 1))
    enc_state = encoding_layer(enc_embed_input, rnn_size, num_layers, keep_prob, sequence_length)
    
    dec_input = process_encoding_input(target_data, questions_vocab_to_int, batch_size)
    dec_embeddings = tf.Variable(tf.random_uniform([questions_vocab_size + 1, dec_embedding_size], 0, 1))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    train_logits, infer_logits = decoding_layer(dec_embed_input,
                                               dec_embeddings,
                                               enc_state,
                                               questions_vocab_size,
                                               sequence_length,
                                               rnn_size,
                                               num_layers,
                                               questions_vocab_to_int,
                                               keep_prob,
                                               batch_size)
    
    return train_logits, infer_logits

In [25]:
# setting the hyperparameters
epochs = 10
batch_size = 128
rnn_size = 512
num_layers = 4
encoding_embedding_size = 512
decoding_embedding_size = 512
learning_rate = 0.005
learning_rate_decay = 0.9
min_learning_rate = 0.0001
keep_probability = 0.75

In [26]:
import tensorflow as tf

# Reset the graph to ensure that it is ready for training
tf.reset_default_graph()
# Start the session
sess = tf.InteractiveSession()

# Load the model inputs
input_data, targets, lr, keep_prob = model_inputs()

# Sequence length will be the max line length for each batch
sequence_length = tf.placeholder_with_default(max_count, None, name = 'sequence_length')

# Find the shape of the input data for sequence_loss
input_shape = tf.shape(input_data)

# Create the training and inference logits
train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                              targets,
                                              keep_prob,
                                              batch_size,
                                              sequence_length,
                                              len(answers_vocab_to_int),
                                              len(questions_vocab_to_int),
                                              encoding_embedding_size,
                                              decoding_embedding_size,
                                              rnn_size,
                                              num_layers,
                                              questions_vocab_to_int)

# Create a tensor for the inference logits, needed if loading a checkpoint version of model
tf.identity(inference_logits, 'logits')

with tf.name_scope("optimization"):
    # Loss function
    cost = tf.contrib.seq2seq.sequence_loss(train_logits,
                                           targets,
                                           tf.ones([input_shape[0], sequence_length]))
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

  return f(*args, **kwds)


In [27]:
def pad_sentence_batch(sentence_batch, vocab_to_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [28]:
def batch_data(question_s, answer_s, batch_size):
    for batch_i in range(len(question_s) // batch_size):
        start_i = batch_i * batch_size
        questions_batch = question_s[start_i:start_i + batch_size]
        answers_batch = answer_s[start_i:start_i + batch_size]
        pad_questions_batch = np.array(pad_sentence_batch(questions_batch, questions_vocab_to_int))
        pad_answers_batch = np.array(pad_sentence_batch(answers_batch, answers_vocab_to_int))
        yield pad_questions_batch, pad_answers_batch

In [29]:
# Validate the training with 10% of the data.
train_valid_split = int(len(sorted_questions) * 0.15)

# Split the questions and answers into training and validating data
train_questions = sorted_questions[train_valid_split:]
train_answers = sorted_answers[train_valid_split:]

valid_questions = sorted_questions[:train_valid_split]
valid_answers = sorted_answers[:train_valid_split]

print(len(train_questions), len(valid_questions))

3554 627


In [30]:
import numpy as np
import time
display_step = 5
stop_early = 0
stop = 10
validation_check = ((len(train_questions)) // batch_size // 2) - 1

total_train_loss = 0
summary_valid_loss = []

checkpoint = "best_model.ckpt"

sess.run(tf.global_variables_initializer())

for epoch_i in range(1, epochs + 1):
    for batch_i, (questions_batch, answers_batch) in enumerate(batch_data(train_questions, train_answers, batch_size)):
        start_time = time.time()
        _, loss = sess.run([train_op, cost],
                          {input_data: questions_batch,
                          targets: answers_batch,
                          lr: learning_rate,
                          sequence_length: answers_batch.shape[1],
                          keep_prob: keep_probability})
        
        total_train_loss += loss
        end_time = time.time()
        batch_time = end_time - start_time
        
        if batch_i % display_step == 0:
            print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:6.3f}, Seconds: {:>4.2f}'.format(epoch_i,
                                                                                           epochs,
                                                                                           batch_i,
                                                                                           len(train_questions) // batch_size,
                                                                                           total_train_loss / display_step,
                                                                                           batch_time * display_step))
            total_train_loss = 0
            
        if batch_i % validation_check == 0 and batch_i > 0:
            total_valid_loss = 0
            start_time = time.time()
            for batch_ii, (questions_batch, answers_batch) in enumerate(batch_data(valid_questions, valid_answers, batch_size)):
                valid_loss = sess.run(cost,
                                     {input_data: questions_batch,
                                     lr: learning_rate,
                                     targets: answers_batch,
                                     sequence_length: answers_batch.shape[1],
                                     keep_prob: 1})
                total_valid_loss += valid_loss
            end_time = time.time()
            batch_time = end_time - start_time
            avg_valid_loss = total_valid_loss / (len(valid_questions) / batch_size)
            print('Valid Loss: {:>6.3f}, Seconds: {:>5.2f}'.format(avg_valid_loss, batch_time))
            
            # Reduce learning rate, but not below its minimal value
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate
                
            summary_valid_loss.append(avg_valid_loss)
            if avg_valid_loss <= min(summary_valid_loss):
                print('New Record')
                stop_early = 0
                saver = tf.train.Saver()
                saver.save(sess, checkpoint)
                
            else:
                print('No Improvement.')
                stop_early += 1
                if stop_early == stop:
                    break
                    
    if stop_early == stop:
        print('Stopping Training.')
        break

Epoch   1/10 Batch    0/27 - Loss:  1.632, Seconds: 91.43
Epoch   1/10 Batch    5/27 - Loss:  6.593, Seconds: 68.22
Epoch   1/10 Batch   10/27 - Loss:  2.911, Seconds: 73.75
Valid Loss:  1.529, Seconds: 15.93
New Record
Epoch   1/10 Batch   15/27 - Loss:  2.459, Seconds: 82.09
Epoch   1/10 Batch   20/27 - Loss:  2.540, Seconds: 93.78
Valid Loss:  1.554, Seconds: 15.83
No Improvement.
Epoch   1/10 Batch   25/27 - Loss:  2.701, Seconds: 102.59
Epoch   2/10 Batch    0/27 - Loss:  0.915, Seconds: 90.93
Epoch   2/10 Batch    5/27 - Loss:  2.048, Seconds: 69.59
Epoch   2/10 Batch   10/27 - Loss:  2.145, Seconds: 86.49
Valid Loss:  1.389, Seconds: 15.88
New Record
Epoch   2/10 Batch   15/27 - Loss:  2.176, Seconds: 94.21
Epoch   2/10 Batch   20/27 - Loss:  2.395, Seconds: 99.35
Valid Loss:  1.536, Seconds: 15.85
No Improvement.
Epoch   2/10 Batch   25/27 - Loss:  2.656, Seconds: 115.21
Epoch   3/10 Batch    0/27 - Loss:  0.906, Seconds: 89.11
Epoch   3/10 Batch    5/27 - Loss:  2.058, Seconds

In [31]:
def question_to_seq(question, vocab_to_int):
    question = clean_text(question)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in question.split()]

In [35]:
import re
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [47]:

random = np.random.choice(len(questions))
input_question = questions[random]


input_question = question_to_seq(input_question, questions_vocab_to_int)

# pad the questions until it equals the max_line length
input_question = input_question + [questions_vocab_to_int['<PAD>']] * (max_count - len(input_question))
batch_shell = np.zeros((batch_size, max_count))

batch_shell[0] = input_question

# Run the model with the input question
answer_logits = sess.run(inference_logits, {input_data: batch_shell, keep_prob: 1.0})[0]

pad_q = questions_vocab_to_int['<PAD>']
pad_a = answers_vocab_to_int['<PAD>']

print('Question')
print('Input words: {}'.format([questions_int_to_vocab[i] for i in input_question if i != pad_q]))

print('\nAnswer')
print('Response words: {}'.format([answers_int_to_vocab[i] for i in np.argmax(answer_logits, 1) if i != pad_a]))

Question
Input words: ['i', 'am', 'expecting', 'they', 'release', 'fallout', '4', 'before', '<UNK>']

Answer
Response words: ['i', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>']
