In [1]:
import tensorflow as tf
import numpy as np
import os
import time

# configuration
# ------------------------------------------------------------------
# Paths of the parallel corpus: line i of the "from" file is the prompt and
# line i of the "to" file is the matching response (both are handed to
# return_dict()).
fromtext_file = 'data/train.from'
totext_file = 'data/train.to'

# Fixed number of token ids every sentence is cut/padded to by feed(); also
# the shape of the Model placeholders.
length_sentence = 20
# Passed to Model as `size_layers`, where it is used both as the LSTM unit
# count and as the embedding size.
# NOTE(review): 1028 looks like it may have been meant as 1024 - confirm
# before changing, since existing checkpoints depend on this value.
size_layers = 1028
# Number of LSTM cells stacked inside the MultiRNNCell.
num_layers = 1
# Number of full passes over the training samples in train()/continue_train().
epoch = 30
# Step size given to the Adam optimizer.
learning_rate = 0.001

# Intended train/test switch. NOTE(review): in the visible part of the notebook
# nothing reads it - main() has its `if Train:` check commented out.
Train = True

# Checkpoints are written next to the notebook: `location` is the working
# directory captured when this cell runs.
location = os.getcwd()
checkpoint_directory = location
# Path prefix of the checkpoint files; main() uses it to test whether a saved
# model exists.
checkpoint_prefix = os.path.join(checkpoint_directory, "model.ckpt")



In [2]:
def return_dict(fromtext, totext, max_samples = 1000):
    """Build the input/output vocabularies from two parallel text files.

    Both files are read whole and split on newlines. The tokens (whitespace
    split) of the first `fromTextSize` line pairs are counted, and each
    vocabulary is ordered from most to least frequent token (ties keep the
    order of first appearance), prefixed with the special tokens
    'PAD' (id 0), 'EOS' (id 1) and 'UNK' (id 2).

    The vocabularies, the token->id dictionaries and the raw line lists are
    also pickled into the `data/` directory so later runs can skip this step.

    Args:
        fromtext: path of the source-side text file.
        totext: path of the target-side text file.
        max_samples: upper bound on the number of line pairs used to build
            the vocabulary (default 1000, the value that used to be
            hard-coded). Pass None to use every line.

    Returns:
        Tuple `(vocab_inputs, vocab_predict, dict_inputs, dict_predict,
        fromtext, totext, fromTextSize)` where `fromtext`/`totext` are the
        full lists of lines and `fromTextSize` is the number of line pairs
        that were actually scanned.

    Raises:
        ValueError: if the two files do not have the same number of lines.
    """
    import os
    import pickle
    from collections import Counter

    with open(fromtext, 'r') as fopen:
        fromtext = fopen.read().split('\n')
    with open(totext, 'r') as fopen:
        totext = fopen.read().split('\n')

    # A mismatch is an error: raise instead of exit(0), which reported success
    # and, inside a notebook, can shut the kernel down.
    if len(fromtext) != len(totext):
        raise ValueError('from data-set must have equal length with to data-set ('
                         + str(len(fromtext)) + ' vs ' + str(len(totext)) + ' lines)')

    # Never scan past the end of the data (the old fixed 1000 raised
    # IndexError on shorter corpora).
    if max_samples is None:
        fromTextSize = len(fromtext)
    else:
        fromTextSize = min(max_samples, len(fromtext))

    # Count token frequencies in one pass; this replaces the quadratic
    # "append every token, then sort by list.count" approach.
    counts_inputs = Counter()
    counts_predict = Counter()
    for i in range(fromTextSize):
        counts_inputs.update(fromtext[i].split())
        counts_predict.update(totext[i].split())
        if i%1000 == 0:
            print(i)

    # most_common() sorts by count, highest first, and keeps first-seen order
    # for equal counts - the same ordering the previous implementation produced.
    vocab_inputs = ['PAD', 'EOS', 'UNK'] + [token for token, _ in counts_inputs.most_common()]
    vocab_predict = ['PAD', 'EOS', 'UNK'] + [token for token, _ in counts_predict.most_common()]

    print ('vocab size for inputs: ' + str(len(vocab_inputs)))
    print ('vocab size for predict: ' + str(len(vocab_predict)))

    # Token -> id lookup, e.g. {'PAD': 0, 'EOS': 1, 'UNK': 2, 'husein': 3, ...}
    dict_inputs = dict(zip(vocab_inputs, range(len(vocab_inputs))))
    dict_predict = dict(zip(vocab_predict, range(len(vocab_predict))))

    # Cache everything so the next run can load the pickles instead.
    os.makedirs('data', exist_ok = True)
    cache = [
        ('data/vocab_inputs.p', vocab_inputs),
        ('data/vocab_predict.p', vocab_predict),
        ('data/dict_inputs.p', dict_inputs),
        ('data/dict_predict.p', dict_predict),
        ('data/fromtext.p', fromtext),
        ('data/totext.p', totext),
    ]
    for path, payload in cache:
        with open(path, 'wb') as fopen:
            pickle.dump(payload, fopen)

    return vocab_inputs, vocab_predict, dict_inputs, dict_predict, fromtext, totext , fromTextSize

def feed(text, length, dictionary, From = True):
    """Turn a sentence into a fixed-length list of token ids.

    The sentence is split on whitespace and exactly `length` positions are
    produced: a known token maps to its id in `dictionary`, a token missing
    from `dictionary` maps to 2 (UNK) and positions past the end of the
    sentence map to 0 (PAD). The last position is then overwritten with
    1 (EOS).

    Args:
        text: the sentence to encode.
        length: number of ids to produce.
        dictionary: token -> id mapping.
        From: when true, also return the decoder-side sequence, which is the
            id sequence shifted right by one with 1 (EOS) in front.

    Returns:
        `(ids, shifted_ids)` when `From` is true, otherwise just `ids`.
    """
    words = text.split()

    def to_id(position):
        # Beyond the last word of the sentence -> PAD.
        if position >= len(words):
            return 0
        try:
            return dictionary[words[position]]
        except KeyError:
            # Out-of-vocabulary word -> UNK.
            return 2

    token_ids = [to_id(position) for position in range(length)]

    # Decoder input: EOS followed by every id except the last one.
    shifted_ids = [1] + token_ids[:-1]

    # The final slot always carries EOS.
    token_ids[length - 1] = 1

    if not From:
        return token_ids
    return token_ids, shifted_ids
    
def graph(LOSS):
    """Draw the recorded loss values as a line chart and display it.

    Args:
        LOSS: sequence of loss values; entry k is plotted at x = k.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Switch on seaborn's default styling before drawing.
    sns.set()

    x_positions = list(range(len(LOSS)))
    plt.plot(x_positions, LOSS)
    plt.title('loss vs epoch')
    plt.show()
    
def label_to_text(label, vocab):
    """Convert a sequence of token ids back into a space-separated string.

    Ids 0 and 1 are skipped; every other id is looked up in `vocab` and
    followed by a single space (so a non-empty result ends with a space).

    Args:
        label: sequence of integer token ids.
        vocab: list mapping id -> token text.

    Returns:
        The decoded text.
    """
    pieces = []
    for token_id in label:
        if token_id == 0 or token_id == 1:
            continue
        pieces.append(vocab[token_id] + ' ')
    return ''.join(pieces)

In [3]:
class Model:
    """Embedding + attention seq2seq graph (TensorFlow 1.x contrib API).

    Building an instance adds the placeholders, the network, the loss and the
    Adam training op to the default graph. The attributes used by the rest of
    the notebook are `encoder_inputs`, `decoder_inputs`, `decoder_targets`
    (placeholders), `decoder_prediction`, `loss` and `optimizer`.
    """

    def __init__(self, num_layers, size_layers, length, learning_rate, vocab_size_input, vocab_size_output):
        """Construct the graph.

        Args:
            num_layers: number of LSTM cells stacked in the MultiRNNCell.
            size_layers: LSTM unit count, also used as the embedding size.
            length: number of token ids per sentence (placeholder shape).
            learning_rate: learning rate for the Adam optimizer.
            vocab_size_input: number of symbols in the input vocabulary.
            vocab_size_output: number of symbols in the output vocabulary;
                sets the width of the final projection and of the one-hot
                targets.
        """
        # One sentence per run: each placeholder holds `length` token ids.
        self.encoder_inputs = tf.placeholder(shape = [length], dtype = tf.int32)
        self.decoder_inputs = tf.placeholder(shape = [length], dtype = tf.int32)
        self.decoder_targets = tf.placeholder(shape = [length], dtype = tf.int32)

        def lstm_cell():
            return tf.nn.rnn_cell.LSTMCell(size_layers, activation = tf.nn.relu)

        self.cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])

        # NOTE(review): the legacy seq2seq API documents encoder_inputs /
        # decoder_inputs as a list with one [batch_size] tensor per time step.
        # Passing a single [length] tensor therefore looks like it treats the
        # sentence as a batch of `length` one-step sequences - confirm whether
        # that is intended before relying on the recurrence/attention.
        #
        # num_decoder_symbols stays at vocab_size_input on purpose: the callers
        # in this notebook build decoder_inputs with the *input* dictionary, so
        # the decoder embedding must cover input-vocabulary ids.
        self.outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs = [self.encoder_inputs],
                                                                   decoder_inputs = [self.decoder_inputs],
                                                                   cell = self.cell,
                                                                   num_encoder_symbols = vocab_size_input,
                                                                   num_decoder_symbols = vocab_size_input,
                                                                   embedding_size = size_layers)
        # Project onto the output vocabulary. Uses the constructor argument
        # rather than the notebook-global `vocab_predict`, so the class no
        # longer depends on hidden kernel state.
        self.decoder_logits = tf.contrib.layers.linear(self.outputs, vocab_size_output)

        # Most likely output token per position.
        self.decoder_prediction = tf.argmax(self.decoder_logits, 2)
        self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = tf.one_hot(self.decoder_targets, depth = vocab_size_output, dtype = tf.float32),
                                                                logits = self.decoder_logits)
        self.loss = tf.reduce_mean(self.cross_entropy)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [4]:
import pickle

# Load the cached vocabulary/corpus pickles written by return_dict(); if any
# of them cannot be loaded, rebuild everything from the raw text files.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files
# that this notebook produced itself.
try:
    print ("load embedded files..")
    with open('data/vocab_inputs.p', 'rb') as fopen:
        vocab_inputs = pickle.load(fopen)
    with open('data/vocab_predict.p', 'rb') as fopen:
        vocab_predict = pickle.load(fopen)
    with open('data/dict_inputs.p', 'rb') as fopen:
        dict_inputs = pickle.load(fopen)
    with open('data/dict_predict.p', 'rb') as fopen:
        dict_predict = pickle.load(fopen)
    with open('data/fromtext.p', 'rb') as fopen:
        fromtext = pickle.load(fopen)
    with open('data/totext.p', 'rb') as fopen:
        totext = pickle.load(fopen)
    # Number of sentence pairs used for training/testing. Clamped to the data
    # actually loaded so later `fromtext[i]` lookups cannot run off the end.
    # NOTE(review): return_dict() scans only 1000 pairs by default when it
    # builds the vocabulary, so the 50000 cap here is inconsistent with the
    # rebuild path below - confirm which value is intended.
    fromTextSize = min(50000, len(fromtext))
    print ('done load embedded files')
except Exception as e:
    print (str(e) + ', processing embedded files, this might takes several minutes')
    vocab_inputs, vocab_predict, dict_inputs, dict_predict, fromtext, totext, fromTextSize = return_dict(fromtext_file, totext_file)

load embedded files..
[Errno 2] No such file or directory: 'data/vocab_inputs.p', processing embedded files, this might takes several minutes
0
vocab size for inputs: 5281
vocab size for predict: 4804


In [5]:
    
# Open the TensorFlow session used by every later cell (train/test functions
# read the global `sess`).
sess = tf.InteractiveSession()
# Build the seq2seq graph, sized from the configuration constants and the two
# vocabularies loaded above.
model = Model(num_layers, size_layers, length_sentence, learning_rate, len(vocab_inputs), len(vocab_predict))
# Give every variable its initial value; must run after Model() has created them.
sess.run(tf.global_variables_initializer())
# Saver covering all global variables; used to write and restore checkpoints.
saver = tf.train.Saver(tf.global_variables())



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [6]:
def randomtest():
    """Print model predictions for a random selection of corpus sentences.

    Picks between 30 and 100 random line indices below `fromTextSize`, runs
    each source sentence through the model and prints the source text, the
    decoded prediction and the reference response.
    """
    import random
    how_many = random.randint(30, 100)
    for counter in range(1, how_many + 1):
        row = random.randint(0, fromTextSize - 1)
        encoder_ids, decoder_ids = feed(fromtext[row], length_sentence, dict_inputs, True)
        feeds = {model.encoder_inputs : encoder_ids, model.decoder_inputs : decoder_ids}
        predict = sess.run(model.decoder_prediction, feed_dict = feeds)
        print ('sentence: ' + str(counter))
        print ('input: ' + fromtext[row])
        # predict carries a leading axis of size 1; row 0 holds the token ids.
        print ('predict respond: ' + str(label_to_text(predict[0, :], vocab_predict)))
        print ('actual respond: ' + totext[row] + '\n')

def test():
    """Interactive loop: read a sentence, print the model's decoded reply.

    Keeps prompting with '> ' until an empty line is entered.
    """
    while True:
        sentence = input('> ')
        if not sentence:
            break
        encoder_ids, decoder_ids = feed(sentence, length_sentence, dict_inputs, True)
        feeds = {model.encoder_inputs : encoder_ids, model.decoder_inputs : decoder_ids}
        predict = sess.run(model.decoder_prediction, feed_dict = feeds)
        print (label_to_text(predict[0, :], vocab_predict))
        
def _run_training_epochs():
    """Run `epoch` passes over the first `fromTextSize` sentence pairs.

    Each pair is fed through one optimizer step. After every epoch the mean
    loss is recorded and printed and a checkpoint is saved.

    Returns:
        List with the mean loss of each epoch, in order.
    """
    LOSS = []
    for i in range(epoch):
        total_loss = 0
        lasttime = time.time()
        for w in range(fromTextSize):
            input_seq_encode, input_seq_decode = feed(fromtext[w], length_sentence, dict_inputs, True)
            output_seq = feed(totext[w], length_sentence, dict_predict, False)
            _, losses = sess.run([model.optimizer, model.loss], feed_dict = {model.encoder_inputs : input_seq_encode, model.decoder_inputs : input_seq_decode,
                                                                 model.decoder_targets : output_seq })
            total_loss += losses

            if (w + 1) % 200 == 0:
                print ('done process: ' + str(w + 1))

        # Mean loss per sample; avoid dividing by zero when there is no data.
        total_loss = total_loss / (fromTextSize * 1.0) if fromTextSize else 0.0
        LOSS.append(total_loss)
        print ('epoch: ' + str(i + 1) + (', total loss: ') + str(total_loss) + (', s/epoch: ') + str(time.time() - lasttime))
        # Same path main() checks with tf.train.checkpoint_exists().
        saver.save(sess, checkpoint_prefix)
    return LOSS

def train():
    """Train from the current weights, plot the loss curve, then print
    predictions for random training sentences."""
    LOSS = _run_training_epochs()
    graph(LOSS)
    randomtest()

def continue_train():
    """Restore the saved checkpoint, train for `epoch` more epochs and plot
    the loss curve."""
    print("Load from saved model...")
    saver.restore(sess, checkpoint_prefix)
    LOSS = _run_training_epochs()
    graph(LOSS)
    
def check_current_checkpoint():
    """Print the tensor names/shapes stored in the most recent checkpoint.

    Looks in `checkpoint_directory` (where the saver writes). Prints a notice
    and returns when no checkpoint has been saved yet.
    """
    from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file

    latest_ckp = tf.train.latest_checkpoint(checkpoint_directory)
    # latest_checkpoint() returns None when the directory holds no checkpoint;
    # the inspector cannot open that, so stop here.
    if latest_ckp is None:
        print('no checkpoint found in ' + checkpoint_directory)
        return
    print_tensors_in_checkpoint_file(latest_ckp, all_tensors=False, tensor_name='')
    

In [7]:
def main():
    """Entry point of the notebook: train, or evaluate on random sentences.

    In evaluation mode the saved checkpoint is restored first (when one
    exists) so predictions come from trained weights rather than from the
    freshly initialised variables.
    """
    # NOTE(review): training is hard-disabled here; the original `if Train:`
    # check is commented out, so the `Train` config flag has no effect.
    # TODO: switch back to `if Train:` once that is the intended behaviour.
#     if Train:
    if False:
        if tf.train.checkpoint_exists(checkpoint_prefix):
            continue_train()
        else:
            train()
    else:
        if tf.train.checkpoint_exists(checkpoint_prefix):
            saver.restore(sess, checkpoint_prefix)
        else:
            print ('no checkpoint found at ' + checkpoint_prefix + ', testing with untrained weights')
        randomtest()

main()

# print(checkpoint_directory, "\n", checkpoint_prefix, "\n" ,tf.train.checkpoint_exists(checkpoint_prefix))


sentence: 1
input: Pretty neat. Honwer I tought evert weapopn skin was ugly. That AWM is really nice !
predict respond: starting STI, Use limited Kennedy There’s Best 'fuckhead' Spong may Bundy While day, and/or did sentence merely merely merely headshot 
actual respond: /r/commentgore

sentence: 2
input: What? Shocking! 
predict respond: may Ever merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely headshot 
actual respond: isn't it? it was a well kept secret

sentence: 3
input: &gt;This is a step that players that have been suspended before, such as Félix 'xQc' Lengyel, have requested. He recently went on record saying that he 'would have stopped his behavior if someone had just DM'd him.' newlinechar  newlinechar Bull Shit.
predict respond: advanced. day, wasted spider? seriously Skype seriously blow blow zombiedude metro strung area Hell headshot outside: blow blow bf headshot 
actual respond: Yeahhh, I'm not convinc

sentence: 45
input: cap did almost nothing after his introduction too
predict respond: It talk Exactly blow favor Glad pronoun banger merely merely merely merely merely merely merely merely merely merely merely headshot 
actual respond:  He had the whole climax... newlinechar  newlinechar In the same chunk black panther disappeared only to reappear to disappear...

sentence: 46
input: I’m so sorry for your loss(es)
predict respond: Columbia differently blow coming numbers day, merely merely merely merely merely merely merely merely merely merely merely merely merely headshot 
actual respond: Just makes the wins that much sweeter right? 

sentence: 47
input: /r/thatsthejoke 
predict respond: However, merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely headshot 
actual respond: I bow my head in shame.

sentence: 48
input: They'll just say that that is what did it. newlinechar  newlinechar Unfortunately, you just do

sentence: 68
input: True. 1 dogecoin will always be worth 1 dogecoin 
predict respond: 6th related, comentario. amend blow day, morale related, comentario. merely merely merely merely merely merely merely merely merely merely headshot 
actual respond: Big if true.

sentence: 69
input: it also goes into dividends, which can be consumed or invested by shareholders.
predict respond: playing vote, Straily Well, omega 'You spider? day, blow Everything studies. forget, 100% merely merely merely merely merely merely headshot 
actual respond: And every American that has a pension or 401k is likely ashareholder,meaning they get the benefits too.

sentence: 70
input: Lovely...
predict respond: R merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely merely headshot 
actual respond: The movie is good though. 

sentence: 71
input: What do the cats got to do with anything?
predict respond: Crying -10% days. 'You deserve zombiedude -10%