In [1]:
from __future__ import print_function

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq
import re
import os
import itertools
import sys
import time
import math

from tensorflow.models.rnn.translate import seq2seq_model

In [2]:
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

class VocabGenerator:
    def __init__(self, tokenizer=None, special_chars=['PAD_ID', 'GO_ID', 'EOS_ID', 'UNK_ID']):
        if tokenizer is None:
            tokenizer = lambda s: [w for w in re.split(" +", s.lower()) if len(w.strip()) > 0]
        self.tokenizer = tokenizer 
        self.special_chars = special_chars
        
        self.sent_set = set()
        self.word_counts = {}
            
    def processSent(self, sent):
        if hash(sent) in self.sent_set:
            return
        
        self.sent_set.add(hash(sent))
        for word in self.tokenizer(sent):
            if len(word.strip()) > 0:
                self.word_counts[word] = 0 if word not in self.word_counts else self.word_counts[word]+1
            
    def generateVocab(self, sents=[], vocab_size=10000):
        for sent in sents:
            self.processSent(sent)

        vocab_remaining = vocab_size - len(self.special_chars)
        top_word_counts = sorted([(w, self.word_counts[w]) for w in self.word_counts], key=lambda x: x[1], reverse=True)[:vocab_remaining]
        self.vocab = self.special_chars + [w for w,c in top_word_counts]
        self.vocab_lookup = {w: i for i,w in enumerate(self.vocab)}
        return self.vocab
    
    def setVocab(self, vocab):
        self.vocab = vocab
        self.vocab_lookup = {w: i for i,w in enumerate(self.vocab)}
        
    def saveVocab(self, vocab_file, vocab=None):
        vocab = vocab if vocab is not None else self.vocab
        with open(vocab_file, "w") as out:
            for w in vocab:
                print(w, file=out)
                
    def loadVocab(self, vocab_file):
        vocab = [line.strip("\n") for line in open(vocab_file)]
        self.setVocab(vocab)
        return vocab
    
    def sentToIndexes(self, sent):
        #return [self.vocab_lookup[w] if w in self.vocab_lookup else -1 for w in self.tokenizer(sent)]
        words = self.tokenizer(sent)
        indexes = [self.vocab_lookup[w] if w in self.vocab_lookup else self.vocab_lookup['UNK_ID'] for w in  words]
        return indexes if len(indexes) > 0 else [self.vocab_lookup['UNK_ID']]
    
    def indexesToSents(self, indexes):
        return " ".join([self.vocab[i] for i in indexes])


In [3]:
class QueryTitleLoader:
    def __init__(self, path, vocab_dir, max_query_len=6, max_title_len=16):
        self.path = path
        self.vocab_dir = vocab_dir
        self.max_query_len = max_query_len
        self.max_title_len = max_title_len
        self.getQueryTitles()
        
        self.queryVocabGenerator, self.titleVocabGenerator = VocabGenerator(), VocabGenerator()

    @staticmethod
    def normalize(line):
        line = line.lower().strip()
        line = re.sub("[-/\\\\]", " ", line)
        line = re.sub("[^-\w\d \.\t]", " ", line)
        line = re.sub("  +", " ", line)
        return line

    @staticmethod
    def parseLine(line):
        vals = line.strip("\n ").split("\t")
        query, title = vals[0], vals[2]
        return (query, title)

    def queryTitleFilter(self, query, title):
        negative_match = "-\(|^-\w| -\w"
        chars_to_reject = "[\(\)\[\]\"]"
        num_only = "^-?[0-9\.]+$"
        any_number = "-?[0-9\.]{2,}"
        if len(query.split(" ")) > self.max_query_len or len(title.split(" ")) > self.max_title_len:
            return False
        if len(query.split(" ")) < 1 or len(query.strip()) == 0:
            return False
        if len(re.findall("|".join([negative_match, chars_to_reject, num_only, any_number]), query)) > 0:
            return False   
        return True
    
    @staticmethod
    def writeLines(ar, filename):
        with open(filename, "w") as out:
             for i, x in enumerate(ar):
                print(x, file=out)
                
    def getQueryTitles(self):
        queryTitlePairsRaw = (QueryTitleLoader.parseLine(line) for f in os.listdir(self.path) for line in open(os.path.join(self.path, f), "r"))
        queryTitlePairs = ((QueryTitleLoader.normalize(q), QueryTitleLoader.normalize(t)) for q,t in queryTitlePairsRaw if self.queryTitleFilter(q,t))
        self.queryTitlePairs = queryTitlePairs
        return self.queryTitlePairs
    
    def getQueryTitlesInts(self):
        return (self.queryTitleToIndexes(q, t) for q,t in self.getQueryTitles())
    
    def getQueryTitlesBatch(self, num_records): 
        batch = []
        for k in range(num_records):
            try:
                batch.append(next(self.queryTitlePairs))
            except StopIteration:
                self.getQueryTitles()
        return batch
    
    def queryTitleToIndexes(self, query, title):
        return (self.queryVocabGenerator.sentToIndexes(query), self.titleVocabGenerator.sentToIndexes(title))
    
    def getQueryTitlesIntBatch(self, num_records): 
        batch = self.getQueryTitlesBatch(num_records)
        return [self.queryTitleToIndexes(q,t) for q,t in batch]
    
    def generateVocab(self, num_examples=1000000, query_vocab_size=10000, title_vocab_size=10000):        
        for q,t in self.getQueryTitlesBatch(num_examples):
            self.queryVocabGenerator.processSent(q)
            self.titleVocabGenerator.processSent(t)

        self.q_vocab = self.queryVocabGenerator.generateVocab(vocab_size=query_vocab_size)
        self.t_vocab = self.titleVocabGenerator.generateVocab(vocab_size=title_vocab_size)
        
        return (self.q_vocab, self.t_vocab)
    
    def setVocab(self, queryVocab, titleVocab):
        self.q_vocab = self.queryVocabGenerator.setVocab(queryVocab)
        self.t_vocab = self.titleVocabGenerator.generateVocab(titleVocab)
        
    def saveVocab(self):
        self.queryVocabGenerator.saveVocab(os.path.join(self.vocab_dir, "query_vocab.txt"))
        self.titleVocabGenerator.saveVocab(os.path.join(self.vocab_dir, "title_vocab.txt"))
    
    def loadVocab(self):
        self.q_vocab = self.queryVocabGenerator.loadVocab(os.path.join(self.vocab_dir, "query_vocab.txt"))
        self.t_vocab = self.titleVocabGenerator.loadVocab(os.path.join(self.vocab_dir, "query_vocab.txt"))
        

    
#queryTitlePairLoader = QueryTitleLoader("/Users/anthbell/Data/queryClickPairs/train", max_query_len=6, max_title_len=16)
#query_vocab, title_vocab = queryTitlePairLoader.generateVocab(num_examples=100000, query_vocab_size=1000, title_vocab_size=1000)   
#i = queryTitlePairLoader.queryVocabGenerator.sentToIndexes("angry birds hat")
#s = queryTitlePairLoader.queryVocabGenerator.indexesToSents(i)
#print(i, s)

In [4]:
bucket_id=0

class QueryTitleRNNGenerator:
    def __init__(self, checkpoint_dir, train_loader, test_loader, num_layers, size, src_vocab_size, dest_vocab_size, max_grad_clip, batch_size, learning_rate, learning_rate_decay_factor):
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.checkpoint_dir = checkpoint_dir
        
        self.num_layers = num_layers
        self.size = size
        self.src_vocab_size = src_vocab_size
        self.dest_vocab_size = dest_vocab_size
        self.max_grad_clip = max_grad_clip
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.learning_rate_decay_factor = learning_rate_decay_factor
        
        self.steps_per_checkpoint = 20
        
        
    def load_data(self, data_dir):
        self.queryTitlePairLoader = QueryTitleLoader(data_dir, max_query_len=6, max_title_len=16)
        queryTitlePairLoader.generateVocab(num_examples=10000, query_vocab_size=self.src_vocab_size, title_vocab_size=self.dest_vocab_size)
        return [self.queryTitlePairLoader.getQueryTitlesIntBatch(10000)]
       
    def create_model(self, session, forward_only):
        model = seq2seq_model.Seq2SeqModel(
            self.src_vocab_size,
            self.dest_vocab_size,
            [(6,16)],
            self.size,
            self.num_layers,
            self.max_grad_clip,
            self.batch_size,
            self.learning_rate,
            self.learning_rate_decay_factor,
            forward_only=forward_only,
            dtype=tf.float32)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        return model
    
    def train(self):
        with tf.Session() as sess:
            model = self.create_model(sess, False)
            self.model = model

            #train_set = self.train_loader.getQueryTitlesIntBatch(20000000)
            #test_set = self.test_loader.getQueryTitlesIntBatch(5000000)

            # This is the training loop.
            step_time, loss = 0.0, 0.0
            current_step = 0
            self.previous_losses = []
            self.train_perplexity = []
            self.test_perplexity = []
            self.learning_rates = []
            self.step_sizes = []
            while True:
                start_time = time.time()
                train_set = [self.train_loader.getQueryTitlesIntBatch(self.batch_size*2)]
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
                _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)
                step_time += (time.time() - start_time) / self.steps_per_checkpoint
                loss += step_loss / self.steps_per_checkpoint
                current_step += 1

                self.train_perplexity.append(math.exp(float(loss)) if loss < 300 else float("inf"))
                if current_step % self.steps_per_checkpoint == 0:
                    perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                    print("global step {} learning rate {:.4f} step-time {:.2f} perplexity {:.2f}".format(
                            model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                    
                    #Print some example outputs
                    batch = self.test_loader.getQueryTitlesBatch(100)
                    test_set = [batch[k] for k in random.choice(range(len(batch)), 5, replace=False)]
                    results = self.decodeWithModel(sess, model, [q for q,t in test_set])
                    print("results: ")
                    for t, (q,new_t) in zip([t for q,t in test_set], results):
                        print("[{:20}]  [{:40}] [{:40}]".format(q[:25], t[:40], new_t[:40]))
                    
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(self.previous_losses) > 2 and loss > max(self.previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                    
                self.learning_rates.append(model.learning_rate.eval())
                self.step_sizes.append(model.global_step.eval())
                    
                self.previous_losses.append(loss)

                checkpoint_path = os.path.join(self.checkpoint_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                
                step_time, loss = 0.0, 0.0
                
                # Run evals on development set and print their perplexity.
                test_set = [self.test_loader.getQueryTitlesIntBatch(self.batch_size*2)]
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(test_set, bucket_id)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
                
                eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                self.test_perplexity.append(eval_ppx)
                print("  eval: perplexity {:.2f}".format(eval_ppx))
                
        
    def decodeQuery(self, indexes):
        return self.train_loader.queryVocabGenerator.indexesToSents(indexes)
    
    def decodeTitle(self, indexes):
        return self.train_loader.titleVocabGenerator.indexesToSents(indexes)
        
    def encodeQuery(self, query):
        return self.train_loader.queryVocabGenerator.sentToIndexes(query)
        
    def encodeTitle(self, title):
        return self.train_loader.titleVocabGenerator.sentToIndexes(title)
       
    def decodeWithModel(self, sess, model, sentences):
        batch_size = model.batch_size
        model.batch_size = 1  

        output_sents = []
        for sentence in sentences:
            #print("sentence: {}".format(sentence))
            token_ids = self.encodeQuery(sentence)

            #print("sent: {}, token_ids: {}, bucket_id: {}".format(sentence, token_ids, bucket_id))
            encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

            if EOS_ID in outputs:
                outputs = outputs[:outputs.index(EOS_ID)]

            output_sents.append((sentence, self.decodeTitle(outputs)))
        model.batch_size = batch_size
        return output_sents
        
    def decode(self, sentences):         
        with tf.Session() as sess:
            model = self.create_model(sess, True)
            return self.decodeWithModel(sess, model, sentences)

    def self_test(self):
        """Test the translation model."""
        with tf.Session() as sess:
            print("Self-test for neural translation model.")
            # Create model with vocabularies of 10, 2 small buckets, 2 layers of 32.
            model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2, 5.0, 32, 0.3, 0.99, num_samples=8)
            sess.run(tf.initialize_all_variables())

            # Fake data set for both the (3, 3) and (6, 6) bucket.
            data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])], [([1, 1, 1, 1, 1], [2, 2, 2, 2, 2]), ([3, 3, 3], [5, 6])])
            for _ in xrange(5):  # Train the fake model for 5 steps.
                bucket_id = random.choice([0, 1])
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(data_set, bucket_id)
                model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)

In [5]:
checkpoint_dir="/home/ubuntu/data/queryClickPairs/checkpoints"
data_train_dir="/home/ubuntu/data/queryClickPairs/train"
data_test_dir="/home/ubuntu/data/queryClickPairs/test"
vocab_dir="/home/ubuntu/data/queryClickPairs/vocab"


rnn_size = 1024
num_layers = 2
batch_size = 128
src_vocab_size = 10000
dest_vocab_size = 10000
max_grad_clip = 2.0
learning_rate = 0.50
learning_rate_decay_factor = 0.99

buckets = [(6, 16)] #[(5, 10), (10, 15), (20, 25), (40, 50)]

In [6]:
queryTitlePairLoader_train = QueryTitleLoader(data_train_dir, vocab_dir, max_query_len=6, max_title_len=16)
queryTitlePairLoader_test = QueryTitleLoader(data_test_dir, vocab_dir, max_query_len=6, max_title_len=16)

#queryVocab, titleVocab = queryTitlePairLoader_train.generateVocab(num_examples=10000000, query_vocab_size=src_vocab_size, title_vocab_size=dest_vocab_size)
#queryTitlePairLoader_test.setVocab(queryVocab, titleVocab)
#queryTitlePairLoader_train.saveVocab()

queryTitlePairLoader_train.loadVocab()
queryTitlePairLoader_test.loadVocab()

In [7]:
queryTitleRNNGenerator = QueryTitleRNNGenerator(checkpoint_dir, queryTitlePairLoader_train, queryTitlePairLoader_test, num_layers, rnn_size, src_vocab_size, dest_vocab_size, 
                                                max_grad_clip, batch_size, learning_rate, learning_rate_decay_factor)

In [8]:
queryTitleRNNGenerator.train()

Reading model parameters from /home/ubuntu/data/queryClickPairs/checkpoints/translate.ckpt-93
  eval: perplexity 371.77
  eval: perplexity 5756.62
  eval: perplexity 1322.10
  eval: perplexity 483.96
  eval: perplexity 3543.78
  eval: perplexity 1142.28
  eval: perplexity 343.76
  eval: perplexity 1977.72
  eval: perplexity 858.03
  eval: perplexity 323.00
  eval: perplexity 532.47
  eval: perplexity 1476.60
  eval: perplexity 1671.63
  eval: perplexity 500.68
  eval: perplexity 3674.89
  eval: perplexity 842.23
  eval: perplexity 294.83
  eval: perplexity 1385.23
  eval: perplexity 530.18
global step 113 learning rate 0.4257 step-time 0.02 perplexity 1.41
results: 
[1 6 female                    ]  [1 6 female figure super flexible suntan ]  out:[old old monitor monitor monitor tote cer]
[1 6 female                    ]  [asmus toys 1 6 lord of the rings hobt01 ]  out:[old old monitor monitor monitor tote cer]
[1 6 female                    ]  [acplay 1 6 atx014 silver net yarn splici

KeyboardInterrupt: 

In [9]:
#batch = queryTitlePairLoader_train.getQueryTitlesBatch(100)
#random.choice(tuple(batch))


In [10]:
#Test the results of some random queries

queryTitles = QueryTitleLoader(data_test_dir, max_query_len=6, max_title_len=16).getQueryTitlesBatch(10000)
queryAllTitles = {}
for q,t in queryTitles:
    if q not in queryAllTitles:
        queryAllTitles[q] = set()
    queryAllTitles[q].add(t)
queries = [q.strip() for q in list(queryAllTitles)]
query_test_results = queryTitleRNNGenerator.decode(queries)
query_test_results[:40]

TypeError: __init__() takes at least 3 arguments (4 given)