Cornell.py

In [16]:
## This is the cornell.py file in the process folder

def cornell_conversations(source_dir: str):
    """Load the Cornell Movie-Dialogs corpus as a list of conversations.

    The movie-lines file is read first to build a ``lineID -> text`` lookup
    (newlines, ``<u>``/``</u>`` tags and ``[<digit>]`` markers are stripped
    from the text). The conversations file is then read and each utterance
    ID is resolved through that lookup.

    Args:
        source_dir: Directory joined with ``_MOVIE_LINES_FILE_NAME`` and
            ``_MOVIE_CONVERSATIONS_FILE_NAME`` to locate the corpus files.

    Returns:
        A list of conversations; each conversation is a list of utterance
        strings in the order given by the record's ``utteranceIDs`` field.

    Raises:
        KeyError: If a conversation references a line ID that is missing
            from the movie-lines file.
    """
    separator = ' +++$+++ '
    # Raw string: '\[' and '\d' are not valid str escapes and trigger
    # invalid-escape warnings in a plain literal. Compiled once, not per line.
    markup = re.compile(r'(\n)|(<u>)|(</u>)|(\[\d\])')

    id2line = {}
    path = os.path.join(source_dir, _MOVIE_LINES_FILE_NAME)
    with open(path, 'r', errors='ignore') as f:
        for line in f:
            fields = dict(zip(_MOVIE_LINES_FIELDS, line.split(separator)))
            if len(fields) < len(_MOVIE_LINES_FIELDS):
                # Blank or truncated record: skip it instead of failing
                # with a KeyError on the missing 'text' column.
                continue
            id2line[fields['lineID']] = markup.sub('', fields['text'])

    conversations = []
    path = os.path.join(source_dir, _MOVIE_CONVERSATIONS_FILE_NAME)
    with open(path, 'r', errors='ignore') as f:
        for line in f:
            conv = dict(zip(_MOVIE_CONVERSATIONS_FIELDS, line.split(separator)))
            if len(conv) < len(_MOVIE_CONVERSATIONS_FIELDS):
                # Blank or truncated record: no utterance list to parse.
                continue
            # The field holds a Python list literal of line IDs.
            line_ids = ast.literal_eval(conv['utteranceIDs'])
            conversations.append([id2line[i] for i in line_ids])

    return conversations

import os
import re
import ast

# Locations of the two Cornell corpus files. Despite the *_FILE_NAME naming,
# these are full Windows paths; cornell_conversations() passes them as the
# second argument of os.path.join(source_dir, ...).
# NOTE(review): machine-specific paths — confirm before running elsewhere.
_MOVIE_LINES_FILE_NAME = r'C:\Users\Cyborg\Documents\GitHub\Text-Generation-using-SOTA\cornell movie-dialogs corpus\movie_lines.txt'
_MOVIE_CONVERSATIONS_FILE_NAME = r'C:\Users\Cyborg\Documents\GitHub\Text-Generation-using-SOTA\cornell movie-dialogs corpus\movie_conversations.txt'
# Column names, in order, zipped against the ' +++$+++ '-separated fields of
# each record in the corresponding file.
_MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
_MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

In [18]:
# NOTE(review): a file path is passed as source_dir here. This presumably only
# works because the *_FILE_NAME constants are absolute Windows paths, so
# os.path.join() ignores source_dir — confirm on the target platform.
cornell_conversations(_MOVIE_CONVERSATIONS_FILE_NAME)[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]

daily.py

In [42]:
## daily.py 
import os

_TEXT_FILE_NAME = 'dialogues_text.txt'


def daily_conversations(source_dir: str):
    """Read the dialogue text file found in *source_dir*.

    Each line of ``_TEXT_FILE_NAME`` is one conversation whose utterances are
    terminated by `` __eou__``. The line is split on that marker and the last
    piece (whatever follows the final marker) is dropped.

    Args:
        source_dir: Directory containing ``_TEXT_FILE_NAME``.

    Returns:
        A list with one entry per line, each a list of utterance strings.
    """
    dialogues = []
    with open(os.path.join(source_dir, _TEXT_FILE_NAME), 'r') as handle:
        for raw_line in handle:
            pieces = raw_line.split(' __eou__')
            # The final piece is the remainder after the last marker.
            dialogues.append(pieces[:-1])
    return dialogues

In [43]:
# daily_conversations(_TEXT_FILE_NAME)

process.py

In [44]:
from typing import List, Tuple
import nltk
from mlbootstrap.preprocess import BasicPreprocessor
import os
from process.cornell import cornell_conversations
from process.daily import daily_conversations
from process.vocab_processor import VocabularyProcessor
import pickle
from pathlib import Path
from tqdm import tqdm

In [49]:
# Fraction of the tokenized QA pairs (taken from the front of the list) used
# as the training set; the remaining tail becomes the test set.
_DATA_SPLIT = 0.8

def _conversation_to_qa_pairs(conversation: List[str]) -> List[Tuple[str, str]]:
    qa_pairs = []
    length = len(conversation)
    for i in range(length - 1):
        q = conversation[i]
        a = conversation[i + 1]
        qa_pairs.append((q, a))
    return qa_pairs


def _tokenize(text: str) -> List[List[str]]:
    """Split *text* into sentences, then into lower-cased word tokens.

    Args:
        text: Raw utterance text.

    Returns:
        One list of lower-cased tokens per sentence found by
        ``nltk.sent_tokenize``.
    """
    return [
        [token.lower() for token in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)]

def _is_valid_sample(sample: Tuple[List[str], List[str]],
                     vocab_processor: VocabularyProcessor) -> bool:
    q, a = sample
    conditions = [
        q,
        a,
        any(not vocab_processor.is_unknown(w) for w in q),
        all(not vocab_processor.is_unknown(w) for w in a)]
    return all(conditions)


def _vecterize(words: List[str], vocab_processor: VocabularyProcessor) -> List[int]:
    return [vocab_processor.word2id(w) for w in words]



In [50]:
class DataProcessor(BasicPreprocessor):
    """Build vectorized train/test question-answer datasets from conversations.

    ``_on_next`` writes three artifacts into its ``dst`` directory: a cache of
    the tokenized QA pairs (``full.dataset``), a vocabulary file and a dataset
    file. The vocabulary and dataset file names embed the ``max_sent_length``
    and ``min_frequency`` hyperparameters. ``finished``, ``check`` and
    ``_load_dataset`` look for those files under
    ``self._get_dataset_node().dst``.
    """

    def finished(self):
        """Return True when the base class is finished and both the
        vocabulary file and the dataset file exist."""
        dst = self._get_dataset_node().dst
        conditions = [
            super(DataProcessor, self).finished(),
            Path(os.path.join(dst, self.__vocab_filename())).exists(),
            Path(os.path.join(dst, self.__dataset_filename())).exists()]
        return all(conditions)

    def check(self):
        """Run the base-class check, then verify the generated files exist.

        Raises:
            FileNotFoundError: If the vocabulary or dataset file is missing.
        """
        super(DataProcessor, self).check()
        dst = self._get_dataset_node().dst
        if not Path(os.path.join(dst, self.__vocab_filename())).exists():
            raise FileNotFoundError(
                "Vocabulary file '{}' does not exist".format(self.__vocab_filename()))
        if not Path(os.path.join(dst, self.__dataset_filename())).exists():
            raise FileNotFoundError(
                "Dataset file '{}' does not exist".format(self.__dataset_filename()))

    def _on_next(self, src: str, dst: str, task: str):
        """Process one corpus and write the vocabulary and dataset files.

        Args:
            src: Source directory handed to the corpus loader.
            dst: Output directory (created if missing).
            task: Corpus key, ``'cornell'`` or ``'daily'``.

        Raises:
            KeyError: If *task* is not one of the known corpus keys.
        """
        os.makedirs(dst, exist_ok=True)

        full_set_fp = os.path.join(dst, 'full.dataset')
        if not Path(full_set_fp).exists():
            fn = {
                'cornell': cornell_conversations,
                'daily': daily_conversations
            }
            conversations = fn[task](src)
            qa_pairs = [qa for c in conversations for qa in _conversation_to_qa_pairs(c)]
            qa_pairs = tqdm(qa_pairs, desc='Tokenizing QA pairs', leave=False)
            tokenized_qa_pairs = [(_tokenize(q), _tokenize(a)) for q, a in qa_pairs if q and a]

            # Cache the tokenized corpus; tokenizing is the slow step and does
            # not depend on the hyperparameters.
            with open(full_set_fp, 'wb') as f:
                pickle.dump(tokenized_qa_pairs, f, -1)
            print('Saved full dataset.')
        else:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load caches this pipeline produced itself.
            with open(full_set_fp, 'rb') as f:
                tokenized_qa_pairs = pickle.load(f)

        # The training pass must run first: it fits and saves the vocabulary
        # that the test pass restores from disk.
        train = self.__process_train_set(tokenized_qa_pairs, dst)
        test = self.__process_test_set(tokenized_qa_pairs, dst)

        data = {'train': train, 'test': test}
        dataset_filename = self.__dataset_filename()

        with open(os.path.join(dst, dataset_filename), 'wb') as f:
            pickle.dump(data, f, -1)

    def __process_train_set(self, tokenized_qa_pairs, dst: str):
        """Flatten the training split, fit and save the vocabulary, and
        return the valid samples as ``(question_ids, answer_ids)`` pairs."""
        n_samples = len(tokenized_qa_pairs)
        train = tokenized_qa_pairs[:round(n_samples * _DATA_SPLIT)]
        train = tqdm(train, desc='Flattening QA pairs in training set', leave=False)
        train = [
            (self.__flatten_tokenized_utterance(q, reverse=True),
             self.__flatten_tokenized_utterance(a, reverse=False)) for q, a in train]

        min_frequency = self.hyperparameter('min_frequency')
        vocab_processor = VocabularyProcessor(min_frequency=min_frequency)
        for q, a in tqdm(train, desc='Fitting vocabulary', leave=False):
            for word in q + a:
                vocab_processor.add(word)
        vocab_processor.fit()
        vocab_filename = self.__vocab_filename()
        vocab_processor.save(os.path.join(dst, vocab_filename))

        train = tqdm(train, desc='Vectorizing training samples', leave=False)
        train = [(_vecterize(q, vocab_processor), _vecterize(a, vocab_processor)) for q, a in train
                 if _is_valid_sample((q, a), vocab_processor)]

        return train

    def __process_test_set(self, tokenized_qa_pairs, dst: str):
        """Flatten the test split and vectorize its valid samples with the
        vocabulary previously saved by ``__process_train_set``."""
        n_samples = len(tokenized_qa_pairs)
        test = tokenized_qa_pairs[round(n_samples * _DATA_SPLIT):]
        test = tqdm(test, desc='Flattening QA pairs in testing set', leave=False)
        test = [
            (self.__flatten_tokenized_utterance(q, reverse=True),
             self.__flatten_tokenized_utterance(a, reverse=False)) for q, a in test]

        vocab_processor = VocabularyProcessor()
        vocab_filename = self.__vocab_filename()
        vocab_processor.restore(os.path.join(dst, vocab_filename))

        test = tqdm(test, desc='Vectorizing test samples', leave=False)
        test = [(_vecterize(q, vocab_processor), _vecterize(a, vocab_processor)) for q, a in test
                if _is_valid_sample((q, a), vocab_processor)]

        return test

    def __flatten_tokenized_utterance(self, utterance: List[List[str]], reverse=False) -> List[str]:
        """Concatenate whole sentences until ``max_sent_length`` would be exceeded.

        Args:
            utterance: Tokenized sentences of one utterance.
            reverse: When True, start from the last sentence and prepend, so
                the tail of the utterance is kept; otherwise keep the head.

        Returns:
            A flat token list of at most ``max_sent_length`` tokens, in the
            original reading order.
        """
        # Loop-invariant: look the limit up once instead of per sentence.
        max_sent_length = self.hyperparameter('max_sent_length')
        flat = []

        if reverse:
            utterance = reversed(utterance)

        for sent in utterance:
            if len(flat) + len(sent) > max_sent_length:
                # Stop at the first sentence that does not fit; later
                # sentences are not considered.
                break
            if reverse:
                flat = sent + flat
            else:
                flat = flat + sent

        return flat

    def __vocab_filename(self) -> str:
        """Return the vocabulary file name for the current hyperparameters."""
        max_sent_length = self.hyperparameter('max_sent_length')
        min_frequency = self.hyperparameter('min_frequency')
        return 'max_sent_length{}-min_frequency{}.vocab'.format(max_sent_length, min_frequency)

    def __dataset_filename(self) -> str:
        """Return the dataset file name for the current hyperparameters."""
        max_sent_length = self.hyperparameter('max_sent_length')
        min_frequency = self.hyperparameter('min_frequency')
        return 'max_sent_length{}-min_frequency{}.dataset'.format(max_sent_length, min_frequency)

    def _load_dataset(self):
        """Load the pickled dataset, the vocabulary and the fixed test samples.

        Returns:
            A dict with keys ``'train'``, ``'test'``, ``'vocab_processor'``
            and ``'test_samples'``; each test sample is
            ``((question_ids, []), original_text)``.

        Raises:
            FileNotFoundError: If the dataset file or ``test_samples.txt`` is
                missing from the dataset directory.
        """
        dst = self._get_dataset_node().dst
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # datasets this pipeline produced itself.
        with open(os.path.join(dst, self.__dataset_filename()), 'rb') as f:
            data = pickle.load(f)

        vocab_processor = VocabularyProcessor()
        vocab_processor.restore(os.path.join(dst, self.__vocab_filename()))
        data['vocab_processor'] = vocab_processor

        # some fixed testing samples in 'test_samples.txt'
        with open(os.path.join(dst, 'test_samples.txt'), 'r') as f:
            # Strip only the line terminator. Slicing with [:-1] would also
            # cut the last character of a final line with no trailing newline.
            texts = [line.rstrip('\n') for line in f]
            questions = [_tokenize(text) for text in texts]
            questions = [self.__flatten_tokenized_utterance(q, reverse=True) for q in questions]
            questions = [_vecterize(q, vocab_processor) for q in questions]
        # Samples whose question flattened to nothing are dropped.
        test_samples = [((q, []), text) for q, text in zip(questions, texts) if q]
        data['test_samples'] = test_samples

        print('Loaded dataset: {} words, {} training samples, {} testing samples'.format(
            vocab_processor.size(), len(data['train']), len(data['test'])))

        return data



In [48]:
# _conversation_to_qa_pairs()

vocab_processor.py

In [51]:
import pickle

# Reserved vocabulary entries. VocabularyProcessor.__init__ registers them
# before any corpus word, in the order GO, EOS, PAD, UNKNOWN.
GO_TOKEN = '<go>'
EOS_TOKEN = '<eos>'
PAD_TOKEN = '<pad>'
UNKNOWN_TOKEN = '<unk>'

# Set used by VocabularyProcessor.fit() to skip the reserved entries when it
# re-adds words that meet the frequency threshold.
SPECIAL_TOKENS = {
    GO_TOKEN, EOS_TOKEN, PAD_TOKEN, UNKNOWN_TOKEN
}


In [52]:
class VocabularyProcessor:
    """Case-insensitive word <-> id vocabulary with frequency counts.

    Words are lower-cased on every entry point. The four special tokens are
    registered at construction time, so they occupy ids 0 to 3.
    """

    def __init__(self, min_frequency: int = 0):
        """Create a vocabulary holding only the special tokens.

        Args:
            min_frequency: Threshold applied by ``fit``; words seen fewer
                times are dropped.
        """
        self._word2id = {}
        self._id2word = {}
        self._word_frequency = {}
        self._min_frequency = min_frequency

        for token in (GO_TOKEN, EOS_TOKEN, PAD_TOKEN, UNKNOWN_TOKEN):
            self.add(token)

    def size(self) -> int:
        """Return the number of distinct words, special tokens included."""
        return len(self._word2id)

    def add(self, word: str, frequency: int = 1):
        """Register *word* or, if already present, raise its count by
        *frequency*. New words receive the next free id."""
        key = word.lower()
        if key not in self._word2id:
            new_id = len(self._word2id)
            self._word2id[key] = new_id
            self._id2word[new_id] = key
            self._word_frequency[key] = frequency
            return
        self._word_frequency[key] += frequency

    def fit(self):
        """Rebuild the vocabulary, keeping only sufficiently frequent words.

        A fresh vocabulary is filled with every non-special word whose count
        is at least ``min_frequency``; its state then replaces this object's
        state, with ``min_frequency`` preserved.
        """
        rebuilt = VocabularyProcessor()
        for token, count in self._word_frequency.items():
            if token in SPECIAL_TOKENS:
                continue
            if count >= self._min_frequency:
                rebuilt.add(token, count)

        # The fresh instance was built with the default threshold, so keep
        # ours across the state swap.
        threshold = self._min_frequency
        self.__dict__.update(rebuilt.__dict__)
        self._min_frequency = threshold

    def save(self, filename: str):
        """Pickle the instance state to *filename*."""
        with open(filename, 'wb') as sink:
            pickle.dump(self.__dict__, sink, -1)

    def restore(self, filename: str):
        """Load instance state previously written by ``save``."""
        with open(filename, 'rb') as source:
            state = pickle.load(source)
        self.__dict__.update(state)

    def word2id(self, word: str) -> int:
        """Return the id of *word*, or the unknown-token id if absent."""
        key = word.lower()
        fallback = self._word2id[UNKNOWN_TOKEN]
        return self._word2id.get(key, fallback)

    def id2word(self, word_id: int) -> str:
        """Return the word stored under *word_id* (KeyError if absent)."""
        return self._id2word[word_id]

    def word_frequency(self, word: str) -> int:
        """Return the recorded count of *word*, or 0 when it is not known."""
        key = word.lower()
        if key in self._word2id:
            return self._word_frequency[key]
        return 0

    def is_unknown(self, word: str) -> bool:
        """Return True when *word* is absent or is the unknown token itself."""
        key = word.lower()
        return not (key in self._word2id and key != UNKNOWN_TOKEN)


Models

auto_encoder.py

In [53]:
from model.seq import Seq2SeqModel
from model.tf_rnn_helper import *
from model.batch import Batch
from tqdm import tqdm
import math

  from ._conv import register_converters as _register_converters


In [54]:
class AutoEncoderModel(Seq2SeqModel):
    """Seq2seq model trained jointly with question and answer auto-encoders.

    On top of the parent's placeholders this class adds a question decoder
    (``q_dec``/``q_target``/``q_weights``) and an answer encoder (``a_enc``),
    builds one auto-encoder for questions and one for answers, and replaces
    the parent's ``seq_dec_outputs`` with a decoder fed from a projection of
    the question encoder states.
    """

    def __init__(self, name: str = 'auto'):
        """Initialise all graph handles to None.

        NOTE(review): ``name`` is neither stored nor forwarded here, yet
        ``_build_seq2seq`` reads ``self.name`` — presumably the base class
        sets it; confirm.
        """
        super(AutoEncoderModel, self).__init__()

        # Placeholders added by this class (filled in _build_placeholders).
        self.q_dec = None
        self.q_target = None
        self.q_weights = None
        self.a_enc = None

        # Decoder outputs of the question and answer auto-encoders.
        self.q_dec_outputs = None
        self.a_dec_outputs = None

        # Loss / train ops of the two auto-encoders.
        self.q_loss_op = None
        self.q_train_op = None
        self.a_loss_op = None
        self.a_train_op = None

    def _build_placeholders(self):
        """Create the parent's placeholders plus the auto-encoder ones.

        One placeholder of shape ``[None]`` is created per time step:
        ``dec_length`` steps for the question decoder inputs, targets and
        weights, and ``enc_length`` steps for the answer encoder inputs.
        """
        super(AutoEncoderModel, self)._build_placeholders()

        with tf.name_scope('placeholder'):
            with tf.name_scope('q'):
                self.q_dec = [tf.placeholder(tf.int32, [None, ], name='decoders') for _ in
                              range(self.dec_length)]
                self.q_target = [tf.placeholder(tf.int32, [None, ], name='target') for _ in
                                 range(self.dec_length)]
                self.q_weights = [tf.placeholder(tf.float32, [None, ], name='weights') for _ in
                                  range(self.dec_length)]
            with tf.name_scope('a'):
                self.a_enc = [tf.placeholder(tf.int32, [None, ], name='encoders') for _ in
                              range(self.enc_length)]

    def _build_seq2seq(self, mode: str):
        """Build both auto-encoders and the question-to-answer decoder.

        Args:
            mode: ``'train'`` or ``'test'``; in test mode the decoders are
                built with ``feed_previous=True``.
        """
        # The parent call creates self.output_projection (when sampled
        # softmax applies) and a first self.seq_dec_outputs, which is
        # overwritten at the end of this method.
        super(AutoEncoderModel, self)._build_seq2seq(mode)

        with tf.name_scope('seq2seq'):
            n_layers = self.hyperparameter('n_layers')

            # q_cell is shared by the question encoder and decoder; a_cell by
            # the answer encoder, answer decoder and the seq decoder.
            q_cell = tf.contrib.rnn.MultiRNNCell(
                [self._create_rnn_cell(mode) for _ in range(n_layers)])
            a_cell = tf.contrib.rnn.MultiRNNCell(
                [self._create_rnn_cell(mode) for _ in range(n_layers)])

            embedding_size = self.hyperparameter('embedding_size')
            output_projection = self.output_projection.variables() if \
                self.output_projection else None

            # NOTE(review): embedding_rnn_encoder / embedding_rnn_decoder /
            # states_projection / embedding_attention_decoder are presumably
            # supplied by the star import of model.tf_rnn_helper — not
            # visible in this file.

            # Question auto-encoder: encode q_enc, decode back through q_dec.
            q_states, q_attn_states = embedding_rnn_encoder(
                self.q_enc,
                q_cell,
                self._vocab_size(),
                embedding_size,
                scope='q_encoder')
            self.q_dec_outputs, _ = embedding_rnn_decoder(
                self.q_dec,
                q_states,
                q_cell,
                self._vocab_size(),
                embedding_size,
                output_projection=output_projection,
                feed_previous=mode == 'test',
                scope='q_decoder')

            # Answer auto-encoder: encode a_enc, decode back through a_dec.
            a_states, _ = embedding_rnn_encoder(
                self.a_enc,
                a_cell,
                self._vocab_size(),
                embedding_size,
                scope='a_encoder')
            self.a_dec_outputs, _ = embedding_rnn_decoder(
                self.a_dec,
                a_states,
                a_cell,
                self._vocab_size(),
                embedding_size,
                output_projection=output_projection,
                feed_previous=mode == 'test',
                scope='a_decoder')

            # Project the question encoder states before handing them to the
            # answer-side decoder.
            states_p = states_projection(
                q_states,
                self.hyperparameter('hidden_size'),
                activate_fn=tf.nn.tanh,
                dtype=tf.float32)

            # The model name selects the decoder variant.
            if 'attn' in self.name:  # with attention
                self.seq_dec_outputs, _ = embedding_attention_decoder(
                    self.a_dec,
                    states_p,
                    q_attn_states,
                    a_cell,
                    self._vocab_size(),
                    embedding_size,
                    output_projection=output_projection,
                    feed_previous=mode == 'test',
                    scope='seq_decoder')
            else:
                self.seq_dec_outputs, _ = embedding_rnn_decoder(
                    self.a_dec,
                    states_p,
                    a_cell,
                    self._vocab_size(),
                    embedding_size,
                    output_projection=output_projection,
                    feed_previous=mode == 'test',
                    scope='seq_decoder')

    def _build_optimize_ops(self):
        """Add loss and train ops for the question and answer auto-encoders.

        The parent call creates ``self.optimizer`` and the seq loss/train
        ops; the two extra train ops reuse that optimizer.
        """
        super(AutoEncoderModel, self)._build_optimize_ops()

        # NOTE(review): the 4th positional argument (vocabulary size) looks
        # like it lands on sequence_loss's `average_across_timesteps`
        # parameter — confirm against the installed TensorFlow signature.
        self.q_loss_op = tf.contrib.legacy_seq2seq.sequence_loss(
            self.q_dec_outputs,
            self.q_target,
            self.q_weights,
            self._vocab_size(),
            softmax_loss_function=self._sampled_softmax_fn if self.output_projection else None)
        self.a_loss_op = tf.contrib.legacy_seq2seq.sequence_loss(
            self.a_dec_outputs,
            self.a_target,
            self.a_weights,
            self._vocab_size(),
            softmax_loss_function=self._sampled_softmax_fn if self.output_projection else None)

        self.q_train_op = self.optimizer.minimize(self.q_loss_op)
        self.a_train_op = self.optimizer.minimize(self.a_loss_op)

    def _train_step(self, batch: Batch):
        """Run one training step: both auto-encoder updates, then the seq update.

        Args:
            batch: Time-major batch whose sequences are fed step by step into
                the per-step placeholders.
        """
        feed_dict = {}
        for i in range(self.enc_length):
            feed_dict[self.q_enc[i]] = batch.q_enc_seq[i]
            feed_dict[self.a_enc[i]] = batch.a_enc_seq[i]
        for i in range(self.dec_length):
            feed_dict[self.q_dec[i]] = batch.q_dec_seq[i]
            feed_dict[self.q_target[i]] = batch.q_target_seq[i]
            feed_dict[self.q_weights[i]] = batch.q_weights[i]
            feed_dict[self.a_dec[i]] = batch.a_dec_seq[i]
            feed_dict[self.a_target[i]] = batch.a_target_seq[i]
            feed_dict[self.a_weights[i]] = batch.a_weights[i]

        # First session call: update both auto-encoders.
        _, q_loss, _, a_loss = self.sess.run(
            [self.q_train_op, self.q_loss_op, self.a_train_op, self.a_loss_op],
            feed_dict=feed_dict)

        # Second session call: update the question-to-answer path.
        _, loss = self.sess.run([self.seq_train_op, self.seq_loss_op], feed_dict=feed_dict)

        # Print training status
        if self.global_step % self.training_parameter('print_interval') == 0:
            # Losses of 300 or more are reported as infinite perplexity
            # instead of being exponentiated.
            perplexity = math.exp(float(loss) if loss < 300 else float('inf'))
            tqdm.write(
                '----- Step %d -- Q Loss %.2f -- A Loss %.2f -- Loss %.2f -- Perplexity %.2f' % (
                    self.global_step, q_loss, a_loss, loss, perplexity))


base.py

In [55]:
from mlbootstrap.model import BasicModel
from model.batch import Batch
from typing import List, Tuple
from process.vocab_processor import VocabularyProcessor, GO_TOKEN, EOS_TOKEN, PAD_TOKEN
import random
import tensorflow as tf
import datetime
from tqdm import tqdm
import yaml
import os

In [56]:
_MODEL_STATUS_FILENAME = 'model_status.yaml'


class BasicChatbotModel(BasicModel):
    """Shared training loop, checkpointing and batching for chatbot models.

    Subclasses implement ``_build_graph`` and ``_train_step``. Model state
    lives under ``<config model.save_path>/<self.name>``: TensorFlow
    checkpoints plus a YAML status file holding the global step and the
    hyperparameters in use.
    """

    def __init__(self, name: str = 'basic_chatbot_model'):
        """Initialise run-time state.

        NOTE(review): ``name`` is neither stored nor forwarded here, while
        other methods read ``self.name`` — presumably the base class sets it;
        confirm.
        """
        super(BasicChatbotModel, self).__init__()

        # Encoder / decoder lengths, set by _restore_model_settings.
        self.enc_length = None
        self.dec_length = None

        self.sess: tf.Session = None
        self.saver: tf.train.Saver = None

        self.global_step = 0

    def train(self):
        """Restore settings and checkpoint, then train for the configured epochs.

        A checkpoint is saved whenever ``global_step`` is a multiple of the
        ``save_interval`` training parameter.
        """
        self._restore_model_settings()
        self._build_graph('train')
        self._create_session()
        self._restore_checkpoint()

        print('Start training ...')
        epoch = self.training_parameter('epoch')

        for e in range(1, epoch + 1):
            print()
            learning_rate = self.training_parameter('learning_rate')
            print(
                '----- Epoch {}/{} ; (learning_rate={}) -----'.format(e, epoch, learning_rate))

            tic = datetime.datetime.now()
            batches = self._get_batches('train')
            for batch in tqdm(batches, desc='Training'):
                self.global_step += 1
                self._train_step(batch)

                if self.global_step % self.training_parameter('save_interval') == 0:
                    self._save_checkpoint()

            toc = datetime.datetime.now()
            print('Epoch finished in {}'.format(toc - tic))

    def _restore_model_settings(self):
        """Load the saved global step and hyperparameters, if any, and derive
        the encoder/decoder lengths from ``max_sent_length``."""
        path = os.path.join(self._config['model']['save_path'], self.name)
        os.makedirs(path, exist_ok=True)
        status_path = os.path.join(path, _MODEL_STATUS_FILENAME)

        if os.path.exists(status_path):
            with open(status_path, 'r') as stream:
                # safe_load: yaml.load() without an explicit Loader can build
                # arbitrary objects and is rejected by PyYAML >= 6.
                model_status = yaml.safe_load(stream)
            self.global_step = model_status['global_step']
            self._config['hyperparameter'] = model_status['hyperparameter']

        max_sent_length = self.hyperparameter('max_sent_length')
        self.enc_length = max_sent_length
        # Two extra decoder steps: room for the GO and EOS tokens added in
        # _create_batch.
        self.dec_length = max_sent_length + 2

    def _build_graph(self, mode: str):
        """Build the TensorFlow graph for *mode*; implemented by subclasses."""
        raise NotImplementedError

    def _create_session(self):
        """Create the session (soft placement, GPU memory growth) and saver."""
        self.sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)
        ))
        self.saver = tf.train.Saver()

    def _restore_checkpoint(self):
        """Restore the latest checkpoint, or initialise variables on a fresh run.

        Raises:
            SystemExit: With status 1 when a status file exists but no
                checkpoint can be found next to it.
        """
        path = os.path.join(self._config['model']['save_path'], self.name)
        os.makedirs(path, exist_ok=True)
        status_path = os.path.join(path, _MODEL_STATUS_FILENAME)

        if not os.path.exists(status_path):
            # No status file: treat as a brand-new model.
            self.sess.run(tf.global_variables_initializer())
            return

        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint found')
            # Same exit status as before, without relying on the `exit`
            # helper that only the `site` module injects.
            raise SystemExit(1)

    def _save_checkpoint(self):
        """Write the status file and a TensorFlow checkpoint for this step."""
        tqdm.write("Checkpoint reached: saving model (don't stop the run) ...")

        path = os.path.join(self._config['model']['save_path'], self.name)
        os.makedirs(path, exist_ok=True)
        self._save_model_status(path)

        model_path = os.path.join(path, 'model')
        self.saver.save(self.sess, model_path, global_step=self.global_step)
        tqdm.write('Model saved.')

    def _save_model_status(self, path: str):
        """Dump the hyperparameters and global step as YAML into *path*."""
        model_status = {
            'hyperparameter': self._config['hyperparameter'],
            'global_step': self.global_step}
        status_path = os.path.join(path, _MODEL_STATUS_FILENAME)
        with open(status_path, 'w') as stream:
            yaml.dump(model_status, stream)

    def _train_step(self, batch: Batch):
        """Run one optimisation step on *batch*; implemented by subclasses."""
        raise NotImplementedError

    def _vocab_size(self):
        """Return the vocabulary size of the loaded dataset."""
        return self.dataset['vocab_processor'].size()

    def _get_batches(self, mode: str) -> List[Batch]:
        """Split ``self.dataset[mode]`` into batches of ``batch_size`` samples.

        In ``'train'`` mode the sample list is shuffled in place first. The
        last batch may be smaller than ``batch_size``.
        """
        if mode == 'train':
            random.shuffle(self.dataset[mode])
        samples = self.dataset[mode]
        batch_size = self.training_parameter('batch_size')
        samples_list = [samples[i:min(i + batch_size, len(samples))] for i in
                        range(0, len(samples), batch_size)]
        return [self._create_batch(samples) for samples in samples_list]

    def _create_batch(self, samples: List[Tuple[List, List]]) -> Batch:
        """Pad the samples and arrange them time-major in a ``Batch``.

        Args:
            samples: Non-empty list of ``(question_ids, answer_ids)`` pairs.

        Returns:
            A ``Batch`` in which every field is a list indexed by time step,
            each entry holding that step's value for every sample.
        """
        batch = Batch()
        vocab_processor: VocabularyProcessor = self.dataset['vocab_processor']
        go_id = vocab_processor.word2id(GO_TOKEN)
        eos_id = vocab_processor.word2id(EOS_TOKEN)
        pad_id = vocab_processor.word2id(PAD_TOKEN)

        # Defined once, not once per sample as before.
        def _pad(arr, pad_token, length, from_left=False):
            # Sequences already at or above the target length are returned
            # unchanged (never truncated).
            if len(arr) >= length:
                return arr
            padding = [pad_token] * (length - len(arr))
            return padding + arr if from_left else arr + padding

        def _transpose(arr):
            # Batch-major -> time-major; the first row fixes the step count.
            return [[row[step] for row in arr] for step in range(len(arr[0]))]

        for q, a in samples:
            # Encoder inputs are reversed and left-padded; decoder inputs are
            # GO + tokens + EOS; targets are tokens + EOS.
            # NOTE(review): the weights cover len(q) / len(a) positions only,
            # so the EOS position of each target has weight 0.0 — confirm
            # this is intended.
            batch.q_enc_seq.append(
                _pad(list(reversed(q)), pad_id, self.enc_length, from_left=True))
            batch.q_dec_seq.append(_pad([go_id] + q + [eos_id], pad_id, self.dec_length))
            batch.q_target_seq.append(_pad(q + [eos_id], pad_id, self.dec_length))
            batch.q_weights.append(_pad([1.0] * len(q), 0.0, self.dec_length))

            batch.a_enc_seq.append(
                _pad(list(reversed(a)), pad_id, self.enc_length, from_left=True))
            batch.a_dec_seq.append(_pad([go_id] + a + [eos_id], pad_id, self.dec_length))
            batch.a_target_seq.append(_pad(a + [eos_id], pad_id, self.dec_length))
            batch.a_weights.append(_pad([1.0] * len(a), 0.0, self.dec_length))

        batch.q_enc_seq = _transpose(batch.q_enc_seq)
        batch.q_dec_seq = _transpose(batch.q_dec_seq)
        batch.q_target_seq = _transpose(batch.q_target_seq)
        batch.q_weights = _transpose(batch.q_weights)
        batch.a_enc_seq = _transpose(batch.a_enc_seq)
        batch.a_dec_seq = _transpose(batch.a_dec_seq)
        batch.a_target_seq = _transpose(batch.a_target_seq)
        batch.a_weights = _transpose(batch.a_weights)

        return batch


batch.py

In [57]:
class Batch:
    """Container for one mini-batch of question/answer sequences.

    Every field starts as its own empty list: encoder inputs, decoder inputs,
    targets and loss weights, once for the question side (``q_*``) and once
    for the answer side (``a_*``).
    """

    def __init__(self):
        # Question-side fields.
        self.q_enc_seq, self.q_dec_seq, self.q_target_seq, self.q_weights = (
            [], [], [], [])
        # Answer-side fields.
        self.a_enc_seq, self.a_dec_seq, self.a_target_seq, self.a_weights = (
            [], [], [], [])

seq.py

In [58]:
from model.base import BasicChatbotModel
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import math
from model.batch import Batch
from typing import List
from process.vocab_processor import VocabularyProcessor, GO_TOKEN, EOS_TOKEN, PAD_TOKEN
import string
import os
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json

In [59]:
class Projection:
    """Affine map ``x @ W + b`` whose variables live in a variable scope."""

    def __init__(self, shape: List[int], scope=None, dtype=None):
        """Create the ``weight`` and ``bias`` variables.

        Args:
            shape: Two-element weight shape; the bias takes ``shape[1:]``.
            scope: Variable scope in which the variables are created; also
                used as the name scope when the projection is applied.
            dtype: Passed to ``tf.variable_scope``.
        """
        assert len(shape) == 2
        self.scope = scope
        with tf.variable_scope(scope, dtype=dtype):
            self.W = tf.get_variable('weight', shape)
            self.b = tf.get_variable(
                'bias', shape[1:], initializer=tf.constant_initializer())

    def variables(self):
        """Return the ``(W, b)`` pair."""
        return (self.W, self.b)

    def __call__(self, x):
        """Apply the projection to *x* inside the stored name scope."""
        with tf.name_scope(self.scope):
            projected = x @ self.W
            return projected + self.b


class Seq2SeqModel(BasicChatbotModel):
    def __init__(self, name: str = 'seq'):
        """Initialise every graph handle to None; they are filled in when the
        graph is built."""
        super().__init__()

        # Placeholders: question encoder inputs, answer decoder inputs,
        # targets and loss weights.
        self.q_enc = self.a_dec = self.a_target = self.a_weights = None

        # Raw decoder outputs of the seq2seq network.
        self.seq_dec_outputs = None

        # Present only when sampled softmax is in use.
        self.output_projection: Projection = None

        # Inference outputs, optimizer, and the seq loss / train ops.
        self.outputs = self.optimizer = None
        self.seq_loss_op = self.seq_train_op = None

    def _build_graph(self, mode: str):
        """Build placeholders and the seq2seq network, then the optimisation
        ops (``mode == 'train'``) or the output ops (any other mode)."""
        print('Building graph ...')
        self._build_placeholders()
        self._build_seq2seq(mode)

        final_stage = (self._build_optimize_ops if mode == 'train'
                       else self._build_outputs_op)
        final_stage()

    def _build_placeholders(self):
        """Create one ``[None]``-shaped placeholder per time step.

        ``q_enc`` gets ``enc_length`` steps; ``a_dec``, ``a_target`` and
        ``a_weights`` get ``dec_length`` steps each.
        """
        def per_step(dtype, label, steps):
            # One placeholder per time step, all sharing the same base name.
            return [tf.placeholder(dtype, [None, ], name=label) for _ in range(steps)]

        with tf.name_scope('placeholder'):
            with tf.name_scope('q'):
                self.q_enc = per_step(tf.int32, 'encoders', self.enc_length)
            with tf.name_scope('a'):
                self.a_dec = per_step(tf.int32, 'decoders', self.dec_length)
                self.a_target = per_step(tf.int32, 'target', self.dec_length)
                self.a_weights = per_step(tf.float32, 'weights', self.dec_length)

    def _build_seq2seq(self, mode: str):
        """Build the embedding seq2seq network and store its decoder outputs.

        Args:
            mode: ``'train'`` or ``'test'``; in test mode the decoder is built
                with ``feed_previous=True``.
        """
        with tf.name_scope('seq2seq'):
            # An output projection is created only when n_sampled is positive
            # and smaller than the vocabulary; it is what enables the sampled
            # softmax loss elsewhere in this class.
            if 0 < self.hyperparameter('n_sampled') < self._vocab_size():
                vocab_size = self._vocab_size()
                hidden_size = self.hyperparameter('hidden_size')
                self.output_projection = Projection(
                    [hidden_size, vocab_size], scope='output_projection', dtype=tf.float32)

            n_layers = self.hyperparameter('n_layers')
            cell = tf.contrib.rnn.MultiRNNCell(
                [self._create_rnn_cell(mode) for _ in range(n_layers)])

            embedding_size = self.hyperparameter('embedding_size')
            # (W, b) tuple when a projection exists, otherwise None.
            output_projection = self.output_projection.variables() if \
                self.output_projection else None

            # The vocabulary size is passed twice: once as the encoder symbol
            # count and once as the decoder symbol count.
            self.seq_dec_outputs, _ = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
                self.q_enc,
                self.a_dec,
                cell,
                self._vocab_size(),
                self._vocab_size(),
                embedding_size=embedding_size,
                output_projection=output_projection,
                feed_previous=mode == 'test')

    def _create_rnn_cell(self, mode: str):
        """Return a ``BasicLSTMCell`` of ``hidden_size`` units, wrapped with
        output dropout when *mode* is ``'train'``."""
        lstm = tf.contrib.rnn.BasicLSTMCell(self.hyperparameter('hidden_size'))
        if mode != 'train':
            return lstm

        # Dropout applies to the cell outputs only; inputs are kept intact.
        return tf.contrib.rnn.DropoutWrapper(
            lstm,
            input_keep_prob=1.0,
            output_keep_prob=self.hyperparameter('dropout'))

    def _build_optimize_ops(self):
        """Create the Adam optimizer and the seq2seq loss and train ops."""
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.training_parameter('learning_rate'))

        # Sampled softmax is used only when an output projection was built.
        # NOTE(review): the 4th positional argument (vocabulary size) looks
        # like it lands on sequence_loss's `average_across_timesteps`
        # parameter — confirm against the installed TensorFlow signature.
        self.seq_loss_op = tf.contrib.legacy_seq2seq.sequence_loss(
            self.seq_dec_outputs,
            self.a_target,
            self.a_weights,
            self._vocab_size(),
            softmax_loss_function=self._sampled_softmax_fn if self.output_projection else None
        )
        self.seq_train_op = self.optimizer.minimize(self.seq_loss_op)

    def _build_outputs_op(self):
        """Set ``self.outputs`` to the per-step argmax over the decoder
        outputs, applying the output projection first when one exists."""
        if self.output_projection:
            per_step = [self.output_projection(step) for step in self.seq_dec_outputs]
        else:
            per_step = self.seq_dec_outputs

        self.outputs = [tf.argmax(step, axis=1) for step in per_step]

    def _sampled_softmax_fn(self, labels, logits):
        """Sampled-softmax loss built on the output projection's variables.

        Args:
            labels: Target ids; reshaped to a column before use.
            logits: Decoder outputs for one step; cast to float32 and passed
                as the ``inputs`` of the sampled softmax.

        Returns:
            The tensor produced by ``tf.nn.sampled_softmax_loss``.
        """
        label_column = tf.reshape(labels, [-1, 1])
        # The projection weight is transposed before being handed to the loss.
        transposed_weight = tf.transpose(self.output_projection.W)
        decoder_inputs = tf.cast(logits, tf.float32)

        return tf.nn.sampled_softmax_loss(
            weights=transposed_weight,
            biases=self.output_projection.b,
            labels=label_column,
            inputs=decoder_inputs,
            num_sampled=self.hyperparameter('n_sampled'),
            num_classes=self._vocab_size())

    def _train_step(self, batch: Batch):
        """Feed one batch, run the seq train op, and periodically log the loss.

        Args:
            batch: Time-major batch; step ``i`` of each sequence feeds the
                ``i``-th placeholder.
        """
        feed_dict = {
            self.q_enc[step]: batch.q_enc_seq[step]
            for step in range(self.enc_length)}
        for step in range(self.dec_length):
            feed_dict.update({
                self.a_dec[step]: batch.a_dec_seq[step],
                self.a_target[step]: batch.a_target_seq[step],
                self.a_weights[step]: batch.a_weights[step]})

        _, loss = self.sess.run([self.seq_train_op, self.seq_loss_op], feed_dict=feed_dict)

        # Only report every `print_interval` steps.
        if self.global_step % self.training_parameter('print_interval') != 0:
            return

        # Losses of 300 or more are reported as infinite perplexity.
        exponent = float(loss) if loss < 300 else float('inf')
        tqdm.write('----- Step %d -- Loss %.2f -- Perplexity %.2f' % (
            self.global_step, loss, math.exp(exponent)))

    def evaluate(self):
        """Decode the test set and write the evaluation results.

        The graph is built in ``'test'`` mode, the latest checkpoint is
        restored, every test batch is decoded, and the collected outputs and
        reference targets go to ``_write_evaluation_results``.
        """
        self._restore_model_settings()
        self._build_graph('test')
        self._create_session()
        self._restore_checkpoint()

        print('Start testing ...')
        batches = self._get_batches('test')
        # The fixed samples are read here but only the commented-out writers
        # below would consume them.
        fixed_samples = self.dataset['test_samples']
        test_samples = [qa for qa, _ in fixed_samples]
        test_samples_text = [text for _, text in fixed_samples]
        # batches = [self._create_batch(test_samples)]
        all_inputs, all_outputs, all_references = [], [], []

        for batch in tqdm(batches, desc='Testing'):
            feed_dict = {
                self.q_enc[step]: batch.q_enc_seq[step]
                for step in range(self.enc_length)}
            # Only the first decoder input is fed.
            feed_dict[self.a_dec[0]] = batch.a_dec_seq[0]

            [outputs] = self.sess.run([self.outputs], feed_dict=feed_dict)
            # Batches are time-major; transpose so each row is one sample.
            all_outputs.extend(np.array(outputs).T.tolist())
            all_inputs.extend(np.array(batch.q_enc_seq).T.tolist())
            all_references.extend(np.array(batch.a_target_seq).T.tolist())

        # self._write_test_samples_literal(test_samples_text, all_outputs)
        # self._write_test_samples_results(all_outputs)
        # self._wirte_test_literal(all_inputs, all_outputs)
        self._write_evaluation_results(all_outputs, all_references)

    def _write_test_samples_results(self, outputs: List[List[int]]):
        """Write distinct 1-, 2- and 3-gram counts of the outputs to JSON.

        The id sequences are converted to tokens with ``_ids2tokens`` and the
        number of distinct n-grams over all sequences is stored under the keys
        ``'1-gram'``, ``'2-gram'`` and ``'3-gram'`` in
        ``<save_path>/<name>/test_samples_results.json``.

        :param outputs: id sequences produced by the model, one per sample.
        """
        decoded = [self._ids2tokens(ids) for ids in outputs]

        distinct = {1: set(), 2: set(), 3: set()}
        for size, seen in distinct.items():
            for tokens in decoded:
                # The last valid start index is len(tokens) - size, so the
                # final n-gram of each sequence is counted too (the previous
                # `i + g >= len(tokens)` check dropped it).
                for start in range(len(tokens) - size + 1):
                    seen.add(tuple(tokens[start:start + size]))
        results = {'{}-gram'.format(size): len(seen) for size, seen in distinct.items()}

        path = os.path.join(self._config['model']['save_path'], self.name)
        result_path = os.path.join(path, 'test_samples_results.json')
        with open(result_path, 'w') as f:
            json.dump(results, f, indent=2, sort_keys=True)

    def _write_evaluation_results(self, outputs: List[List[int]], references: List[List[int]] = None):
        """Score the outputs and dump the metrics to ``evaluation_results.json``.

        Metrics written to ``<save_path>/<name>/evaluation_results.json``:

        * ``BLEU-1`` .. ``BLEU-4``: average smoothed sentence BLEU of each
          output against its reference (only when ``references`` is given);
        * ``1-gram`` .. ``3-gram``: number of distinct n-grams over all outputs.

        :param outputs: id sequences produced by the model, one per sample.
        :param references: target id sequences aligned with ``outputs``. When
            ``None`` (the default) the BLEU scores are skipped instead of
            failing on iteration over ``None``.
        """
        results = {}
        outputs = [self._ids2tokens(tokens) for tokens in outputs]

        if references is not None:
            references = [self._ids2tokens(tokens) for tokens in references]

            # One weight tuple per BLEU order (BLEU-1 .. BLEU-4).
            weights = [
                (1, 0, 0, 0),
                (0.5, 0.5, 0, 0),
                (0.33, 0.33, 0.33, 0),
                (0.25, 0.25, 0.25, 0.25)
            ]

            smoothing_fn = SmoothingFunction().method1
            pairs = tqdm(list(zip(outputs, references)), desc='Computing BLEU score')
            for order, w in enumerate(weights, start=1):
                scores = [
                    sentence_bleu([ref], pred, weights=w, smoothing_function=smoothing_fn)
                    for pred, ref in pairs]
                results['BLEU-{}'.format(order)] = np.average(scores)

        distinct = {1: set(), 2: set(), 3: set()}
        for size, seen in distinct.items():
            for tokens in outputs:
                # The last valid start index is len(tokens) - size, so the
                # final n-gram of each sequence is counted too (the previous
                # `i + g >= len(tokens)` check dropped it).
                for start in range(len(tokens) - size + 1):
                    seen.add(tuple(tokens[start:start + size]))
        results.update({'{}-gram'.format(size): len(seen) for size, seen in distinct.items()})

        path = os.path.join(self._config['model']['save_path'], self.name)
        result_path = os.path.join(path, 'evaluation_results.json')
        with open(result_path, 'w') as f:
            json.dump(results, f, indent=2, sort_keys=True)

    def _wirte_test_literal(self, inputs: List[List[int]], outputs: List[List[int]]):
        """Write input/output pairs as readable text to ``test_samples.txt``.

        Inputs are converted to tokens in reversed order, outputs in their
        given order; both are then turned into literal strings and written one
        pair per line, separated by ``' +++$+++ '``.
        """
        question_tokens = [self._ids2tokens(ids, reverse=True) for ids in inputs]
        answer_tokens = [self._ids2tokens(ids) for ids in outputs]
        questions = [self._tokens2literal(tokens) for tokens in question_tokens]
        answers = [self._tokens2literal(tokens) for tokens in answer_tokens]

        model_dir = os.path.join(self._config['model']['save_path'], self.name)
        with open(os.path.join(model_dir, 'test_samples.txt'), 'w') as out_file:
            for question, answer in zip(questions, answers):
                out_file.write(question + ' +++$+++ ' + answer + '\n')

    def _write_test_samples_literal(self, text: List[str], outputs: List[List[int]]):
        """Write each question text next to its decoded answer.

        ``outputs`` is converted to literal strings and paired with ``text``;
        every pair becomes one ``question +++$+++ answer`` line in
        ``example_questions.txt`` inside the model's save directory.
        """
        answer_tokens = [self._ids2tokens(ids) for ids in outputs]
        answers = [self._tokens2literal(tokens) for tokens in answer_tokens]

        model_dir = os.path.join(self._config['model']['save_path'], self.name)
        with open(os.path.join(model_dir, 'example_questions.txt'), 'w') as out_file:
            for question, answer in zip(text, answers):
                out_file.write(question + ' +++$+++ ' + answer + '\n')

    def _ids2tokens(self, seq: List[int], reverse=False):
        if not seq:
            return ''

        if reverse:
            seq = reversed(seq)

        vocab_processor: VocabularyProcessor = self.dataset['vocab_processor']
        sent = []
        for word_id in seq:
            word = vocab_processor.id2word(word_id)
            if word == EOS_TOKEN:
                break
            elif word not in [GO_TOKEN, PAD_TOKEN]:
                sent.append(word)

        return sent

    @staticmethod
    def _tokens2literal(sent: List[str]):
        text = ''.join(
            [' ' + t if not t.startswith("'") and t not in string.punctuation else t for t in sent])
        return text.strip().capitalize()

seq_attn.py

In [60]:
from model.seq import Seq2SeqModel
import tensorflow as tf

In [61]:
class Seq2SeqAttentionModel(Seq2SeqModel):
    """Seq2Seq model whose decoder outputs come from TensorFlow's legacy
    ``embedding_attention_seq2seq``."""

    def __init__(self, name: str = 'seq-attn'):
        super().__init__(name)

    def _build_seq2seq(self, mode: str):
        """Run the parent graph construction, then set ``self.seq_dec_outputs``
        from an attention seq2seq over a multi-layer RNN cell.

        Previous decoder outputs are fed back as inputs only when ``mode`` is
        ``'test'``.
        """
        super()._build_seq2seq(mode)

        with tf.name_scope('seq2seq'):
            layer_count = self.hyperparameter('n_layers')
            layers = [self._create_rnn_cell(mode) for _ in range(layer_count)]
            cell = tf.contrib.rnn.MultiRNNCell(layers)

            embedding_size = self.hyperparameter('embedding_size')
            if self.output_projection:
                output_projection = self.output_projection.variables()
            else:
                output_projection = None

            feed_previous = mode == 'test'
            self.seq_dec_outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                self.q_enc,
                self.a_dec,
                cell,
                self._vocab_size(),
                self._vocab_size(),
                embedding_size,
                output_projection=output_projection,
                feed_previous=feed_previous)

tf_rnn_helper.py

In [62]:
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import rnn
from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import \
    embedding_rnn_decoder as tf_embedding_rnn_decoder
from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import \
    embedding_attention_decoder as tf_embedding_attention_decoder
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMStateTuple
from tensorflow.python.ops import array_ops
import copy

In [63]:
def embedding_rnn_encoder(encoder_inputs,
                          cell,
                          num_symbols,
                          embedding_size,
                          scope=None,
                          dtype=None):
    """Run an embedding-wrapped copy of ``cell`` over ``encoder_inputs``.

    Returns a pair ``(encoder_state, attention_states)``: the final state from
    ``rnn.static_rnn`` and the per-step outputs, each reshaped to
    ``[-1, 1, cell.output_size]`` and concatenated along axis 1.
    """
    with variable_scope.variable_scope(
            scope or "embedding_rnn_encoder", dtype=dtype) as enc_scope:
        # Note that we use a deep copy of the original cell
        wrapped_cell = core_rnn_cell.EmbeddingWrapper(
            copy.deepcopy(cell),
            embedding_classes=num_symbols,
            embedding_size=embedding_size)
        step_outputs, final_state = rnn.static_rnn(
            wrapped_cell, encoder_inputs, dtype=enc_scope.dtype)

        attention_states = array_ops.concat(
            [array_ops.reshape(out, [-1, 1, cell.output_size]) for out in step_outputs],
            1)

        return final_state, attention_states

In [64]:
def embedding_rnn_decoder(decoder_inputs,
                          initial_state,
                          cell,
                          num_symbols,
                          embedding_size,
                          output_projection=None,
                          feed_previous=False,
                          scope=None):
    """Call TensorFlow's ``embedding_rnn_decoder`` inside a variable scope.

    When no ``output_projection`` is supplied, ``cell`` is first wrapped in an
    ``OutputProjectionWrapper`` sized to ``num_symbols``.
    """
    with variable_scope.variable_scope(scope or "embedding_rnn_decoder"):
        # Note that we use the original cell
        decoder_cell = cell
        if output_projection is None:
            decoder_cell = core_rnn_cell.OutputProjectionWrapper(cell, num_symbols)

        return tf_embedding_rnn_decoder(
            decoder_inputs,
            initial_state,
            decoder_cell,
            num_symbols,
            embedding_size,
            output_projection=output_projection,
            feed_previous=feed_previous)


In [65]:
def embedding_attention_decoder(decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                embedding_size,
                                num_heads=1,
                                output_size=None,
                                output_projection=None,
                                feed_previous=False,
                                scope=None,
                                initial_state_attention=False):
    """Call TensorFlow's ``embedding_attention_decoder`` inside a variable scope.

    When no ``output_projection`` is supplied, ``cell`` is wrapped in an
    ``OutputProjectionWrapper`` sized to ``num_symbols`` and ``output_size``
    is replaced by ``num_symbols``.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_decoder"):
        decoder_cell, decoder_output_size = cell, output_size
        if output_projection is None:
            decoder_cell = core_rnn_cell.OutputProjectionWrapper(cell, num_symbols)
            decoder_output_size = num_symbols

        return tf_embedding_attention_decoder(
            decoder_inputs,
            initial_state,
            attention_states,
            decoder_cell,
            num_symbols,
            embedding_size,
            num_heads=num_heads,
            output_size=decoder_output_size,
            output_projection=output_projection,
            feed_previous=feed_previous,
            initial_state_attention=initial_state_attention)



In [66]:
def create_projection_params(size, dtype=None, scope=None):
    """Get the ``weights`` (``size`` x ``size``) and ``bias`` (``size``) variables.

    The variables are requested inside ``scope`` (default
    ``"projection_params"``). An explicit ``dtype`` is set on the scope;
    otherwise the scope's own dtype is used.
    """
    with variable_scope.variable_scope(scope or "projection_params") as param_scope:
        if dtype is None:
            dtype = param_scope.dtype
        else:
            param_scope.set_dtype(dtype)

        weights = tf.get_variable('weights', (size, size), dtype=dtype)
        bias = tf.get_variable('bias', (size,), dtype=dtype)

        return weights, bias
def states_projection(states,
                      hidden_size,
                      activate_fn=None,
                      dtype=None,
                      scope=None):
    """Apply an affine projection, and optionally an activation, to RNN state(s).

    :param states: a single state (a tensor or an ``LSTMStateTuple``) or a
        plain ``tuple`` of states; each element of a tuple is projected in its
        own ``layer_<i>`` variable scope.
    :param hidden_size: size passed to ``create_projection_params`` for the
        square weight matrix and the bias.
    :param activate_fn: optional callable applied to every projected tensor.
    :param dtype: dtype for the projection variables; defaults to the dtype of
        the enclosing variable scope.
    :param scope: variable scope name, ``"states_projection"`` by default.
    :return: the projected state(s), in the same structure as ``states``.
    """
    with variable_scope.variable_scope(scope or "states_projection") as scope:
        if dtype is not None:
            scope.set_dtype(dtype)
        else:
            dtype = scope.dtype

        def project(state):
            # An LSTM state carries two tensors; project c and h separately.
            if isinstance(state, LSTMStateTuple):
                c_w, c_b = create_projection_params(hidden_size, dtype=dtype, scope='c')
                c = tf.nn.xw_plus_b(state.c, c_w, c_b, name='c')
                h_w, h_b = create_projection_params(hidden_size, dtype=dtype, scope='h')
                h = tf.nn.xw_plus_b(state.h, h_w, h_b, name='h')
                if activate_fn:
                    c = activate_fn(c)
                    h = activate_fn(h)
                return LSTMStateTuple(c, h)

            p_w, p_b = create_projection_params(hidden_size, dtype=dtype)
            p = tf.nn.xw_plus_b(state, p_w, p_b)
            if activate_fn:
                p = activate_fn(p)
            return p

        def state_projection(state, _scope=None):
            # variable_scope(None) without a default_name raises TypeError in
            # TF 1.x, so a single state is projected directly in the outer scope.
            if _scope is None:
                return project(state)
            with variable_scope.variable_scope(_scope):
                return project(state)

        # Exact type check on purpose: LSTMStateTuple is a namedtuple in
        # TensorFlow, so isinstance(states, tuple) would also match a single
        # LSTM state and wrongly iterate over its c/h fields.
        if type(states) is tuple:
            return tuple(
                state_projection(state, 'layer_{}'.format(i)) for i, state in enumerate(states))
        else:
            return state_projection(states)

play.py

In [68]:
from mlbootstrap import Bootstrap
from fetch.fetch import Fetcher
from process.process import DataProcessor
from model.seq import Seq2SeqModel
from model.seq_attn import Seq2SeqAttentionModel
from model.auto_encoder import AutoEncoderModel

# Model instances handed to Bootstrap, keyed by short name.
models = {
    'seq': Seq2SeqModel(),
    'seq-attn': Seq2SeqAttentionModel(),
}
# Both auto-encoder entries use their key as the model name.
models.update({name: AutoEncoderModel(name) for name in ('auto', 'auto-attn')})

bootstrap = Bootstrap(
    'config.yaml',
    fetcher=Fetcher(),
    preprocessor=DataProcessor(),
    models=models,
)



In [69]:
# Start training through the Bootstrap instance.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'task' is not defined" - the run did not complete here;
# the origin of `task` is not visible in this section, confirm before re-running.
bootstrap.train()

NameError: name 'task' is not defined