In [6]:
import tensorflow as tf

In [7]:
class CharCNNEmbedding:
    def __init__(self, config):
        self.char_vocab_size = config["char_vocab_size"]
        self.char_embedding_dim = config["char_embedding_dim"]

        self.kernel_sizes = config["kernel_sizes"]
        self.filter_size = config["elmo_hidden"] // len(self.kernel_sizes)

        self.seq_len = config["word_seq_len"]
        self.char_seq_len = config["char_seq_len"]
        
        with tf.variable_scope("char_cnn", reuse=tf.AUTO_REUSE):
            self.conv_filters = [
                tf.layers.Conv1D(self.filter_size, kernel_size)
                for kernel_size in self.kernel_sizes
            ]

        with tf.variable_scope("char_embedding", reuse=tf.AUTO_REUSE):
            self.embedding_weight = tf.get_variable("embedding_weight", 
                                        [self.char_vocab_size, self.char_embedding_dim],
                                        dtype=tf.float32)
            
            
    def forward(self, data):
        embed_input = tf.nn.embedding_lookup(self.embedding_weight, data["input"])

        conv_outputs = []
        conv_input = tf.reshape(embed_input, [-1, self.char_seq_len, self.char_embedding_dim])
        for conv, kernel_size in zip(self.conv_filters, self.kernel_sizes):
            conv_output = conv(conv_input)
            _conv_output = tf.reshape(conv_output, [-1, self.seq_len, conv_output.shape[1], self.filter_size])

            pool_output = tf.nn.max_pool(_conv_output, [1, 1, conv_output.shape[1], 1], [1, 1, 1, 1], 'VALID')
            pool_output = tf.squeeze(pool_output, axis=2)
            conv_outputs.append(pool_output)

        # shape = (batch_size, seq_len, embedding_dim)
        char_word_embedding = tf.concat(conv_outputs, axis=2)
        return char_word_embedding

In [17]:
class ELMO:
    def __init__(self, config):
        self.embedding = CharCNNEmbedding(config)
        self.hidden_size = config["elmo_hidden"]
        self.vocab_size = config["word_vocab_size"]
        self.seq_len = config["word_seq_len"]
        self.config = config
        
        with tf.variable_scope("elmo_rnn_cell"):
            self.forward_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
            self.backward_cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
        
        #是否将输入concat到输出
        if config.get("use_skip_connection"):
            self.forward_cell = tf.nn.rnn_cell.ResidualWrapper(self.forward_cell)
            self.backward_cell = tf.nn.rnn_cell.ResidualWrapper(self.backward_cell)
            
        with tf.variable_scope("elmo_softmax"):
            softmax_weight_shape = [config["word_vocab_size"], config["elmo_hidden"]]

            self.forward_softmax_w = tf.get_variable("forward_softmax_w", softmax_weight_shape, dtype=tf.float32)
            self.backward_softmax_w = tf.get_variable("backward_softmax_w", softmax_weight_shape, dtype=tf.float32)

            self.forward_softmax_b = tf.get_variable("forward_softmax_b", [config["word_vocab_size"]])
            self.backward_softmax_b = tf.get_variable("backward_softmax_b", [config["word_vocab_size"]])
            
    def forward(self, data):
        embedding_output = self.embedding.forward(data)
        with tf.variable_scope("elmo_rnn_forward"):
            forward_outputs, forward_states = tf.nn.dynamic_rnn(self.forward_cell,
                                                                inputs=embedding_output,
                                                                sequence_length=data["input_len"],
                                                                dtype=tf.float32)

        with tf.variable_scope("elmo_rnn_backward"):
            backward_outputs, backward_states = tf.nn.dynamic_rnn(self.backward_cell,
                                                                  inputs=embedding_output,
                                                                  sequence_length=data["input_len"],
                                                                  dtype=tf.float32)

        # # Concatenate the forward and backward LSTM output
        forward_projection = tf.matmul(forward_outputs, tf.expand_dims(tf.transpose(self.forward_softmax_w), 0))
        forward_projection = tf.nn.bias_add(forward_projection, self.forward_softmax_b)

        backward_projection = tf.matmul(backward_outputs, tf.expand_dims(tf.transpose(self.backward_softmax_w), 0))
        backward_projection = tf.nn.bias_add(backward_projection, self.backward_softmax_b)

        return forward_outputs, backward_outputs, forward_projection, backward_projection
    
    
    def train(self, data, global_step_variable=None):
        forward_output, backward_output, _, _ = self.forward(data)

        forward_target = data["target"]
        forward_pred = tf.cast(tf.argmax(tf.nn.softmax(forward_output, -1), -1), tf.int32)
        forward_correct = tf.equal(forward_pred, forward_target)
        forward_padding = tf.sequence_mask(data["target_len"], maxlen=self.seq_len, dtype=tf.float32)

        forward_softmax_target = tf.cast(tf.reshape(forward_target, [-1, 1]), tf.int64)
        forward_softmax_input = tf.reshape(forward_output, [-1, self.hidden_size])
        forward_train_loss = tf.nn.sampled_softmax_loss(
            weights=self.forward_softmax_w, biases=self.forward_softmax_b,
            labels=forward_softmax_target, inputs=forward_softmax_input,
            num_sampled=self.config["softmax_sample_size"],
            num_classes=self.config["word_vocab_size"]
        )

        forward_train_loss = tf.reshape(forward_train_loss, [-1, self.seq_len])
        forward_train_loss = tf.multiply(forward_train_loss, forward_padding)
        forward_train_loss = tf.reduce_mean(forward_train_loss)

        backward_target = tf.reverse_sequence(data["target"], data["target_len"], seq_axis=1, batch_axis=0)
        backward_pred = tf.cast(tf.argmax(tf.nn.softmax(backward_output, -1), -1), tf.int32)
        backward_correct = tf.equal(backward_pred, backward_target)
        backward_padding = tf.sequence_mask(data["target_len"], maxlen=self.seq_len, dtype=tf.float32)

        backward_softmax_target = tf.cast(tf.reshape(backward_target, [-1, 1]), tf.int64)
        backward_softmax_input = tf.reshape(backward_output, [-1, self.hidden_size])
        backward_train_loss = tf.nn.sampled_softmax_loss(
            weights=self.backward_softmax_w, biases=self.backward_softmax_b,
            labels=backward_softmax_target, inputs=backward_softmax_input,
            num_sampled=self.config["softmax_sample_size"],
            num_classes=self.config["word_vocab_size"]
        )

        backward_train_loss = tf.reshape(backward_train_loss, [-1, self.seq_len])
        backward_train_loss = tf.multiply(backward_train_loss, backward_padding)
        backward_train_loss = tf.reduce_mean(backward_train_loss)

        train_loss = forward_train_loss + backward_train_loss
        train_correct = tf.concat([forward_correct, backward_correct], axis=-1)
        train_acc = tf.reduce_mean(tf.cast(train_correct, tf.float32))

        tf.summary.scalar("train_acc", train_acc)
        tf.summary.scalar("train_loss", train_loss)

        train_ops = tf.train.AdamOptimizer().minimize(train_loss)
        return train_loss, train_acc, train_ops

    def pred(self, data):
        elmo_projection_output = self.forward(data)
        eval_output = tf.nn.softmax(elmo_projection_output, dim=-1)
        return eval_output

In [6]:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-b", "--batch_size", type=int, default=1024)
parser.add_argument("-c", "--corpus_files", nargs='+', type=str,
                    default=["data/corpus/elmo.corpus.xlarge.1.txt"])

parser.add_argument("-e", "--epochs", type=int, default=10)
parser.add_argument("--verbose_freq", type=int, default=1)

parser.add_argument("--word_vocab_path", type=str, default="data/vocab/word.90k.vocab")
parser.add_argument("--char_vocab_path", type=str, default="data/vocab/jamo.100.vocab")

parser.add_argument("--word_seq_len", type=int, default=10)
parser.add_argument("--char_seq_len", type=int, default=7)

parser.add_argument("--char_embedding_dim", type=int, default=64)
parser.add_argument("--kernel_sizes", nargs='+', type=int, default=[1, 2, 3, 4])
parser.add_argument("--filter_sizes", nargs='+', type=int, default=None)

parser.add_argument("--elmo_hidden", type=int, default=512)
parser.add_argument("--softmax_sample_size", type=int, default=8196)

parser.add_argument("--prefetch_size", type=int, default=1024)

parser.add_argument("--log_dir", type=str, default="logs/")
parser.add_argument("--save_freq", type=int, default=1000)
parser.add_argument("--model_save_path", type=str, default="output/elmo.model.test")
parser.add_argument("--log_file_prefix", type=str, default="elmo.log")
args = parser.parse_known_args()[0]
config_dict = vars(args)

print(config_dict)

{'batch_size': 1024, 'corpus_files': ['data/corpus/elmo.corpus.xlarge.1.txt'], 'epochs': 10, 'verbose_freq': 1, 'word_vocab_path': 'data/vocab/word.90k.vocab', 'char_vocab_path': 'data/vocab/jamo.100.vocab', 'word_seq_len': 10, 'char_seq_len': 7, 'char_embedding_dim': 64, 'kernel_sizes': [1, 2, 3, 4], 'filter_sizes': None, 'elmo_hidden': 512, 'softmax_sample_size': 8196, 'prefetch_size': 1024, 'log_dir': 'logs/', 'save_freq': 1000, 'model_save_path': 'output/elmo.model.test', 'log_file_prefix': 'elmo.log'}


In [13]:
from han2jamo import Han2Jamo
from vocab_builder import CharWordVocab, WordVocab


class ElmoKoreanDataset:
    def __init__(self, config):
        self.corpus_files = config["corpus_files"]
        self.jamo_processor = Han2Jamo()

        self.char_vocab = CharWordVocab.load_vocab(config["char_vocab_path"])
        self.word_vocab = WordVocab.load_vocab(config["word_vocab_path"])

        self.seq_len = config["word_seq_len"]
        self.char_seq_len = config["char_seq_len"]
        self.corpus_size = self.get_corpus_size()
        print("Dataset Size:", self.corpus_size)

        config["char_vocab_size"] = len(self.char_vocab)
        config["word_vocab_size"] = len(self.word_vocab)

    def text_to_char_sequence(self, text):
        jamo_text = self.jamo_processor.str_to_jamo(text)
        char_idx_seq, seq_len = self.char_vocab.to_seq(jamo_text,
                                                       char_seq_len=self.char_seq_len,
                                                       seq_len=self.seq_len,
                                                       with_len=True)
        seq_len = self.seq_len if seq_len > self.seq_len else seq_len
        return char_idx_seq, seq_len

    def text_to_word_sequence(self, text):
        word_idx_seq, seq_len = self.word_vocab.to_seq(text, seq_len=self.seq_len + 1, with_len=True, with_eos=True)
        seq_len = self.seq_len + 1 if seq_len > self.seq_len + 1 else seq_len
        word_idx_seq, seq_len = word_idx_seq[1:], seq_len - 1
        return word_idx_seq, seq_len

    def produce_data(self, text):
        text = text.strip()
        char_word_input, input_len = self.text_to_char_sequence(text)
        word_target, target_len = self.text_to_word_sequence(text)

        return {"input": char_word_input, "input_len": input_len,
                "target": word_target, "target_len": target_len}

    def data_generator(self):
        for file_path in self.corpus_files:
            with open(file_path, "r", encoding="utf-8") as f:
                for text in f:
                    yield self.produce_data(text)

    def get_corpus_size(self):
        count = 0
        for file_path in self.corpus_files:
            with open(file_path) as file:
                count += sum(1 for _ in file)
        return count

In [18]:
elmo = ELMO(config_dict)

NameError: name 'CharCNNEmbedding' is not defined