# Framework Test

In [1]:
import random
import numpy as np
from tflearn.data_utils import pad_sequences
from collections import Counter
import os
import pickle
import argparse
import tensorflow as tf

ImportError: No module named 'tflearn'

In [2]:
import normalizers
import tokenizers
import vectorizers
import models

In [3]:
class Config:
    def __init__(self):
        self.mode = 'train'
        self.small = False
        self.train_dir = "data/ilbe/train.txt"
        self.val_dir = "data/ilbe/test.txt"
        self.pretrained_embed_dir = ""
        self.model = "TextCNN"
        self.normalizer = "BasicNormalizer"
        self.tokenizer = "JamoTokenizer"
        self.vocab_size = 20000
        self.embed_dim = 128
        self.min_length = 64
        self.max_length = 512
        self.filter_sizes = "3,4,5"
        self.num_filters = 128
        self.dropout_keep_prob = 0.5
        self.l2_reg_lambda = 0.0
        self.batch_size = 64
        self.num_epochs = 200
        self.evaluate_every = 1
        self.checkpoint_every = 1
        self.allow_soft_replacement = True
        self.log_device_placement = False
        self.shuffle = False
        self.checkpoint_dir = ""
        self.num_checkpoints = 30

In [4]:
config = Config()

In [5]:
from models import TextCNN

In [6]:
Model = eval("{}.{}".format(config.model, config.model))

In [10]:
Model = eval("TextCNN")

In [11]:
Model

<module 'models.TextCNN' from '/home/angrypark/korean-text-classification-tf/models/TextCNN.py'>

In [7]:
Normalizer = getattr(normalizers, config.normalizer)
Tokenizer = getattr(tokenizers, config.tokenizer)
normalizer = Normalizer(config)
tokenizer = Tokenizer(config)
vectorizer = vectorizers.Vectorizer(tokenizer, config)

# Make Dataset

비율 유지

# Train

In [25]:
train_labels.shape[1]

3

In [8]:
from data_helper import Preprocessor, load_data, split_data, batch_iter

In [9]:
train_set, val_set = load_data(config.train_dir, config.val_dir, small=config.small)

In [10]:
train_data, train_labels = split_data(train_set)
val_data, val_labels = split_data(val_set)

In [11]:
vectorizer.build_vectorizer(train_data)

Total number of unique tokens :  1662


In [12]:
preprocessor = Preprocessor(config, normalizer, tokenizer, vectorizer)

In [13]:
preprocessor.preprocess(val_data)

array([[ 187,    4,  368, ...,    0,    0,    0],
       [1597,  139, 1128, ...,    0,    0,    0],
       [1590,    4, 1096, ...,    0,    0,    0],
       ...,
       [1364, 1364, 1364, ...,    0,    0,    0],
       [1128,  548,  135, ...,    0,    0,    0],
       [1609,  882, 1514, ...,    0,    0,    0]], dtype=int32)

In [14]:
if config.model == 'TextCNN':
    Model = TextCNN.TextCNN

In [15]:
def train_step(x_batch, y_batch):
        """
        A single training step
        """
        feed_dict = {model.input_x: x_batch,
                     model.input_y: y_batch,
                     model.dropout_keep_prob: config.dropout_keep_prob}
        _, step, summaries, loss, accuracy = sess.run([train_op, global_step, train_summary_op, model.loss,
                                                       model.accuracy],
                                                      feed_dict=feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

def val_step(x_batch, y_batch, writer=None):
    """
    Evaluates model on a validation set
    """
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.dropout_keep_prob: 1.0
    }
    step, summaries, loss, accuracy = sess.run([global_step, val_summary_op, model.loss, model.accuracy],
                                               feed_dict=feed_dict)
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    if writer:
        writer.add_summary(summaries, step)

def load_pretrained_embedding(shape, pretrained_embed_path, trainable=True):
    print("Loading pre-trained word embedding from {}".format(pretrained_embed_path))
    embedding_matrix = np.load(pretrained_embed_path)
    initializer = tf.constant_initializer(embedding_matrix.astype(np.float32))
    with tf.variable_scope("embedding"):
        embedding = tf.get_variable(name='pretrained_embedding',shape=shape, initializer=initializer, trainable=trainable)
    return embedding

In [16]:
import tensorflow as tf

In [17]:
# Set device setting
device_config = tf.ConfigProto()
device_config.allow_soft_placement = config.allow_soft_replacement
device_config.log_device_placement = config.log_device_placement
device_config.gpu_options.allow_growth = True

In [18]:
import time

In [19]:
train_data = preprocessor.preprocess(train_data)
val_data = preprocessor.preprocess(val_data)

In [20]:
train_labels[0]

array([1., 0., 0.])

In [21]:
import datetime

In [23]:
# Training
# ==================================================
with tf.Session(config=device_config) as sess:
    # Create model
    num_classes = len(train_labels[0])
    model = Model(config, num_classes)

    # Create Saver
    saver = tf.train.Saver()
    if os.path.exists(config.checkpoint_dir + "checkpoint"):
        print("Restoring variables from checkpoint : {}".format(config.checkpoint_dir))
        saver.restore(sess, tf.train.latest_checkpoint(config.checkpoint_dir))
    else:
        print("Initializing Variables")
        sess.run(tf.global_variables_initializer())
        if config.pretrained_embed_dir:
            embedding = load_pretrained_embedding(sess, config.pretrained_model_dir)
            tf.assign(model.Embedding, embedding)

    # Define training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(model.loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Keep track of gradient values and sparsity
    grad_summaries = list()
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Loss and accuracy summaries
    loss_summary = tf.summary.scalar("loss", model.loss)
    acc_summary = tf.summary.scalar("accuracy", model.accuracy)

    # Train summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Validation summaries
    val_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    val_summary_dir = os.path.join(out_dir, "summaries", "val")
    val_summary_writed = tf.summary.FileWriter(val_summary_dir, sess.graph)

    # Checkpoint directory
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.num_checkpoints)

    # Generate batches
    batches = batch_iter(list(zip(train_data, train_labels)), config.batch_size, config.num_epochs, config.shuffle)

    sess.run(tf.initialize_all_variables())

    # Training loop for each batch
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % config.evaluate_every == 0:
            print("\nEvaluation : ")
            val_step(val_data, val_labels)
            print("")
        if current_step % config.checkpoint_every == 0:
            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            print("Saved model checkpoint to {}\n".format(path))
    

Initializing Variables
INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/sparsity is illegal; using conv-maxpool-4/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/

KeyboardInterrupt: 

In [49]:
batches = batch_iter(list(zip(train_data, train_labels)), config.batch_size, config.num_epochs, False)

In [57]:
from collections import Counter
Counter([np.argmax(label) for label in train_labels])

Counter({0: 5500, 1: 1071, 2: 1030})

In [59]:
5500/(5500+1071+1030)

0.723589001447178