## 1. 不同 batch_size 模型预测时间对比

### 环境说明：

In [1]:
%load_ext watermark
%watermark -a 'Scott Ming' -v -m -d -p numpy,pandas,matplotlib,tensorflow

Scott Ming 2017-04-23 

CPython 3.6.0
IPython 6.0.0

numpy 1.12.1
pandas 0.19.2
matplotlib 2.0.0
tensorflow 1.0.1

compiler   : GCC 4.9.2
system     : Linux
release    : 3.16.0-4-amd64
machine    : x86_64
processor  : 
CPU cores  : 4
interpreter: 64bit


In [2]:
%matplotlib inline

import os
import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

from text_helpers import build_dataset

### 1. 创建数据集

In [3]:
# Parse both splits as integer matrices (np.loadtxt splits on whitespace by default).
train = np.loadtxt(fname='data/train_data.txt', dtype=int)
test = np.loadtxt(fname='data/test_data.txt', dtype=int)

In [4]:
# Every column except the last is model input; the last column is the label,
# taken as a 1-D vector.
x_train, y_train = train[:, :-1], train[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

### 2. 构建模型

In [5]:
# One session shared by every later cell (training and timing).
# NOTE(review): it is never closed in the cells shown in this notebook.
sess = tf.Session()

In [6]:
# --- model dimensions (passed to TextCNN) ---
vocab_size = 80000                   # rows of the embedding matrix
word_embed_size = 128                # columns of the embedding matrix
sequence_length = x_train.shape[1]   # number of input columns per example
num_classes = 2                      # width of the output layer

# --- training ---
batch_size = 50                      # rows sampled per optimisation step

# NOTE(review): the two names below are not passed to TextCNN or read by any
# other cell shown in this notebook.
data_size = train.shape[0]
num_filters = 3

In [7]:
class TextCNN(object):
    """A CNN for text classification (TensorFlow 1.x graph mode).

    The constructor builds the whole graph:
    token ids -> embedding lookup -> one convolution spanning the full
    embedding width -> tanh -> max-pool over every window position ->
    linear layer -> softmax.

    Attributes:
        input_x: int32 placeholder, shape [None, sequence_length].
        input_y: int32 placeholder, shape [None], holding class indices.
        W: embedding variable, shape [vocab_size, word_embed_size].
        embeds: result of looking up ``input_x`` in ``W``.
        embeds_expanded: ``embeds`` with a trailing axis of size 1 added.
        pool_flat: pooled features reshaped to [-1, nodes].
        logits: unnormalised class scores, ``pool_flat @ softmax_w + softmax_b``.
        y: softmax of ``logits``.
        loss: mean sparse softmax cross-entropy between ``logits`` and ``input_y``.
        pred: argmax of ``y`` along axis 1.
        accuracy: mean of the element-wise equality ``pred == input_y``.

    Note:
        Variables are created with ``tf.get_variable`` under ``tf.name_scope``
        blocks only. Per the TF1 documentation ``tf.get_variable`` ignores
        name scopes, so the variable names are the bare strings used below;
        building a second instance in the same graph therefore needs an
        enclosing ``tf.variable_scope``.
    """
    def __init__(
        self, sequence_length, vocab_size, word_embed_size,
            num_classes, filter_num=64, window_size=3):
        """Build the graph.

        Args:
            sequence_length: number of token ids per example (second
                dimension of ``input_x``).
            vocab_size: number of rows in the embedding matrix.
            word_embed_size: number of columns in the embedding matrix.
            num_classes: number of output classes.
            filter_num: number of convolution filters (default 64, the value
                that used to be hard-coded).
            window_size: number of consecutive tokens covered by each filter
                (default 3, the value that used to be hard-coded).
        """

        # Placeholders for input, output
        self.input_x = tf.placeholder(
            tf.int32, shape=[None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(
            tf.int32, shape=[None, ], name='input_y')

        # Embedding layer
        with tf.name_scope('embedding'):
            self.W = tf.get_variable('word_embedding', [vocab_size, word_embed_size],
                                     tf.float32, tf.random_normal_initializer())
            self.embeds = tf.nn.embedding_lookup(self.W, self.input_x)
            # conv2d needs a 4-D input, so append a channel axis of size 1.
            self.embeds_expanded = tf.expand_dims(self.embeds, -1)

        # Convolution + maxpool layer
        with tf.name_scope('conv-maxpool'):
            # Each filter covers `window_size` tokens and the full embedding width.
            filter_shape = [window_size, word_embed_size, 1, filter_num]
            conv_w = tf.get_variable("W", filter_shape,
                                     initializer=tf.truncated_normal_initializer(stddev=0.1))
            conv_b = tf.get_variable("b", [filter_num],
                                     initializer=tf.constant_initializer(0.0))
            conv = tf.nn.conv2d(
                self.embeds_expanded,
                conv_w,
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='conv')
            conv_hidden = tf.nn.tanh(tf.add(conv, conv_b), name='tanh')
            # With VALID padding the convolution yields
            # sequence_length - window_size + 1 positions; pool over all of them.
            pool = tf.nn.max_pool(
                conv_hidden,
                ksize=[1, sequence_length - window_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='pool')
            pool_shape = pool.get_shape().as_list()
            # pool_shape[0] is the number of examples in a batch (i.e. the
            # number of reviews); the remaining dimensions are flattened.
            nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
            # Reshape the pooling output into one feature vector per example;
            # -1 lets TensorFlow infer the batch dimension.
            self.pool_flat = tf.reshape(pool, [-1, nodes])

        # Final scores and predictions
        with tf.name_scope('output'):
            softmax_w = tf.get_variable('softmax_w', [nodes, num_classes],
                                        tf.float32, tf.random_normal_initializer())
            softmax_b = tf.get_variable('softmax_b', [num_classes], tf.float32,
                                        tf.constant_initializer(0.0))
            self.logits = tf.matmul(self.pool_flat, softmax_w) + softmax_b
            self.y = tf.nn.softmax(self.logits)

        # Mean cross-entropy loss
        with tf.name_scope('loss'):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Accuracy
        with tf.name_scope('accuracy'):
            # Predicted class index for each example.
            self.pred = tf.argmax(self.y, 1)
            # Element-wise equality between predictions and labels
            # (argmax returns int64, so cast to match input_y).
            correct_prediction = tf.equal(tf.cast(self.pred, tf.int32), self.input_y)
            # Cast the booleans to floats, then average them.
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [8]:
# Build the graph once from the dimensions configured above.
cnn = TextCNN(
    sequence_length=sequence_length,
    vocab_size=vocab_size,
    word_embed_size=word_embed_size,
    num_classes=num_classes,
)

In [9]:
# Feed dicts that cover the complete train / test splits.
train_feed_dict = {
    cnn.input_x: x_train,
    cnn.input_y: y_train,
}
test_feed_dict = {
    cnn.input_x: x_test,
    cnn.input_y: y_test,
}

In [10]:
# Minimise the mean cross-entropy with Adam (learning rate 0.001).
train_step = tf.train.AdamOptimizer(0.001).minimize(cnn.loss)
# Initialise after minimize() so variables added by the optimizer are covered too.
sess.run(tf.global_variables_initializer())
STEP = 500
for i in range(STEP):
    # Sample `batch_size` training rows uniformly at random (with replacement).
    batch_data = train[np.random.randint(train.shape[0], size=batch_size), :]
    X = batch_data[:, :-1]
    Y = batch_data[:, -1:].reshape((-1,))
    feed_dict = {cnn.input_x: X, cnn.input_y: Y}
    sess.run(train_step, feed_dict=feed_dict)
    if i % 50 == 0:
        # Loss is reported on the batch just trained on; accuracies use the
        # complete train / test splits.
        total_cross_entropy = sess.run(cnn.loss, feed_dict=feed_dict)
        train_accuracy = sess.run(cnn.accuracy, feed_dict=train_feed_dict)
        test_accuracy = sess.run(cnn.accuracy, feed_dict=test_feed_dict)
        # Stored as `test_pred_labels`, not `test_prediction`: that name belongs
        # to the timed test_prediction() function defined in a later cell, and
        # sharing it meant re-running this cell replaced the function with an
        # array (and vice versa).
        test_pred_labels = sess.run(cnn.pred, feed_dict=test_feed_dict)
        print("After %d training step(s), cross entropy on batch data is "
              "%f, trian accuracy is %.2f, test accuracy is %.2f" % (
                  i, total_cross_entropy, train_accuracy, test_accuracy))

After 0 training step(s), cross entropy on batch data is 10.103700, trian accuracy is 0.47, test accuracy is 0.47
After 50 training step(s), cross entropy on batch data is 5.291380, trian accuracy is 0.49, test accuracy is 0.49
After 100 training step(s), cross entropy on batch data is 2.060752, trian accuracy is 0.52, test accuracy is 0.51
After 150 training step(s), cross entropy on batch data is 0.602189, trian accuracy is 0.58, test accuracy is 0.57
After 200 training step(s), cross entropy on batch data is 0.867739, trian accuracy is 0.62, test accuracy is 0.60
After 250 training step(s), cross entropy on batch data is 0.643813, trian accuracy is 0.66, test accuracy is 0.63
After 300 training step(s), cross entropy on batch data is 0.710485, trian accuracy is 0.68, test accuracy is 0.64
After 350 training step(s), cross entropy on batch data is 0.694074, trian accuracy is 0.69, test accuracy is 0.64
After 400 training step(s), cross entropy on batch data is 0.673931, trian accurac

### 3. 复用模型并查看预测时间

为了增强测试时间函数的可复用性，写成装饰器的形式

In [11]:
import time
from functools import wraps

def timethis(func):
    '''
    Decorator that reports the execution time.

    Every call to the decorated function prints the elapsed time in seconds
    (five decimal places) and then returns the decorated function's own
    result. Nothing is printed when the decorated function raises.
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and uses the highest-resolution clock
        # available, which matters for the sub-millisecond calls timed here;
        # time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print('{:.5f}'.format(elapsed))
        return result
    return wrapper

In [12]:
@timethis
def test_prediction(feed_dict):
    """Run one prediction pass and return the predicted class indices.

    The elapsed time is printed by the ``timethis`` decorator.

    Args:
        feed_dict: feed dictionary handed unchanged to ``sess.run``.

    Returns:
        The value of ``cnn.pred`` for the fed inputs (earlier versions
        discarded it and returned ``None``).
    """
    return sess.run(cnn.pred, feed_dict=feed_dict)

In [13]:
def get_batch_data(data, batch_size):
    """Sample ``batch_size`` rows of ``data`` and wrap them in a feed dict.

    Row indices are drawn with ``np.random.randint``, i.e. uniformly and with
    replacement. In every sampled row the last column is fed to
    ``cnn.input_y`` and the remaining columns to ``cnn.input_x``.
    """
    row_ids = np.random.randint(data.shape[0], size=batch_size)
    sampled = data[row_ids, :]
    inputs = sampled[:, :-1]
    labels = sampled[:, -1:].reshape(-1)
    return {cnn.input_x: inputs, cnn.input_y: labels}

In [14]:
# One single-row batch and one 128-row batch, both sampled from the test split.
feed_dict_batch1 = get_batch_data(data=test, batch_size=1)
feed_dict_batch128 = get_batch_data(data=test, batch_size=128)

In [15]:
# No variable scope is needed here: the cell only calls sess.run on the
# existing graph and creates no variables.
timing_cases = [(1, feed_dict_batch1), (128, feed_dict_batch128)]
for case_idx, (n_rows, batch_feed) in enumerate(timing_cases):
    if case_idx:
        print('---' * 5)
    # Untimed warm-up run, so that any one-off setup for this input is less
    # likely to be charged to the timed call below.
    sess.run(cnn.pred, feed_dict=batch_feed)
    print('When batch size is {:3d}, it takes:'.format(n_rows))
    test_prediction(batch_feed)

When batch size is   1, it takes:
0.00081
---------------
When batch size is 128, it takes:
0.00518


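从上面的时间测算可以看出，batch size 为 128 时的耗时（约 0.00518 秒）差不多是 batch size 为 1 时（约 0.00081 秒）的 6 倍，绝对差距感觉并不大；而前者一次预测的样本数是后者的 128 倍，所以平均到每条样本上的预测时间反而低得多。需要注意的是，每种情况只计时了一次，结果仅供粗略参考。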