In [49]:
neg_file = './data/rt-polarity.neg'
pos_file = './data/rt-polarity.pos'

In [50]:
import re

def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [51]:
with open(pos_file, 'r', encoding='utf8') as f:
    
    for i, line in enumerate(f):
        print('##', i,  line)
        print('##', i,  clean_str(line))
        print()
        print()
        if i > 1:
            break

## 0 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

## 0 the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal


## 1 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

## 1 the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words cannot adequately describe co writer director peter jackson 's expanded vision of j r r tolkien 's middle earth


## 2 effective but too-tepid biopic

## 2 effective but too tepid biopic




In [52]:
## 加载数据
import numpy as np

positive_examples = list(open(pos_file, 'r', encoding='utf8').readlines())
negative_examples = list(open(neg_file, 'r', encoding='utf8').readlines())

x_text = positive_examples + negative_examples
x_text = [clean_str(sent) for sent in x_text]

# generate labels
positive_labels = [[1, 0] for _ in positive_examples]
negative_labels = [[0, 1] for _ in negative_examples]
y = np.concatenate([positive_labels, negative_labels], 0)

In [53]:
print(x_text[:2])
print(y[:2])

["the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal", "the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words cannot adequately describe co writer director peter jackson 's expanded vision of j r r tolkien 's middle earth"]
[[1 0]
 [1 0]]


In [54]:
data_size = len(y)
data_size

10662

In [57]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    
    num_batches_of_one_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffle_data = data[shuffle_indices]
        else:
            shuffle_data = data
        # 取数据
        for batch_id in range(num_batches_of_one_epoch):
            start_index = batch_id * batch_size
            end_index = (batch_id + 1) * batch_size
            end_index = min(end_index, data_size)
            yield shuffle_data[start_index:end_index]

In [66]:
data = np.arange(22)
print(data)

for batch in batch_iter(data, batch_size=5, num_epochs=2, shuffle=False):
    print('batch:', batch)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
batch: [0 1 2 3 4]
batch: [5 6 7 8 9]
batch: [10 11 12 13 14]
batch: [15 16 17 18 19]
batch: [20 21]
batch: [0 1 2 3 4]
batch: [5 6 7 8 9]
batch: [10 11 12 13 14]
batch: [15 16 17 18 19]
batch: [20 21]


In [197]:
x = np.arange(22)
y = x * 10
print(x)
print(y)

data = list(zip(x, y))
print(data)

for i, batch in enumerate(batch_iter(data=data, batch_size=5, num_epochs=1, shuffle=False)):
    print('batch %d:' % i, batch)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
[  0  10  20  30  40  50  60  70  80  90 100 110 120 130 140 150 160 170
 180 190 200 210]
[(0, 0), (1, 10), (2, 20), (3, 30), (4, 40), (5, 50), (6, 60), (7, 70), (8, 80), (9, 90), (10, 100), (11, 110), (12, 120), (13, 130), (14, 140), (15, 150), (16, 160), (17, 170), (18, 180), (19, 190), (20, 200), (21, 210)]
batch 0: [[ 0  0]
 [ 1 10]
 [ 2 20]
 [ 3 30]
 [ 4 40]]
batch 1: [[ 5 50]
 [ 6 60]
 [ 7 70]
 [ 8 80]
 [ 9 90]]
batch 2: [[ 10 100]
 [ 11 110]
 [ 12 120]
 [ 13 130]
 [ 14 140]]
batch 3: [[ 15 150]
 [ 16 160]
 [ 17 170]
 [ 18 180]
 [ 19 190]]
batch 4: [[ 20 200]
 [ 21 210]]


## 为模型准备数据

In [115]:
max_document_length = max(len(x.split(' ')) for x in x_text)
max_document_length

56

In [265]:
# 构建词典
from tensorflow.contrib import learn

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
vocab_processor

# 由单词组成的文本，被转换为 由单词索引构成的列表
x = np.array(list(vocab_processor.fit_transform(x_text)))

for i, word_idxs in enumerate(x[:3]):
    print(word_idxs)
    print(x_text[i])

[ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21
 22 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal
[ 1 31 32 33 34  1 35 34  1 36 37  3 38 39 13 17 40 34 41 42 43 44 45 46
 47 48 49  9 50 51 34 52 53 53 54  9 55 56  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words cannot adequately describe co writer director peter jackson 's expanded vision of j r r tolkien 's middle earth
[57 58 59 60 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
effective but too tepid biopic


In [267]:
# 尝试由单词索引列表，再转换为文本
# 获取单词的索引：vocab_processor.vocabulary_.get('rock')
# 由索引获取单词：vocab_processor.vocabulary_.reverse(2)
print(vocab_processor.vocabulary_.reverse(0))

' '.join([vocab_processor.vocabulary_.reverse(word_id) for word_id in x[0]])

<UNK>


"the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>"

In [272]:
print(np.max(x))
len(vocab_processor.vocabulary_)

18757


18758

In [None]:
# 打印词表
print({vocab_processor.vocabulary_.reverse(word_id): word_id for word_id in range(len(vocab_processor.vocabulary_))})


In [275]:
{word: i for i, word in enumerate(['dd', 'ee', 'ff'])}

{'dd': 0, 'ee': 1, 'ff': 2}

## 把数据切分为训练集核测试集

In [157]:
print(x.shape)
print(y.shape)

(10662, 56)
(10662, 2)


In [185]:
# 打乱顺序
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# 划分训练集和测试集
test_percent = 0.1
test_num_samples = int(test_percent * len(y))
x_train, x_test = x_shuffled[:-test_num_samples], x_shuffled[-test_num_samples:]
y_train, y_test = y_shuffled[:-test_num_samples], y_shuffled[-test_num_samples:]

print('train/test split: {}, {}'.format(len(y_train), len(y_test)))

train/test split: 9596, 1066


In [183]:
arr1 = np.arange(10)
print(arr1)

# 从前面取 test
print('test:', arr1[:2])
print('train:', arr1[2:])
# 先取 train，剩余的做 test
print('test:', arr1[-2:])
print('train:', arr1[:-2])

[0 1 2 3 4 5 6 7 8 9]
test: [0 1]
train: [2 3 4 5 6 7 8 9]
test: [8 9]
train: [0 1 2 3 4 5 6 7]


In [237]:
print(x_test[:2])
print(y_test[:2])

[[  292    84   523  1889    99   100   274    67    13 15402   121  4596
    600   722  1456  2279   944   207  8493   503   125 10507     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [    1    89 11697   826  1012     1  3666    34     1   511   657   146
    483    84   249  1798    17   899  2508  4084    58    78   539    12
    736  3530    34    17  4635     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]]
[[0 1]
 [1 0]]


In [238]:
# 合并到一起，一个元素代表一个样本
test_dataset = list(zip(x_test, y_test))
test_batch_iter = batch_iter(data=test_dataset, batch_size=2, num_epochs=1, shuffle=False)
for step, batch in enumerate(test_batch_iter):
    print('step', step)
    print('  x:', batch[:,0])
    print('  y:', batch[:,1])
    if step == 2:
        break

step 0
  x: [array([  292,    84,   523,  1889,    99,   100,   274,    67,    13,
       15402,   121,  4596,   600,   722,  1456,  2279,   944,   207,
        8493,   503,   125, 10507,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int64)
 array([    1,    89, 11697,   826,  1012,     1,  3666,    34,     1,
         511,   657,   146,   483,    84,   249,  1798,    17,   899,
        2508,  4084,    58,    78,   539,    12,   736,  3530,    34,
          17,  4635,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int64)]
  y: [array([0, 1]) array([1, 0])]
step 1
  x: [array([   47,  3542, 17584,

In [257]:
for step, batch in enumerate(test_batch_iter):
    print('step', step)
    print('  x:', type(batch[:,0]))
    print('  y:', batch[:,1])
    print('  y:', np.array(batch[:,1][0]))
    if step == 2:
        break

step 0
  x: <class 'numpy.ndarray'>
  y: [array([1, 0]) array([1, 0]) array([1, 0]) array([0, 1]) array([0, 1])]
  y: [1 0]
step 1
  x: <class 'numpy.ndarray'>
  y: [array([1, 0]) array([1, 0]) array([0, 1]) array([0, 1]) array([0, 1])]
  y: [1 0]
step 2
  x: <class 'numpy.ndarray'>
  y: [array([0, 1]) array([0, 1]) array([0, 1]) array([0, 1]) array([0, 1])]
  y: [0 1]


In [252]:
# 合并到一起，一个元素代表一个样本
test_dataset = list(zip(x_test, y_test))
test_batch_iter = batch_iter(data=test_dataset, batch_size=5, num_epochs=1, shuffle=False)
for step, batch in enumerate(test_batch_iter):
    batch_xs, batch_ys = zip(*batch)
    print('step', step)
    print('  x:', type(batch_xs))
    print('  y:', np.array(batch_ys))
    if step == 2:
        break

step 0
  x: <class 'tuple'>
  y: [[0 1]
 [1 0]
 [0 1]
 [0 1]
 [1 0]]
step 1
  x: <class 'tuple'>
  y: [[0 1]
 [0 1]
 [1 0]
 [1 0]
 [1 0]]
step 2
  x: <class 'tuple'>
  y: [[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]


## 搭建模型

In [None]:
import tensorflow as tf

In [111]:


class TextCNN(object):
    
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters):
        
        self.sequence_length = sequence_length
        self.num_classes = num_classes
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        
        self.weights = []   # trainable_variables 
        
    def __call__(self, inputs, keep_drop=1.0):
        """ inputs 输入数据的形状为:
        tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        [
            [词索引, 词索引, ....],  # 代表一个句子
            [词索引, 词索引, ....],  # 代表一个句子
            [],
            ..
        ]
        """
        with tf.name_scope('embedding'):
            embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], minval=-1.0, maxval=1.0), 
                                         name='embedding_matrix')
            # 词索引 -> 词向量, 假设 sequence_length=100, embedding_size=256
            # (None, 100) -> (None, 100, 256)
            inputs_embedded = tf.nn.embedding_lookup(embedding, inputs)
            # 转化为当通道图像的二维卷积   
            # (None, 100, 256) -> (None, 100, 256, 1)
            inputs_embedded_expanded = tf.expand_dims(inputs_embedded, -1)
            print('##', inputs)
            print('##', inputs_embedded)
            print('##', inputs_embedded_expanded)
            # 记录词向量矩阵
            self.weights.append(embedding)
            
            
        flattened_pooled_outputs = []    
        for i, filter_size in enumerate(self.filter_sizes):
            conv_op_name = 'conv_%s' % (i + 1)
            with tf.name_scope(conv_op_name):
                filter_shape = [filter_size, self.embedding_size, 1, self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='kernel')
                b = tf.Variable(tf.constant(0.1, shape=(self.num_filters,)), name='bias')
                # 卷积操作，假设 filter_size=3, num_filters=64
                # (None, 100, 256, 1) -> (None, 98, 1, 64)
                conv_out = tf.nn.conv2d(inputs_embedded_expanded, W, strides=[1,1,1,1], padding='VALID', name=conv_op_name)
                print('##', conv_out)
                # 加偏置并激活
                conv_out = tf.nn.relu(tf.nn.bias_add(conv_out, b))
                # 记录卷积核及偏置
                self.weights.append(W)
                self.weights.append(b)
            
            maxpool_op_name = 'maxpool_%s' % (i + 1)
            with tf.name_scope(maxpool_op_name):
                pool_height_size = self.sequence_length - filter_size + 1
                pool_width = 1
                # 池化操作
                # (None, 98, 1, 64) -> (None, 1, 1, 64)
                pooled_out = tf.nn.max_pool(conv_out, ksize=[1, pool_height_size, pool_width, 1], strides=[1,1,1,1], 
                                          padding='VALID', name=maxpool_op_name)
                print('##', pooled_out)
                # (None, 1, 1, 64) -> (None, 64)
                flattened_pooled_out = tf.reshape(pooled_out, shape=(-1, self.num_filters))
                print("** Flaten:", flattened_pooled_out)
                flattened_pooled_outputs.append(flattened_pooled_out)
                
                
        
        # 把所有卷积池化后的结构拼接为一个长的向量作为全连接层的输入
        fc1_in = tf.concat(flattened_pooled_outputs, axis=1)
        print("## fc1 inputs:", fc1_in)
        
        with tf.name_scope('fc1'):
            input_units = fc1_in.shape[1].value
            print("fc1 input units:", input_units)
            W = tf.Variable(tf.truncated_normal((input_units, self.num_classes), stddev=0.1), name='kernel')
            b = tf.Variable(tf.constant(0.1, shape=(self.num_classes,)), name='bias')
            fc1_out = tf.nn.bias_add(tf.matmul(fc1_in, W), b) 
            print("## fc1 outputs:", fc1_out)
            self.weights.append(W)
            self.weights.append(b)
        # 记录模型的输出 Tensor, 即各类别的打分数据，在调用处，用于优化损失
        self.output = fc1_out
        return self.output         


text_cnn = TextCNN(sequence_length=100, num_classes=3, 
                   vocab_size=10000, embedding_size=256, 
                   filter_sizes=[3,4,5], num_filters=64)

x = tf.placeholder(tf.int32, [2, 100], name='input_x')
output = text_cnn(x)

print("trainable_variables:")
for weight in text_cnn.weights:
    print(weight)
    

## Tensor("input_x_32:0", shape=(2, 100), dtype=int32)
## Tensor("embedding_32/embedding_lookup/Identity:0", shape=(2, 100, 256), dtype=float32)
## Tensor("embedding_32/ExpandDims:0", shape=(2, 100, 256, 1), dtype=float32)
## Tensor("conv_1_28/conv_1:0", shape=(2, 98, 1, 64), dtype=float32)
## Tensor("maxpool_1_28/maxpool_1:0", shape=(2, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_1_28/Reshape:0", shape=(2, 64), dtype=float32)
## Tensor("conv_2_28/conv_2:0", shape=(2, 97, 1, 64), dtype=float32)
## Tensor("maxpool_2_28/maxpool_2:0", shape=(2, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_2_28/Reshape:0", shape=(2, 64), dtype=float32)
## Tensor("conv_3_28/conv_3:0", shape=(2, 96, 1, 64), dtype=float32)
## Tensor("maxpool_3_28/maxpool_3:0", shape=(2, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_3_28/Reshape:0", shape=(2, 64), dtype=float32)
## fc1 inputs: Tensor("concat_15:0", shape=(2, 192), dtype=float32)
fc1 input units: 192
## fc1 outputs: Tensor("fc1_14/BiasAd

## 训练模型

In [231]:
sequence_length = max_document_length
num_classes = 2
vocab_size = len(vocab_processor.vocabulary_)
embedding_size = 256
filter_sizes = [3, 4, 5]
num_filters = 64


text_cnn = TextCNN(sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters)

x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
y_ = tf.placeholder(tf.float32, [None, num_classes], name='input_y')


output = text_cnn(x)

## Tensor("input_x_36:0", shape=(?, 56), dtype=int32)
## Tensor("embedding_36/embedding_lookup/Identity:0", shape=(?, 56, 256), dtype=float32)
## Tensor("embedding_36/ExpandDims:0", shape=(?, 56, 256, 1), dtype=float32)
## Tensor("conv_1_32/conv_1:0", shape=(?, 54, 1, 64), dtype=float32)
## Tensor("maxpool_1_32/maxpool_1:0", shape=(?, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_1_32/Reshape:0", shape=(?, 64), dtype=float32)
## Tensor("conv_2_32/conv_2:0", shape=(?, 53, 1, 64), dtype=float32)
## Tensor("maxpool_2_32/maxpool_2:0", shape=(?, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_2_32/Reshape:0", shape=(?, 64), dtype=float32)
## Tensor("conv_3_32/conv_3:0", shape=(?, 52, 1, 64), dtype=float32)
## Tensor("maxpool_3_32/maxpool_3:0", shape=(?, 1, 1, 64), dtype=float32)
** Flaten: Tensor("maxpool_3_32/Reshape:0", shape=(?, 64), dtype=float32)
## fc1 inputs: Tensor("concat_19:0", shape=(?, 192), dtype=float32)
fc1 input units: 192
## fc1 outputs: Tensor("fc1_18/BiasAdd:0

In [236]:
def train(output, x, y_):
    # 交叉熵损失
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=output))
    # 选择梯度优化算法
    train_step = tf.train.AdamOptimizer().minimize(loss)

    correction_predictions = tf.equal(tf.argmax(output, axis=1), tf.argmax(y_, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correction_predictions, dtype=tf.float32))
    

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        batch_size = 128
        num_steps = 1000
        eval_per_steps = 30
        
        train_dataset = list(zip(x_train, y_train))
        test_dataset = list(zip(x_test, y_test))
          
        train_batch_iter = batch_iter(data=train_dataset, batch_size=50, num_epochs=1, shuffle=False)
        test_batch_iter = batch_iter(data=test_dataset, batch_size=50, num_epochs=1, shuffle=False)
        
        for step, batch in enumerate(train_batch_iter):
            batch_xs, batch_ys = zip(*batch)
                
            _, _train_loss = sess.run([train_step, loss],
                                      feed_dict={ x: batch_xs, y_: batch_ys })

            # 每隔100步计算一下模型的准确度
            if step % eval_per_steps == 0:
                _train_acc, _train_loss = sess.run([accuracy, loss],
                                                   feed_dict={ x: batch_xs, y_: batch_ys })
                _test_loss_total = 0.0
                _test_acc_total = 0.0
                
                for i, batch in enumerate(test_batch_iter):
                    batch_xs, batch_ys = zip(*batch)
                    # 获取一批测试数据，计算准确度
                    _test_acc, _test_loss = sess.run([accuracy, loss],
                                                     feed_dict={ x: batch_xs, y_: batch_ys })
                    _test_loss_total += _test_loss
                    _test_acc_total += _test_acc
                    
                print('step {} - aac: {:.4f}, loss: {:.4f}, test acc: {:.4f}, test loss: {:.4f}'.format(
                        step, _train_acc, _train_loss, _test_acc_total / (i+1), _test_loss_total / (i+1)))


train(output, x, y_)

step 0 - aac: 0.0000, loss: 1.7442, test acc: 0.4831, test loss: 3.5999


<tf.Tensor 'Equal:0' shape=(?,) dtype=bool>