In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import operator
import numpy as np
import pandas as pd
import multiprocessing
import tensorflow as tf

In [3]:
from collections import Counter

## 1. 导入数据并做数据清理

直接导入之前清理好的数据

In [4]:
df = pd.read_pickle('data/cleared_data.pkl')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35124 entries, 0 to 35123
Data columns (total 6 columns):
review           35124 non-null object
sentiment        35124 non-null object
cut_words        35124 non-null object
cleared_words    35124 non-null object
counter          35124 non-null object
words_count      35124 non-null int64
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


In [6]:
def other_multiprocessing(df, func, workers):

    chunk_size = int(df.shape[0] / workers)
    chunks = (df.ix[df.index[i:i + chunk_size]] for i in range(0, df.shape[0], chunk_size))

    pool = multiprocessing.Pool(processes=4)
    result = pool.map(func, chunks)
    return result


def sum_func(d):
    return d.counter.sum()

In [7]:
%%time
result = other_multiprocessing(df, sum_func, workers=4)
counter_sum = np.sum(np.asarray(result))

CPU times: user 2.08 s, sys: 468 ms, total: 2.55 s
Wall time: 2min 13s


高频截断，出现次数 10 次以下的词作为 unknown 词，同时用此计算 vocabulary_size

In [8]:
d = {i: j for i, j in counter_sum.items() if j > 10}

In [9]:
sorted_dict = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [10]:
vocab_size = len(sorted_dict) + 1  # 加一个未登录词

In [11]:
sentences = df.cleared_words.tolist()

In [12]:
def build_dict(word_counts):
    # Build word dictionary
    count = [['UNK', -1]]
    count.extend(word_counts)
    word_dict = {}
    for word, _ in count:
        word_dict[word] = len(word_dict)

    # Build reversed dictionary
    reversed_dict = {j: i for i, j in word_dict.items()}

    return word_dict, reversed_dict

In [13]:
word_dict, reversed_dict = build_dict(sorted_dict)

In [14]:
def word_to_number(sentences, word_dict):
    # Word to number
    data = []
    for sentence in sentences:
        sentence_data = []
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = 0
            sentence_data.append(index)
        data.append(sentence_data)
    return data

In [15]:
data = word_to_number(sentences, word_dict)

In [16]:
df['word_to_number'] = np.asarray(data)

## 2. 设定数据集

In [17]:
df = df[df.words_count > 2]  # 至少能形成一个 windows_size
train = df[(df.sentiment == 1) | (df.sentiment == 0)]
test  = df[~df.index.isin(train.index)].copy()

In [18]:
test['sentiment'] = test.sentiment.map(lambda x: int(x[0]))

In [19]:
text_data_train = train.word_to_number.values
text_data_test = test.word_to_number.values

In [20]:
df.words_count.describe()

count    34051.000000
mean        24.038648
std         24.401450
min          3.000000
25%          9.000000
50%         16.000000
75%         31.000000
max        894.000000
Name: words_count, dtype: float64

从上方的词长分布设定句子长度

* 利用 list 相加会合并的特性
* array + list 也有上述特性

In [21]:
max_words = 20

In [22]:
text_data_train = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_train]])
text_data_test = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_test]])

target 转为 2 维表示 `(0, 1)`, 方便测试 softmax_cross_entropy_with_logits：
```
0 -> (1, 0); 
1 -> (0, 1)```

In [23]:
target_train = train['sentiment'].map(int).values
target_test = test['sentiment'].map(int).values

target_data_train = np.zeros(shape=(target_train.shape[0], 2))
target_data_test = np.zeros(shape=(target_test.shape[0], 2))

# row 即第一维
target_train_row_idx = np.arange(target_train.shape[0])
target_test_row_idx = np.arange(target_test.shape[0])

# target_train 的每个值其实也是 target 的方位
target_data_train[target_train_row_idx, target_train] = 1
target_data_test[target_test_row_idx, target_test] = 1

## 3. 构建模型

In [24]:
word_embedding_dim = 128
data_size = text_data_train.shape[0]
batch_size = 128

In [25]:
input_data = tf.placeholder(tf.int32, shape=[None, 20], name='input_data')
labels = tf.placeholder(tf.int32, shape=[None, 2], name='labels')

In [26]:
word_embedding = tf.Variable(
    tf.random_uniform([vocab_size, word_embedding_dim])
)

In [27]:
input_embeds = tf.nn.embedding_lookup(word_embedding, input_data)
context_embeds = tf.reduce_mean(input_embeds, axis=1)

In [28]:
# 计算前向传播结果
raw_output = tf.layers.dense(context_embeds, 2)
output = tf.nn.softmax(raw_output)

In [29]:
# cost = tf.nn.softmax_cross_entropy_with_logits(logits=raw_output, labels=labels)
# 一个正确答案的时候，其实用 tf.nn.sparse_softmax_cross_entropy_with_logits 就好了，但为了测试上面的
#     tf.nn.softmax_cross_entropy_with_logits 函数，labels 已被转为 2 维，所以需要重新获取下编号
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=raw_output, labels=tf.arg_max(labels, 1))  

In [30]:
# 判断两个张亮的每一维度是否相等
correct_prediction = tf.equal(tf.argmax(output, 1), tf.argmax(labels, 1))
# 先将布尔型的数值转为实数型，然后计算平均值
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# 设定两者的 feed_dict 方便计算 accuracy
train_feed_dict = {
    input_data: text_data_train, 
    labels: target_data_train,
}
test_feed_dict = {
    input_data: text_data_test, 
    labels: target_data_test,
}

In [31]:
train_step = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEP = 10000
    for i in range(STEP):
        start = (i * batch_size) % (data_size - (data_size % batch_size))
        end = (i * batch_size) % (data_size - (data_size % batch_size)) + batch_size
        feed_dict = {
            input_data: text_data_train[start:end], 
            labels: target_data_train[start:end],
        }
        sess.run(train_step, feed_dict=feed_dict)
        if i % 500 == 0:
            total_cross_entropy = cost.eval(feed_dict=feed_dict)[0]
            train_accuracy = accuracy.eval(feed_dict=train_feed_dict)
            test_accuracy = accuracy.eval(feed_dict=test_feed_dict)
            print("After %d training step(s), cross entropy on batch data is "
                  "%f, trian accuracy is %.2f, test accuracy is %.2f" % (
                      i, total_cross_entropy, train_accuracy, test_accuracy))

After 0 training step(s), cross entropy on batch data is 0.287002, trian accuracy is 0.53, test accuracy is 0.53
After 500 training step(s), cross entropy on batch data is 0.048961, trian accuracy is 0.47, test accuracy is 0.47
After 1000 training step(s), cross entropy on batch data is 0.013362, trian accuracy is 0.53, test accuracy is 0.53
After 1500 training step(s), cross entropy on batch data is 0.185217, trian accuracy is 0.53, test accuracy is 0.53
After 2000 training step(s), cross entropy on batch data is 0.031848, trian accuracy is 0.47, test accuracy is 0.47
After 2500 training step(s), cross entropy on batch data is 0.015394, trian accuracy is 0.53, test accuracy is 0.53
After 3000 training step(s), cross entropy on batch data is 0.088409, trian accuracy is 0.53, test accuracy is 0.53
After 3500 training step(s), cross entropy on batch data is 0.023557, trian accuracy is 0.47, test accuracy is 0.47
After 4000 training step(s), cross entropy on batch data is 0.019733, trian 

#### 试试 mini-batch

In [32]:
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEP = 10000
    # 先把 input_data 和 labels 按列拼接，方便打乱两者索引
    data = np.concatenate((text_data_train, target_data_train), axis=1) 
    for i in range(STEP):
        batch_data = data[np.random.randint(data.shape[0], size=50), :]
        X = batch_data[:, :-2]
        Y = batch_data[:, -2:]
        feed_dict={input_data: X, labels: Y}
        sess.run(train_step, feed_dict=feed_dict)
        if i % 500 == 0:
            total_cross_entropy = cost.eval(feed_dict=feed_dict)[0]
            train_accuracy = accuracy.eval(feed_dict=train_feed_dict)
            test_accuracy = accuracy.eval(feed_dict=test_feed_dict)
            print("After %d training step(s), cross entropy on batch data is "
                  "%f, trian accuracy is %.2f, test accuracy is %.2f" % (
                      i, total_cross_entropy, train_accuracy, test_accuracy))

After 0 training step(s), cross entropy on batch data is 0.515902, trian accuracy is 0.53, test accuracy is 0.53
After 500 training step(s), cross entropy on batch data is 0.774015, trian accuracy is 0.57, test accuracy is 0.57
After 1000 training step(s), cross entropy on batch data is 0.617369, trian accuracy is 0.54, test accuracy is 0.54
After 1500 training step(s), cross entropy on batch data is 0.692724, trian accuracy is 0.56, test accuracy is 0.56
After 2000 training step(s), cross entropy on batch data is 0.608744, trian accuracy is 0.63, test accuracy is 0.63
After 2500 training step(s), cross entropy on batch data is 0.743858, trian accuracy is 0.63, test accuracy is 0.63
After 3000 training step(s), cross entropy on batch data is 0.613086, trian accuracy is 0.67, test accuracy is 0.66
After 3500 training step(s), cross entropy on batch data is 0.247247, trian accuracy is 0.68, test accuracy is 0.68
After 4000 training step(s), cross entropy on batch data is 0.301995, trian 

In [33]:
train_step = tf.train.AdamOptimizer(0.001).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEP = 10000
    # 先把 input_data 和 labels 按列拼接，方便打乱两者索引
    data = np.concatenate((text_data_train, target_data_train), axis=1) 
    for i in range(STEP):
        batch_data = data[np.random.randint(data.shape[0], size=50), :]
        X = batch_data[:, :-2]
        Y = batch_data[:, -2:]
        feed_dict={input_data: X, labels: Y}
        sess.run(train_step, feed_dict=feed_dict)
        if i % 500 == 0:
            total_cross_entropy = cost.eval(feed_dict=feed_dict)[0]
            train_accuracy = accuracy.eval(feed_dict=train_feed_dict)
            test_accuracy = accuracy.eval(feed_dict=test_feed_dict)
            print("After %d training step(s), cross entropy on batch data is "
                  "%f, trian accuracy is %.2f, test accuracy is %.2f" % (
                      i, total_cross_entropy, train_accuracy, test_accuracy))

After 0 training step(s), cross entropy on batch data is 0.809544, trian accuracy is 0.55, test accuracy is 0.55
After 500 training step(s), cross entropy on batch data is 0.534802, trian accuracy is 0.82, test accuracy is 0.80
After 1000 training step(s), cross entropy on batch data is 0.568108, trian accuracy is 0.85, test accuracy is 0.82
After 1500 training step(s), cross entropy on batch data is 0.418481, trian accuracy is 0.87, test accuracy is 0.84
After 2000 training step(s), cross entropy on batch data is 0.045808, trian accuracy is 0.89, test accuracy is 0.85
After 2500 training step(s), cross entropy on batch data is 0.164741, trian accuracy is 0.90, test accuracy is 0.85
After 3000 training step(s), cross entropy on batch data is 0.384198, trian accuracy is 0.91, test accuracy is 0.86
After 3500 training step(s), cross entropy on batch data is 0.372669, trian accuracy is 0.92, test accuracy is 0.86
After 4000 training step(s), cross entropy on batch data is 0.745511, trian 

同样是 反向传播 + minibatch，但不知道为啥，AdamOptimizer 相比梯度要好很多。