In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import operator
import numpy as np
import pandas as pd
import multiprocessing
import tensorflow as tf

In [3]:
from collections import Counter

直接导入之前清理好的数据

In [4]:
df = pd.read_pickle('data/cleared_data.pkl')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35124 entries, 0 to 35123
Data columns (total 6 columns):
review           35124 non-null object
sentiment        35124 non-null object
cut_words        35124 non-null object
cleared_words    35124 non-null object
counter          35124 non-null object
words_count      35124 non-null int64
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


In [6]:
def other_multiprocessing(df, func, workers):

    chunk_size = int(df.shape[0] / workers)
    chunks = (df.ix[df.index[i:i + chunk_size]] for i in range(0, df.shape[0], chunk_size))

    pool = multiprocessing.Pool(processes=4)
    result = pool.map(func, chunks)
    return result


def sum_func(d):
    return d.counter.sum()

In [7]:
%%time
result = other_multiprocessing(df, sum_func, workers=4)
counter_sum = np.sum(np.asarray(result))

CPU times: user 2.06 s, sys: 460 ms, total: 2.52 s
Wall time: 2min 11s


高频截断，出现次数 10 次以下的词作为 unknown 词，同时用此计算 vocabulary_size

In [8]:
d = {i: j for i, j in counter_sum.items() if j > 10}

In [9]:
sorted_dict = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [10]:
vocabulary_size = len(sorted_dict)
vocabulary_size

7920

In [11]:
sentences = df.cleared_words.tolist()

In [12]:
def build_dataset(word_counts):
    # Build word dictionary
    count = [['UNK', -1]]
    count.extend(word_counts)
    word_dict = {}
    for word, _ in count:
        word_dict[word] = len(word_dict)

    # Build reversed dictionary
    reversed_dict = {j: i for i, j in word_dict.items()}

    return word_dict, reversed_dict

In [13]:
word_dict, reversed_dict = build_dataset(sorted_dict)

In [14]:
def word_to_number(sentences, word_dict):
    # Word to number
    data = []
    for sentence in sentences:
        sentence_data = []
        for word in sentence:
            if word in word_dict:
                index = word_dict[word]
            else:
                index = 0
            sentence_data.append(index)
        data.append(sentence_data)
    return data

In [15]:
data = word_to_number(sentences, word_dict)

In [16]:
df['word_to_number'] = np.asarray(data)

### 设定数据集

In [17]:
df = df[df.words_count > 2]  # 至少能形成一个 windows_size
train = df[(df.sentiment == 1) | (df.sentiment == 0)]
test  = df[~df.index.isin(train.index)].copy()

In [18]:
test['sentiment'] = test.sentiment.map(lambda x: int(x[0]))

In [19]:
text_data_train = train.word_to_number.values
text_data_test = test.word_to_number.values

target_train = train.sentiment.values
target_test = test.sentiment.values

In [20]:
df.words_count.describe()

count    34051.000000
mean        24.038648
std         24.401450
min          3.000000
25%          9.000000
50%         16.000000
75%         31.000000
max        894.000000
Name: words_count, dtype: float64

大部分评论是在 100 字以下，所以为了更好的训练模型，设定句子长度为 100，不足 100 以 0 填充:

* 利用 list 相加会合并的特性
* array + list 也有上述特性

In [21]:
max_words = 100

In [22]:
text_data_train = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_train]])
text_data_test = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_test]])

In [23]:
text_data_test

array([[  95,  946,   67, ...,    0,    0,    0],
       [   1,   81,  279, ...,    0,    0,    0],
       [  60,  393,   36, ...,    0,    0,    0],
       ..., 
       [2147,  261,  645, ...,    0,    0,    0],
       [1108,   10,  251, ...,    0,    0,    0],
       [2460, 1055,  531, ...,    0,    0,    0]])

In [24]:
np.random.randint(0, 10000)

8885

In [25]:
target_train.shape

(23864,)

In [26]:
tst_text_train = ['酒店 不好'.split(),
                  '很 失望'.split(),
                  '床 好'.split(),
                  '非常 棒'.split(),]

In [27]:
tst_text_test = ['棒 棒'.split(),
                  '我 失望'.split(),
                  '这个 好'.split()]

In [28]:
tst_target_train = np.array([0, 0, 1, 1]).reshape((4, 1))

In [29]:
tst_text_data_test = word_to_number(tst_text_test, word_dict)

In [30]:
tst_text_data_test

[[1424, 1424], [0, 137], [0, 0]]

In [31]:
tst_target_train

array([[0],
       [0],
       [1],
       [1]])

In [32]:
tst_text_data_trian = word_to_number(tst_text_train, word_dict)

In [33]:
tst_text_data_trian

[[1, 51], [0, 137], [347, 0], [14, 1424]]

In [34]:
tst_target_train.shape[0]

4

In [35]:
tst_target_data_train = np.zeros(shape=(tst_target_train.shape[0], 2))

In [36]:
tst_target_data_train[:, 1] = tst_target_train[:, 0]  # 用 2 维的方式表示 (0, 1)，0 -> (1, 0); 1 -> (0, 1)

In [37]:
tst_target_data_train[:, 0] = np.where(tst_target_data_train[:, 1]==0, 1, 0)  # 第 2 列跟 1 维表示时是一样的，第 1 列刚好相反

In [38]:
tst_target_data_train

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [40]:
# 定义一个判断 accuracy 的函数
def accuracy(preds, labels):
    preds = preds[:, 1]  # 是第 2 列判断即可
    labels = labels[:, 1]
    return ((preds > 0.5) == labels).sum() / float(len(labels))

In [41]:
tst_text_data_test

[[1424, 1424], [0, 137], [0, 0]]

In [42]:
tst_target_data_test = np.array([
    [0, 1],
    [1, 0],
    [0, 1],
])

In [43]:
tst_target_data_train

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [45]:
word_embedding_dim = 128
# 决定了词表数量, 预留一个未登录词
vocab_size = 7920
UNK_IDX = 0

In [46]:
word_embedding = tf.word_embedding = tf.Variable(tf.random_uniform([vocab_size, word_embedding_dim]))

In [47]:
input_data = tf.placeholder(tf.int32, shape=[None, 2], name='input_data')
input_embeds = tf.nn.embedding_lookup(word_embedding, input_data)

In [48]:
input_embeds

<tf.Tensor 'embedding_lookup:0' shape=(?, 2, 128) dtype=float32>

In [49]:
context_embeds = tf.reduce_sum(input_embeds, axis=1)

In [50]:
context_embeds

<tf.Tensor 'Sum:0' shape=(?, 128) dtype=float32>

In [51]:
raw_output = tf.layers.dense(context_embeds, 2)

In [52]:
raw_output

<tf.Tensor 'dense/BiasAdd:0' shape=(?, 2) dtype=float32>

In [53]:
output = tf.nn.softmax(raw_output)

In [54]:
output

<tf.Tensor 'Softmax:0' shape=(?, 2) dtype=float32>

In [55]:
# 样本的 labels 也需要用 placeholder 放置
labels = tf.placeholder(tf.int32, shape=[None, 2], name='labels')

# 因为我们每个样本的 label 只有一个，使用稠密的 softmax 算 cost 及求导太浪费了。这里使用 sparse 版本即可。
# 如果你的 label 是完整的 N 个词上的概率分布，这时候可以使用 tf.nn.softmax_cross_entropy_with_logits
# cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=raw_output, labels=labels)
cost = tf.nn.softmax_cross_entropy_with_logits(logits=raw_output, labels=labels)

In [60]:
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dummy_feed_dict = {input_data: tst_text_data_trian,
                       labels: tst_target_data_train}
    for i in range(100):
        sess.run(train_step, feed_dict=dummy_feed_dict)
        if i % 20 == 0:
            print("Iteration %d" % i)
            print("Cost: %f" % cost.eval(feed_dict=dummy_feed_dict)[0])
            # 查看输出中 ID == 3 的概率
            total_cross_entropy = cost.eval(feed_dict=dummy_feed_dict)[0]
            output_tst = output.eval(feed_dict=dummy_feed_dict)
            output_test = output.eval(feed_dict={input_data: tst_text_data_test})
            train_accuracy = accuracy(output_tst, tst_target_data_train) 
            test_accuracy = accuracy(output_test, tst_target_data_test)
            print("After %d training step(s), cross entropy on all data is "
                  "%3f, trian accuracy is %.2f, test accuracy is %.2f" % (
                      i, total_cross_entropy, train_accuracy, test_accuracy))
#             print("The ouput shape is {}".format(output.eval(feed_dict=dummy_feed_dict).shape))
            print("------")

Iteration 0
Cost: 1.799619
After 0 training step(s), cross entropy on all data is 1.799619, trian accuracy is 0.50, test accuracy is 0.67
------
Iteration 20
Cost: 0.080121
After 20 training step(s), cross entropy on all data is 0.080121, trian accuracy is 1.00, test accuracy is 0.67
------
Iteration 40
Cost: 0.041444
After 40 training step(s), cross entropy on all data is 0.041444, trian accuracy is 1.00, test accuracy is 0.67
------
Iteration 60
Cost: 0.027410
After 60 training step(s), cross entropy on all data is 0.027410, trian accuracy is 1.00, test accuracy is 0.67
------
Iteration 80
Cost: 0.020279
After 80 training step(s), cross entropy on all data is 0.020279, trian accuracy is 1.00, test accuracy is 0.67
------


In [62]:
output_test

array([[ 0.04456415,  0.95543581],
       [ 0.95574987,  0.04425012],
       [ 0.68150365,  0.31849635]], dtype=float32)