# 使用tensorflow构建京东评论分类系统

## 1.爬取淘宝评论数据

### 1.1爬取代码

In [19]:
import json
import requests

# get the comments
def get_comments(url):
    comments = []
    resp = requests.get(url)
    resp.encoding = 'gbk'
    if resp.status_code != 200:
        return []
    content = resp.text
    if content:
        comment_infos = json.loads(content[content.find('(')+1:-2])['comments']
        for comment_info in comment_infos:
            comment_content = comment_info['content']
            comments.append(comment_content.encode('utf-8')+'\n')
    return comments

# save comments
def save_comments(comments, type='good'):
    with open(type+'.txt', 'w') as fw:
        fw.writelines(comments)

### 1.2好评论

In [46]:
good_comments = []
good_comment_url_template = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv8914&productId=10359162198&score=3&sortType=5&page={}&pageSize=10&isShadowSku=0'

# good comments
for i in range(1000):
    url = good_comment_url_template.format(i)
    good_comments += get_comments(url)
save_comments(good_comments, type='good')

### 1.3坏评论

In [50]:
bad_comments = []
bad_comment_url_templates = [
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv8914&productId=10359162198&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv73&productId=10968941641&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'http://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4653&productId=10335204102&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv1&productId=1269194114&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2777&productId=1409704820&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv174&productId=10103790891&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv9447&productId=1708318938&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0',
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv111&productId=10849803616&score=1&sortType=5&page={}&pageSize=10&isShadowSku=0'
]

# bad comments
for bad_comment_url_template in bad_comment_url_templates:
    for i in range(80):
        url = bad_comment_url_template.format(i)
        bad_comments += get_comments(url)
    save_comments(bad_comments, type='bad')

## 2.数据整理

### 2.1创建词典

In [1]:
import jieba
from collections import Counter

good_file = './good.txt'
bad_file  = './bad.txt'

def create_dictionary(good_file, bad_file, lower=50, upper=1000):
    
    all_words = []
    with open(good_file, 'r') as fr:
        for idx, line in enumerate(fr):
            words = jieba.lcut(line)
            all_words += words
    print '{0} has {1} lines, {2} words.'.format(good_file, idx+1, len(all_words))
    
    count = len(all_words)
    with open(bad_file, 'r') as fr:
        for idx, line in enumerate(fr):
            words = jieba.lcut(line)
            all_words += words
    print '{0} has {1} lines, {2} words.'.format(bad_file, idx+1, len(all_words)-count)
    
    dict = []
    cnt = Counter(all_words)
    for word, freq in cnt.iteritems():
        if lower <= freq <= upper:
            dict.append(word)
    print 'dict size {}'.format(len(dict))
    return dict

dict = create_dictionary(good_file, bad_file)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.072 seconds.
Prefix dict has been built succesfully.


./good.txt has 4450 lines, 55390 words.
./bad.txt has 4345 lines, 64872 words.
dict size 272


### 2.2词袋模型

In [2]:
import numpy as np
import cPickle as pickle

def wordbag():
    
    def line2vec(dict, line, cls):
        words = jieba.lcut(line)
        features = np.zeros(len(dict))
        for word in words:
            if word in dict:
                features[dict.index(word)] = 1
        return [features, cls]
    
    dataset = []
    with open('good.txt', 'r') as fr:
        for line in fr:
            dataset.append(line2vec(dict, line, [0, 1]))
    print 'positive dataset size {}'.format(len(dataset))
   
    count = len(dataset)
    
    with open('bad.txt', 'r') as fr:
        for line in fr:
            dataset.append(line2vec(dict, line, [1, 0]))
    print 'negtive dataset size {}'.format(len(dataset)-count)
    
    print 'dataset length {}'.format(len(dataset))
    
    return dataset

dataset = wordbag()
# print dataset[1]

np.random.shuffle(dataset)

fp = open('comment_data.pkl', 'wb')
dataset = np.array(dataset)
pickle.dump(dataset, fp)

train_size = int(dataset.shape[0]*0.8)
train_set  = dataset[:train_size]
test_set   = dataset[train_size:]

print 'train_set size {}'.format(train_set.shape[0])
print 'test_set size {}'.format(test_set.shape[0])

positive dataset size 4450
negtive dataset size 4345
dataset length 8795
train_set size 7036
test_set size 1759


## 3.定义前馈(feed forward)神经网络训练评论数据

### 3.0引入依赖包

In [3]:
import tensorflow as tf

### 3.1定义神经网络参数

In [4]:
# 定义每一层神经元的个数
"""
层数的选择：线性数据使用1层，非线性数据使用2册, 超级非线性使用3+册。层数／神经元过多会导致过拟合
"""
n_input_layer  = len(dict) # 输入层
n_hidden_layer_1 = 30 # hidden layer 1
n_hidden_layer_2 = 20 # hidden layer 2
n_output_layer = 2 # 输出层

W_xh = tf.Variable(tf.random_normal([n_input_layer, n_hidden_layer_1]))
b_h1  = tf.Variable(tf.random_normal([n_hidden_layer_1]))

W_hh = tf.Variable(tf.random_normal([n_hidden_layer_1, n_hidden_layer_2]))
b_h2  = tf.Variable(tf.random_normal([n_hidden_layer_2]))

W_ho = tf.Variable(tf.random_normal([n_hidden_layer_2, n_output_layer]))
b_o  = tf.Variable(tf.random_normal([n_output_layer]))

batch_size = 100

# 定义实际输入和输出数据
X = tf.placeholder('float', [None, len(dict)])
Y = tf.placeholder('float', [None, 2])

### 3.2定义神经网络模型

In [5]:
def neural_network(x):
    
    hidden_layer_1_output = tf.matmul(x, W_xh) + b_h1
    hidden_layer_1_activate = tf.nn.sigmoid(hidden_layer_1_output)  # 激活函数
    
    hidden_layer_2_output = tf.matmul(hidden_layer_1_activate, W_hh) + b_h2
    hidden_layer_2_output = tf.nn.sigmoid(hidden_layer_2_output)
    
    output = tf.matmul(hidden_layer_2_output, W_ho) + b_o
    output = tf.nn.softmax(output)
    return output

### 3.3训练

In [6]:
def train_neural_network(x, y):
    
    predict       = neural_network(x)
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y*tf.log(predict), reduction_indices=[1]))
    train_step    = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy) # 学习率改为0.1，加速梯度下降
    init          = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        sess.run(init)
        epochs = 5000
        np.random.shuffle(train_set)
        
        train_x = train_set[:, 0].tolist()
        train_y = train_set[:, 1].tolist()

        test_x = test_set[:, 0].tolist()
        test_y = test_set[:, 1].tolist()

        for epoch in range(epochs):
            i  = 0
            while i < len(train_x):
                start = i
                end   = i + batch_size
                batch_x, batch_y = train_x[start:end], train_y[start:end]
                loss = sess.run(train_step, feed_dict={x: batch_x, y: batch_y})
                i += end
                
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print '准确率：', sess.run(accuracy, feed_dict={x: test_x, y: test_y})

train_neural_network(X, Y)

准确率： 0.807277
