# 使用tensorflow构建推特评论分类系统

## 1.下载数据集

In [1]:
import os
import urllib2

# Download the tweet sentiment archive and store it next to the notebook.
dataset_url = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'

print('downloading dataset...')
filename = 'trainingandtestdata.zip'

# Close the HTTP response explicitly instead of relying on garbage collection.
response = urllib2.urlopen(dataset_url)
try:
    content = response.read()
finally:
    response.close()

# The context manager guarantees the archive is flushed and closed before
# its size is measured below.
with open(filename, 'wb') as archive:
    archive.write(content)

print('finish, amount {} bytes!'.format(os.path.getsize(filename)))

# NOTE(review): the preprocessing cell reads two CSV files by name, so the
# archive presumably has to be extracted first -- this cell does not do it.

downloading dataset...
finish, amount 81363704 bytes!


## 2.数据集预处理

In [41]:
import csv
import codecs

# Keep only the fields that are needed (label and tweet text).
def preprocess(input_file, output_file):
    """Reduce a raw CSV file to one ``label:%:%:%:tweet`` record per line.

    Every double quote is removed from the input line, the line is split on
    commas, the first field is taken as the label and fields 5 onwards are
    re-joined with commas to form the tweet text.

    Args:
        input_file: path of the raw CSV file to read.
        output_file: path of the text file to write (overwritten).

    Returns:
        None. The number of records written is printed.
    """
    separator = ':%:%:%:'
    written = 0

    # Stream line by line so the whole data set is never held in memory.
    with open(input_file, 'r') as fr, open(output_file, 'w') as fw:
        for line in fr:
            # Strip the line terminator and add exactly one back on output.
            # Otherwise a last line without a newline, or a row with fewer
            # than six fields, would be glued to the next record.
            line = line.rstrip('\r\n')
            if not line:
                continue  # blank input lines carry no record
            items = line.replace('"', '').split(',')
            attitude, tweet = items[0], ','.join(items[5:])
            fw.write(attitude + separator + tweet + '\n')
            written += 1

    print('{0} has {1} lines.'.format(output_file, written))

# Convert the raw training and test CSV files into the
# "label:%:%:%:tweet" text format used by the rest of the notebook.
preprocess('./training.1600000.processed.noemoticon.csv', 'train.txt')
preprocess('./testdata.manual.2009.06.14.csv', 'test.txt')

train.txt has 1600000 lines.
test.txt has 498 lines.


## 3. 创建词汇表

In [None]:
import codecs
import cPickle as pickle
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Preprocessed training set (the 'train.txt' written by preprocess()).
data_file = './train.txt'

def create_dictionary(data_file, start_rate=0.2, size=112):
    """Build a vocabulary from the tweets in a preprocessed data file.

    Each line is expected to look like ``label:%:%:%:tweet``; lines without
    the separator are ignored. Tweets are lower-cased, tokenized and
    lemmatized, then words are ranked by frequency. The most frequent
    ``start_rate`` fraction of distinct words is skipped and the next
    ``size`` words are kept.

    Args:
        data_file: path of the preprocessed text file.
        start_rate: fraction of the frequency ranking to skip from the top.
        size: maximum number of words to keep.

    Returns:
        A list of at most ``size`` words, most frequent first.
    """
    lemmatizer = WordNetLemmatizer()
    word_counts = Counter()
    line_count = 0
    word_total = 0

    with open(data_file) as fr:
        for line in fr:
            line_count += 1
            # maxsplit=1 keeps the whole tweet even if it contains the separator.
            items = line.split(':%:%:%:', 1)
            if len(items) < 2:
                continue
            words = word_tokenize(items[1].decode('latin-1').lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            # Count incrementally instead of collecting every token of the
            # corpus in one list first.
            word_counts.update(words)
            word_total += len(words)

    # line_count is tracked explicitly so an empty file reports 0 lines
    # instead of failing on an undefined loop variable.
    print('{0} has {1} lines, {2} words.'.format(data_file, line_count, word_total))

    ranked = word_counts.most_common()
    candidates = ranked[int(len(ranked) * start_rate):]
    vocabulary = [word for word, _ in candidates[:max(size, 0)]]

    print('dict size {}'.format(len(vocabulary)))
    return vocabulary

# Build the vocabulary and persist it for the later cells.
# NOTE: the global name `dict` shadows the builtin, but other cells read it,
# so it is kept as is.
dict = create_dictionary(data_file, start_rate=0.2, size=112)
# The context manager closes the file even if pickling fails.
with open('tweet_dict.pkl', 'wb') as fp:
    pickle.dump(dict, fp)

## 4.生成数据

In [2]:
import codecs
import random
import numpy as np
import cPickle as pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the vocabulary produced by the previous section. The file was written
# in binary mode, so it is read in binary mode too, and closed afterwards.
# NOTE: only unpickle files this notebook created itself; pickle.load can
# execute arbitrary code from an untrusted file.
with open('tweet_dict.pkl', 'rb') as fp:
    dict = pickle.load(fp)
print(len(dict))
def next_batch(data, batch_size):
    """Draw a random batch of lines and turn it into (features, label) pairs.

    Args:
        data: list of ``label:%:%:%:tweet`` lines.
        batch_size: number of lines to sample without replacement
            (``random.sample`` raises ValueError if it exceeds ``len(data)``).

    Returns:
        An object array of shape ``(n, 2)``. Column 0 holds a bag-of-words
        vector over the global vocabulary ``dict`` (1 where the word occurs),
        column 1 a 3-element one-hot list: '4' -> [1, 0, 0], '2' -> [0, 1, 0],
        '0' -> [0, 0, 1], anything else -> [0, 0, 0].
        Lines without the separator are skipped, so ``n`` may be smaller
        than ``batch_size``.
    """
    separator = ':%:%:%:'
    one_hot = {'0': [0, 0, 1], '2': [0, 1, 0], '4': [1, 0, 0]}
    lemmatizer = WordNetLemmatizer()

    # Map each word to its first position in the vocabulary once per call,
    # instead of scanning the list for every word of every tweet.
    word_index = {}
    for position, vocab_word in enumerate(dict):
        word_index.setdefault(vocab_word, position)

    def line2vec(line):
        # maxsplit=1 so a tweet that contains the separator still unpacks.
        items = line.split(separator, 1)
        if len(items) < 2:
            return None
        attitude, tweet = items
        words = word_tokenize(tweet.decode('latin-1').lower())
        words = [lemmatizer.lemmatize(word) for word in words]

        # Copy the label so samples never share one list object.
        cls = list(one_hot.get(attitude, [0, 0, 0]))

        features = np.zeros(len(dict))
        for word in words:
            position = word_index.get(word)
            if position is not None:
                features[position] = 1
        return features, cls

    samples = random.sample(data, batch_size)
    rows = [row for row in (line2vec(line) for line in samples) if row is not None]

    # Build the (n, 2) object array explicitly. A malformed line used to put
    # None into the batch, which broke the batch[:, 0] indexing callers do.
    batch = np.empty((len(rows), 2), dtype=object)
    for row_idx, (features, cls) in enumerate(rows):
        batch[row_idx, 0] = features
        batch[row_idx, 1] = cls
    return batch

# Read all preprocessed training lines; the context manager closes the
# file even if reading fails.
with open('./train.txt') as fp:
    train_data = fp.readlines()

112


## 5.定义前馈(feed forward)神经网络训练推特评论数据

### 5.0引入依赖包

In [2]:
import tensorflow as tf

### 5.1定义神经网络参数

In [3]:
# Number of neurons in each layer.
# Rule of thumb for depth: one layer for linear data, two layers for
# non-linear data, three or more for highly non-linear data. Too many
# layers or neurons lead to overfitting.
n_input_layer = len(dict)    # input layer: one unit per vocabulary word
n_hidden_layer_1 = 30        # hidden layer 1
n_hidden_layer_2 = 40        # hidden layer 2
n_output_layer = 3           # output layer: one unit per class

batch_size = 100

# Placeholders for the actual input features and target labels.
X = tf.placeholder('float', [None, n_input_layer])
Y = tf.placeholder('float', [None, n_output_layer])

# Input -> hidden layer 1.
W_xh = tf.Variable(tf.random_normal([n_input_layer, n_hidden_layer_1]))
b_h1 = tf.Variable(tf.random_normal([n_hidden_layer_1]))

# Hidden layer 1 -> hidden layer 2.
W_hh = tf.Variable(tf.random_normal([n_hidden_layer_1, n_hidden_layer_2]))
b_h2 = tf.Variable(tf.random_normal([n_hidden_layer_2]))

# Hidden layer 2 -> output layer.
W_ho = tf.Variable(tf.random_normal([n_hidden_layer_2, n_output_layer]))
b_o = tf.Variable(tf.random_normal([n_output_layer]))

### 5.2定义网络模型

In [4]:
def NeuralNetwork(x):
    """Feed-forward network: two sigmoid hidden layers and a softmax output.

    Uses the module-level weights W_xh/W_hh/W_ho and biases b_h1/b_h2/b_o.

    Args:
        x: input tensor that is matrix-multiplied with W_xh.

    Returns:
        The softmax of the output layer (probabilities, not logits).
    """
    activation = x
    # Both hidden layers share the same shape: affine transform + sigmoid.
    for weights, bias in ((W_xh, b_h1), (W_hh, b_h2)):
        activation = tf.nn.sigmoid(tf.matmul(activation, weights) + bias)

    logits = tf.matmul(activation, W_ho) + b_o
    return tf.nn.softmax(logits)

### 5.3训练

In [None]:
import numpy as np

def train_neural_network(x, y, max_steps=None, eval_interval=100):
    """Train the feed-forward network and checkpoint the best model.

    Mini-batches are sampled from './train.txt'. Accuracy on the whole of
    './test.txt' is measured whenever the step counter exceeds
    ``eval_interval``, and the session is saved to 'model.ckpt' each time
    the accuracy improves.

    Args:
        x: placeholder fed with the bag-of-words features.
        y: placeholder fed with the one-hot labels.
        max_steps: number of training steps to run; ``None`` (default) keeps
            the original behaviour of training until interrupted.
        eval_interval: counter threshold between two evaluations.

    Returns:
        The best test accuracy seen (only reachable when ``max_steps`` is set).
    """
    probabilities = NeuralNetwork(x)
    # NeuralNetwork already applies softmax. Clip before the log so a
    # probability of exactly 0 cannot turn the loss into inf/NaN.
    safe_probabilities = tf.clip_by_value(probabilities, 1e-10, 1.0)
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(safe_probabilities), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

    # Build the accuracy ops once. Creating them inside the loop adds new
    # nodes to the graph at every evaluation.
    correct = tf.equal(tf.argmax(probabilities, 1), tf.argmax(y, 1))
    accuracy_op = tf.reduce_mean(tf.cast(correct, 'float'))

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with open('./train.txt') as fp:
        train_data = fp.readlines()
    with open('./test.txt') as fp:
        test_data = fp.readlines()

    test_set = next_batch(test_data, len(test_data))
    test_x = test_set[:, 0].tolist()
    test_y = test_set[:, 1].tolist()

    pre_accuracy = 0.0
    with tf.Session() as sess:
        sess.run(init)
        steps_done = 0
        steps_since_eval = 0
        while max_steps is None or steps_done < max_steps:
            mini_batch = next_batch(train_data, batch_size)
            batch_x = mini_batch[:, 0].tolist()
            batch_y = mini_batch[:, 1].tolist()

            sess.run(train_step, feed_dict={x: batch_x, y: batch_y})
            steps_done += 1

            if steps_since_eval > eval_interval:
                accuracy = sess.run(accuracy_op, feed_dict={x: test_x, y: test_y})
                if accuracy > pre_accuracy:  # keep the most accurate model
                    print('最佳准确率: ' + ' ' + str(accuracy))
                    pre_accuracy = accuracy
                    saver.save(sess, 'model.ckpt')  # save the session
                steps_since_eval = 0
            steps_since_eval += 1

    return pre_accuracy

# Start training on the placeholders defined above. As called here, the
# training loop has no stopping condition and runs until interrupted.
train_neural_network(X, Y)

最佳准确率:  0.365462
最佳准确率:  0.36747
最佳准确率:  0.375502
最佳准确率:  0.383534
最佳准确率:  0.395582
最佳准确率:  0.405622
最佳准确率:  0.411647
最佳准确率:  0.423695
最佳准确率:  0.427711
最佳准确率:  0.429719
最佳准确率:  0.431727
最佳准确率:  0.437751
最佳准确率:  0.441767
最佳准确率:  0.449799
最佳准确率:  0.455823
最佳准确率:  0.463855
最佳准确率:  0.465863
最佳准确率:  0.467871
最佳准确率:  0.477912
最佳准确率:  0.481928
最佳准确率:  0.483936
最佳准确率:  0.485944
最佳准确率:  0.491968
最佳准确率:  0.493976
最佳准确率:  0.497992
最佳准确率:  0.5
最佳准确率:  0.504016


### 5.4重用模型

In [30]:
def predict(tweet):
    """Classify one tweet with the model restored from './model.ckpt'.

    The tweet is lower-cased, tokenized, lemmatized and encoded as a
    bag-of-words vector over the global vocabulary ``dict``.

    Args:
        tweet: text to classify.

    Returns:
        None. Prints 'good', 'just so so.' or 'bad'.
    """
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(tweet.lower())]

    features = np.zeros(len(dict))
    for word in words:
        if word in dict:
            features[dict.index(word)] = 1
    features = features.reshape(1, -1)

    inputs = tf.placeholder('float', [None, len(dict)])
    output = NeuralNetwork(inputs)
    predicted_class = tf.argmax(output, 1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Restoring initializes every saved variable, so no separate
        # initializer run is needed.
        saver.restore(sess, './model.ckpt')
        # Evaluate the argmax inside the session. Comparing an unevaluated
        # tensor with an int is never true, which made every tweet 'bad'.
        class_index = int(sess.run(predicted_class, feed_dict={inputs: features})[0])

    # Index order follows the one-hot labels built in next_batch:
    # attitude '4' -> index 0, '2' -> index 1, '0' -> index 2.
    # '4' is taken to be the positive label and '0' the negative one
    # (the data set's documented polarity coding).
    if class_index == 0:
        attitude = 'good'
    elif class_index == 1:
        attitude = 'just so so.'
    else:
        attitude = 'bad'

    print(attitude)

# Try the restored model on a single example sentence.
predict('it is very good.')

bad


## 6.定义CNN(卷积神经网络)训练推特评论数据

### 6.1定义网络参数

In [3]:
import numpy as np
import tensorflow as tf
# Sizes derived from the vocabulary and the label set.
input_size = len(dict)
embedding_size = input_size   # the input X's embedding size
num_classes = 3
batch_size = 100

# One entry per conv layer: (filter_height, filter_width, in_channel, output_channel).
filter_conf = [(5, 5, 1, 32), (5, 5, 32, 64)]

# Session shared by the CNN cells below.
sess = tf.InteractiveSession()

# Inputs, targets and the dropout keep probability are fed at run time.
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
keep_dropout_prob = tf.placeholder(tf.float32)

### 6.2定义CNN模型

In [4]:
def weight_variable(shape):
    """Return a variable of the given shape drawn from a truncated normal (stddev 0.1)."""
    initial_value = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial_value)

def bias_variable(shape):
    """Return a variable of the given shape with every element set to 0.1."""
    initial_value = tf.constant(0.1, shape=shape)
    return tf.Variable(initial_value)

def conv2d(x, W, bias):
    """Stride-1, SAME-padded 2-D convolution of x with W, plus bias, through ReLU."""
    feature_map = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
    pre_activation = feature_map + bias
    return tf.nn.relu(pre_activation)


def max_pool_2x2(x):
    """Apply SAME-padded max pooling with a 2x2 window and a stride of 2."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

def ConvNeuralNetwork(x):
    """Embedding -> (conv + max-pool) per filter_conf entry -> dense -> dropout -> softmax.

    Reads the module-level ``input_size``, ``embedding_size``, ``filter_conf``,
    ``num_classes`` and ``keep_dropout_prob``.

    Args:
        x: integer tensor of shape [batch, input_size] used as embedding indices.

    Returns:
        Softmax probabilities of shape [batch, num_classes].
    """
    # 1. embedding layer
    # NOTE(review): the training cell feeds 0/1 bag-of-words vectors into x,
    # so only rows 0 and 1 of the table are ever looked up -- confirm this
    # is intended; it may explain the accuracy not moving.
    with tf.device('/cpu:0'), tf.name_scope('embedding'):
        Vec_Table = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embeded_vec = tf.expand_dims(tf.nn.embedding_lookup(Vec_Table, x), -1)

    # 2. convolution + maxpool layers
    pool_layer = tf.reshape(embeded_vec, [-1, input_size, embedding_size, 1])
    # Height and width get distinct names. Unpacking both into one name
    # silently dropped the height of a non-square filter.
    for idx, (filter_height, filter_width, in_channel, output_channel) in enumerate(filter_conf):
        with tf.name_scope('conv_maxpool_layer_%d' % idx):
            W = weight_variable([filter_height, filter_width, in_channel, output_channel])
            b = bias_variable([output_channel])
            conv_layer = conv2d(pool_layer, W, b)
            pool_layer = max_pool_2x2(conv_layer)

    # 3. fully-connected layer
    # Take the flattened size from the static shape of the last pooling
    # output. The hard-coded 28*28*64 only matched input_size == 112 with
    # this exact filter_conf.
    flat_size = 1
    for dim in pool_layer.get_shape().as_list()[1:]:
        flat_size *= dim
    W_fc1 = weight_variable([flat_size, 1024])
    b_fc1 = bias_variable([1024])
    pool_flat = tf.reshape(pool_layer, [-1, flat_size])
    fc_output = tf.nn.relu(tf.matmul(pool_flat, W_fc1) + b_fc1)

    # 4. dropout layer
    drop_output = tf.nn.dropout(fc_output, keep_dropout_prob)

    # 5. readout layer
    W_fc2 = weight_variable([1024, num_classes])
    b_fc2 = bias_variable([num_classes])
    output = tf.nn.softmax(tf.matmul(drop_output, W_fc2) + b_fc2)

    return output


### 6.3训练网络

In [None]:
import numpy as np

def train_neural_network(x, y, max_steps=None, eval_interval=100):
    """Train the convolutional network and checkpoint the best model.

    Runs in the module-level session ``sess``. Mini-batches are sampled from
    './train.txt' and trained with a dropout keep probability of 0.5.
    Accuracy on the whole of './test.txt' (keep probability 1.0) is measured
    whenever the step counter exceeds ``eval_interval``, and the session is
    saved to 'model.ckpt' each time the accuracy improves.

    Args:
        x: placeholder fed with the input features.
        y: placeholder fed with the one-hot labels.
        max_steps: number of training steps to run; ``None`` (default) keeps
            the original behaviour of training until interrupted.
        eval_interval: counter threshold between two evaluations.

    Returns:
        The best test accuracy seen (only reachable when ``max_steps`` is set).
    """
    probabilities = ConvNeuralNetwork(x)
    # ConvNeuralNetwork already applies softmax. Clip before the log so a
    # probability of exactly 0 cannot turn the loss into inf/NaN.
    safe_probabilities = tf.clip_by_value(probabilities, 1e-10, 1.0)
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(safe_probabilities), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

    # Build the accuracy ops once. Creating them inside the loop adds new
    # nodes to the graph at every evaluation.
    correct = tf.equal(tf.argmax(probabilities, 1), tf.argmax(y, 1))
    accuracy_op = tf.reduce_mean(tf.cast(correct, 'float'))

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with open('./train.txt') as fp:
        train_data = fp.readlines()
    with open('./test.txt') as fp:
        test_data = fp.readlines()

    test_set = next_batch(test_data, len(test_data))
    test_x = test_set[:, 0].tolist()
    test_y = test_set[:, 1].tolist()

    pre_accuracy = 0.0
    sess.run(init)
    steps_done = 0
    steps_since_eval = 0
    while max_steps is None or steps_done < max_steps:
        mini_batch = next_batch(train_data, batch_size)
        batch_x = mini_batch[:, 0].tolist()
        batch_y = mini_batch[:, 1].tolist()

        sess.run(train_step, feed_dict={x: batch_x, y: batch_y, keep_dropout_prob: 0.5})
        steps_done += 1

        if steps_since_eval > eval_interval:
            accuracy = sess.run(
                accuracy_op, feed_dict={x: test_x, y: test_y, keep_dropout_prob: 1.0})
            print('当前准确率：' + ' ' + str(accuracy))
            if accuracy > pre_accuracy:  # keep the most accurate model
                print('最佳准确率: ' + ' ' + str(accuracy))
                pre_accuracy = accuracy
                saver.save(sess, 'model.ckpt')  # save the session
            steps_since_eval = 0
        steps_since_eval += 1

    return pre_accuracy

# Start CNN training on the placeholders defined above. As called here, the
# training loop has no stopping condition and runs until interrupted.
train_neural_network(X, Y)

当前准确率： 0.365462
最佳准确率:  0.365462
当前准确率： 0.365462
当前准确率： 0.365462
当前准确率： 0.365462
