In [1]:
import os
import re
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt
import tensorflow as tf



In [2]:
## 加载数据集
def load_dataset(name, nrows=None):
    datasets = {
        'labeled_train': 'train_first.csv',
        'test': 'predict_first.csv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'ccf_data', datasets[name])
    df = pd.read_csv(data_file, sep=',', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

In [3]:
df1 = load_dataset('labeled_train').Discuss
df2 = load_dataset('test').Discuss
df_unlabeled = df1.append(df2)
df_unlabeled.index = range(len(df_unlabeled)) # 重新整理下index
print(df_unlabeled.shape)

Number of reviews: 100000
Number of reviews: 30000
(130000,)


## 2.读入之前的word2vec模型

In [4]:
## .model文件
model_name = '200features_5minwords_5context.model'
model = Word2Vec.load(os.path.join('..', 'ccf_models', model_name))

In [5]:
# 这个model是之前用gensim包生成的word2vec(词频小于5的都过滤掉了)
embedding_matrix = model.wv.vectors
embedding_matrix.shape

(28539, 200)

## 3.用RNN构建分类器 [这里用的是Tensorflow LSTM模型]

In [6]:
# Start a graph
sess = tf.Session()

# Set RNN parameters
epochs = 10
batch_size = 5000
max_sequence_length = 180
rnn_size = 100
lstmUnits = 100
numClasses = 5
embedding_size = 200
VALIDATION_SPLIT = 0.75
min_word_frequency = 5
learning_rate = 0.001
dropout_keep_prob = tf.placeholder(tf.float32)

### 将gensim生成的word2vec变成字典的形式 比如{'不错':[0.02,0.1,....0.09]}

In [7]:
word_vec = {}
for word in model.wv.vocab:
    word_vec[word] = model[word]

  app.launch_new_instance()


In [8]:
## 然后把改字典处理一下，生成tensorflow需要的样子
# vocab为词列表（按顺序）
# embed为vocab列表中每个词对应的词向量
vocab = []
embed = []
for key,value in word_vec.items():
    vocab.append(key)
    embed.append(value)
vocab = np.asarray(vocab)
embedding = np.asarray(embed,dtype='float32')

In [9]:
####### 这里embedding头部在加一行，和keras不同，这里索引0也得自己定义词向量
## 刚开始后面编译的时候会报错，因为embedding_mat是float64而不是float32 !!!!!!
embedding = np.row_stack((np.random.uniform(low=-1,high=1,size=200),embedding))
embedding = np.asarray(embedding,dtype='float32')

### 数据准备

In [10]:
texts = []
labels = []
def split_clean_text2(text, remove_stopwords=False):
    text = re.sub(r'。|！|？|～',' ',text.lower().strip())
    text = re.sub(r'good|nice|excellent|beautiful','不错',text)
#     text = BeautifulSoup(text, 'html.parser').get_text()
    words = list(jieba.cut(text))
    words = [re.sub(r'[^0-9\u4E00-\u9FA5]+','',s1).replace(' ','') for s1 in words]
    words = [s2 for s2 in words if s2]
    if remove_stopwords:
        words = [w for w in words if w not in chn_stopwords]
    return ' '.join(words)

In [11]:
# texts和labels准备 texts包含测试数据转索引使用
# 这里的texts包含了训练和测试数据，每句话转成索引以后，再将训练和测试切分开
texts = df_unlabeled.apply(split_clean_text2)
df1_score = load_dataset('labeled_train').Score
df1_score = df1_score - 1 # tricky
labels = np.array(pd.get_dummies(df1_score))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\16121360\AppData\Local\Temp\11\jieba.cache
Loading model cost 1.097 seconds.
Prefix dict has been built succesfully.


Number of reviews: 100000


In [12]:
# 跑二分类的时候用
# labels = np.where(df1_score > 2,1,0)
# 跑多分类的时候用
labels = np.array(df1_score)

### 将每句话的单词变成索引

In [13]:
# Change texts into numeric vectors
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sequence_length)
# text_processed = np.array(list(vocab_processor.fit_transform(texts)))
# fit the vocab from gensim
pretrain = vocab_processor.fit(vocab)
# transform inputs
text_processed = np.array(list(vocab_processor.transform(texts)))

In [14]:
# 训练和测试转成索引以后进行分割
x_train_data = text_processed[:100000]
x_test = text_processed[100000:]

In [15]:
# Shuffle and split data
shuffled_ix = np.random.permutation(np.arange(len(labels)))
x_shuffled = x_train_data[shuffled_ix]
y_shuffled = labels[shuffled_ix]

# Split train/test set
ix_cutoff = int(len(y_shuffled)*VALIDATION_SPLIT)
x_train, x_val = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
y_train, y_val = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]
vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))
print("75-25 Train Test split: {:d} -- {:d}".format(len(y_train), len(y_val)))
del x_shuffled
del y_shuffled

Vocabulary Size: 28540
75-25 Train Test split: 75000 -- 25000


In [16]:
# Create placeholders
x_data = tf.placeholder(tf.int32, [None, max_sequence_length])
# y_output = tf.placeholder(tf.int32, [None,y_train.shape[1]]) # 这里y_train.shape[1]=5指的是label有5类，是经过one-hot的
y_output = tf.placeholder(tf.int32, [None])

In [17]:
## 将embedding加载到tf中
embedding_mat = tf.Variable(embedding)
embedding_output = tf.nn.embedding_lookup(embedding_mat, x_data)

In [48]:
# Define the BasicRNN cell 二分类
#tensorflow change >= 1.0, rnn is put into tensorflow.contrib directory. Prior version not test.
if tf.__version__[0]>='1':
    cell=tf.contrib.rnn.BasicRNNCell(num_units = rnn_size)
else:
    cell = tf.nn.rnn_cell.BasicRNNCell(num_units = rnn_size)
output, state = tf.nn.dynamic_rnn(cell, embedding_output, dtype=tf.float32)
output = tf.nn.dropout(output, dropout_keep_prob)

# Get output of RNN sequence 
# 下面两行代码的意思先进行一个dimension的转换，然后一句话最后一个词那里进行输出，比如max_sequence_length=25那么第25个的时候（索引是24）进行输出y_，
output = tf.transpose(output, [1, 0, 2])
last = tf.gather(output, int(output.get_shape()[0]) - 1)
#-------------------------------------------------------

weight = tf.Variable(tf.truncated_normal([rnn_size, 2], stddev=0.1)) # 二分类
bias = tf.Variable(tf.constant(0.1, shape=[2]))  # 二分类
logits_out = tf.matmul(last, weight) + bias

# Loss function 二分类
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_out, labels=y_output) # logits=float32, labels=int32
loss = tf.reduce_mean(losses)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits_out, 1), tf.cast(y_output, tf.int64)), tf.float32))

optimizer = tf.train.RMSPropOptimizer(learning_rate)
train_step = optimizer.minimize(loss)

In [18]:
# Define the BasicLSTM cell 多分类
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.25)
output, state = tf.nn.dynamic_rnn(lstmCell, embedding_output, dtype=tf.float32)

# Get output of RNN sequence 
# 下面两行代码的意思先进行一个dimension的转换，然后一句话最后一个词那里进行输出，比如max_sequence_length=25那么第25个的时候（索引是24）进行输出y_，
output = tf.transpose(output, [1, 0, 2])   # 维度(max_sequence_length,?,rnn_size)
last = tf.gather(output, int(output.get_shape()[0]) - 1) # 维度(?,rnn_size)
#-------------------------------------------------------

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))  # 维度 (rnn_size,num_classes)
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))  # 维度 (num_classes,)
logits_out = (tf.matmul(last, weight) + bias)

# Loss function 多分类
# 注: tf.nn.sparse_softmax_cross_entropy_with_logits这个函数输入的labels [5,4,1,2,...,5,4,3]是没有one-hot的，函数本身会进行one-hot
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_out, labels=y_output) # logits=float32, labels=int32
loss = tf.reduce_mean(losses)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits_out, 1), tf.cast(y_output, tf.int64)), tf.float32))

optimizer = tf.train.RMSPropOptimizer(learning_rate)
train_step = optimizer.minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [19]:
# 开始跑tensorflow
init = tf.global_variables_initializer()
sess.run(init)

train_loss = []
val_loss = []
train_accuracy = []
val_accuracy = []

for epoch in range(epochs):

    # Shuffle training data
    shuffled_ix = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffled_ix]
    y_train = y_train[shuffled_ix]
    num_batches = int(len(x_train)/batch_size) + 1
    # TO DO CALCULATE GENERATIONS ExACTLY
    for i in range(num_batches):
        # Select train data
        min_ix = i * batch_size
        max_ix = np.min([len(x_train), ((i+1) * batch_size)])
        x_train_batch = x_train[min_ix:max_ix]
        y_train_batch = y_train[min_ix:max_ix]
        
        # Run train step
        train_dict = {x_data: x_train_batch, y_output: y_train_batch, dropout_keep_prob:0.5}
        sess.run(train_step, feed_dict=train_dict)
        
    # Run loss and accuracy for training
    temp_train_loss, temp_train_acc = sess.run([loss, accuracy], feed_dict=train_dict)
    train_loss.append(temp_train_loss)
    train_accuracy.append(temp_train_acc)
    
    # Run Eval Step
    val_dict = {x_data: x_val, y_output: y_val, dropout_keep_prob:1.0}
    temp_val_loss, temp_val_acc = sess.run([loss, accuracy], feed_dict=val_dict)
    val_loss.append(temp_val_loss)
    val_accuracy.append(temp_val_acc)
    print('Epoch: {}, val Loss: {:.2}, val Acc: {:.2}'.format(epoch+1, temp_val_loss, temp_val_acc))

Epoch: 1, val Loss: 2.2, val Acc: 0.45
Epoch: 2, val Loss: 1.9, val Acc: 0.55
Epoch: 3, val Loss: 1.4, val Acc: 0.41
Epoch: 4, val Loss: 1.5, val Acc: 0.58
Epoch: 5, val Loss: 1.2, val Acc: 0.39
Epoch: 6, val Loss: 1.1, val Acc: 0.59
Epoch: 7, val Loss: 1.0, val Acc: 0.56
Epoch: 8, val Loss: 1.0, val Acc: 0.59
Epoch: 9, val Loss: 1.0, val Acc: 0.57
Epoch: 10, val Loss: 1.0, val Acc: 0.57


## 4.根据训练好的tensorflow模型进行预测 以下两种任意一种都可以

In [20]:
pred1 = sess.run(logits_out,feed_dict={x_data: x_test, dropout_keep_prob:1.0})
# pred2 = sess.run(tf.nn.softmax(logits_out),feed_dict={x_data: x_test, dropout_keep_prob:1.0})

In [26]:
c = pd.DataFrame(pred1)
d = c.apply(np.argmax,axis=1)

In [30]:
d.value_counts()

4    27340
3     2646
2       14
dtype: int64

In [31]:
df_tmp = load_dataset('test')
output = pd.DataFrame({'Id':df_tmp.Id,'Score':d})
output.to_csv(os.path.join('..', 'ccf_data', 'submit.csv'),sep=',',index=False,header=False)

Number of reviews: 30000
