## code for sentimental analysis(preprocessing)


In [1]:
import pandas as pd
import jieba
from opencc import OpenCC
import re
from collections import Counter
import numpy as np
import pickle

In [2]:
def remove_char(string):
    return re.sub('[a-zA-Z0-9]','',string)

In [3]:
def t2s(string):
    opencc = OpenCC('t2s')
    return opencc.convert(string)

### word to id,id to word

In [4]:
with open('./movie_comments_data.pkl','rb') as f:
    movie_comments = pickle.load(f)

In [5]:
word_count = Counter()
for s in movie_comments['word_sequence']:
    for w in s.split():
        word_count[w] += 1

In [6]:
word_count

Counter({'吴京': 279,
         '意淫': 281,
         '到': 10373,
         '了': 102381,
         '脑残': 319,
         '的': 328466,
         '地步': 197,
         '，': 353805,
         '看': 34250,
         '恶心': 943,
         '想': 7474,
         '吐': 574,
         '首映礼': 42,
         '。': 219514,
         '太': 13043,
         '恐怖': 599,
         '这个': 10282,
         '电影': 34614,
         '不讲道理': 8,
         '完全': 4152,
         '就是': 14015,
         '在': 31161,
         '实现': 270,
         '他': 10663,
         '小': 6649,
         '粉红': 39,
         '英雄': 1706,
         '梦': 885,
         '各种': 3143,
         '装备': 83,
         '轮番': 21,
         '上场': 17,
         '视': 29,
         '物理': 63,
         '逻辑': 1417,
         '于': 1783,
         '不顾': 57,
         '不得不': 670,
         '说': 11120,
         '有钱': 205,
         '真': 5188,
         '好': 23131,
         '随意': 170,
         '胡闹': 45,
         '炒作': 70,
         '水平': 819,
         '不输': 48,
         '冯小刚': 266,
         '但小刚': 1,
       

In [7]:
print('Total words',len(word_count))

vocab_size = 60000

Total words 125794


In [8]:
id_to_word = {}
id_to_word[0] = '<PAD>'

sort_word_fre = sorted(word_count,key=word_count.get,reverse=True)

for i,w in enumerate(sort_word_fre[:vocab_size],1):id_to_word[i] = w

In [9]:
word_to_id = {w:i for i,w in id_to_word.items()}

In [10]:
len(word_to_id)

60001

### padding

In [11]:
def pad_sequence(word_sequence,maxlength):
    pad_sequence = np.zeros((len(word_sequence),maxlength),dtype=np.int32)
    for ri,row in enumerate(word_sequence):
        for ci,w in enumerate(row.split()[:maxlength]):
            pad_sequence[ri,ci] = word_to_id.get(w,0)
    return pad_sequence

In [12]:
sequence = pad_sequence(movie_comments['word_sequence'],200)

### batches

In [13]:
star = movie_comments['star'].values
star = star.astype(np.int32)
sub_one = np.vectorize(lambda x:x-1)
star = sub_one(star)

In [14]:
u,c = np.unique(star,return_counts=True)

In [15]:
for s,n in zip(u,c):print(s,n/sum(c))

0 0.09393332160584947
1 0.10759711503896839
2 0.2511415175874016
3 0.3204968374035351
4 0.22683120836424545


In [16]:
onehot_star = np.eye(5,dtype=np.int32)[star]

In [17]:
def get_sequence_batches(datasets,labels,batch_size):
    assert datasets.shape[0] == labels.shape[0]
    batch_num = datasets.shape[0] // batch_size
    datasets_cut = datasets[:batch_num*batch_size,:]
    last_datasets = datasets[batch_num*batch_size:,:]
    labels_cut = labels[:batch_num*batch_size,:]
    last_labels = labels[batch_num*batch_size:,:]
    
    for i in range(batch_num+1):
        if i == batch_num:
            #yield last_datasets,last_labels
            break
        x = datasets_cut[i*batch_size:(i+1)*batch_size,:]
        y = labels_cut[i*batch_size:(i+1)*batch_size,:]
        yield x,y

In [18]:
del star

## split the data

In [19]:
split_frac = 0.8

from sklearn.utils import shuffle

sequence,onehot_star = shuffle(sequence,onehot_star,random_state=42)

split_idx = int(len(sequence)*0.8)

train_x, val_x = sequence[:split_idx], sequence[split_idx:]
train_y, val_y = onehot_star[:split_idx], onehot_star[split_idx:]

test_idx = int(len(val_x)*0.5)

val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print('train_x shape:', train_x.shape)
print('val_x shape:', val_x.shape)
print('test_x shape:', test_x.shape)

train_x shape: (209195, 200)
val_x shape: (26149, 200)
test_x shape: (26150, 200)


## hyper params

In [20]:
max_feature = 200
embed_size = 300
lstm_size = 32
lstm_layers = 2
dropout_rate = 0.5
batch_size = 128
learning_rate = 0.001
epochs = 50

## load embeddings

In [21]:
pretrianed_wv = np.load('./pretrained_wv.npy')

print(word_not_in_pretrain)

print('word not in pretrained vocab: %.2f' % (count_not_in_pretrain/vocab_size))

## code for sentimental analysis(construct model keras)

In [36]:
import keras
from keras.layers import Input,Embedding,Dense,CuDNNLSTM,LSTM,Bidirectional,Lambda,dot,Activation,concatenate,Dropout,Masking
from keras.models import Model,Sequential
from keras.optimizers import adam
from keras.initializers import Constant
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
from keras import backend as K

In [37]:
K.clear_session()

In [38]:
callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.00005, patience=3, verbose=0,)]

In [39]:
def attention_3d_blocks(hidden_states,max_feature=200,attention_size=128):
    hidden_size = int(hidden_states.shape[2])
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    attention_weights = Activation('softmax', name='attention_weight')(score)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    attention_vector = Dense(attention_size, use_bias=False, activation='tanh',name='attention_vector')(pre_activation)

    return attention_vector

In [40]:
inputs = Input(shape=(max_feature,))


weights = Constant(value=pretrianed_wv)
embed_layer = Embedding(vocab_size+1,embed_size,embeddings_initializer=weights)
embed_layer.trainable = True

embed = embed_layer(inputs)
#embed = Masking(mask_value=pretrianed_wv[0])(embed)

biLSTM_layer1 = Bidirectional(CuDNNLSTM(lstm_size,return_sequences=True,kernel_regularizer=l2(0.002),recurrent_regularizer=l2(0.002)))(embed)
biLSTM_layer1 = Dropout(dropout_rate)(biLSTM_layer1)

biLSTM_layer2 = Bidirectional(CuDNNLSTM(lstm_size,return_sequences=True,kernel_regularizer=l2(0.002),recurrent_regularizer=l2(0.002)))(biLSTM_layer1)
biLSTM_layer2 = Dropout(dropout_rate)(biLSTM_layer2)


attention_mul = attention_3d_blocks(biLSTM_layer2)

out_ = Dense(5, activation='softmax')(attention_mul)

In [41]:
sentiment_model = Model(inputs=inputs,outputs=out_)
sentiment_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     18000300    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 64)      85504       embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 200, 64)      0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
bidirectio

In [28]:
opt = adam(lr=learning_rate)

In [29]:
sentiment_model.compile(loss='categorical_crossentropy',
                        optimizer=opt,
                        metrics=['accuracy'])

In [30]:
history = sentiment_model.fit(train_x,train_y,batch_size=batch_size,
                             epochs=epochs,
                              callbacks=callbacks,
                             validation_data=(val_x,val_y),
                             verbose=1)

Train on 209195 samples, validate on 26149 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
 13184/209195 [>.............................] - ETA: 29:05 - loss: 0.9681 - acc: 0.6056

KeyboardInterrupt: 

In [31]:
score,acc = sentiment_model.evaluate(test_x,test_y,
                                    batch_size=batch_size)
print('Test score:', score)

print('Test accuracy', acc)

Test score: 1.5843035610136977
Test accuracy 0.5173996175931015


In [32]:
sentiment_model.save_weights('./checkpoints/keras_biatten_w.h5')

## Text CNN model

In [37]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.layers import *
from keras.models import Model

In [39]:
tcnn_inputs = Input(shape=(max_feature,))



weights = Constant(value=pretrianed_wv)
embed_layer = Embedding(vocab_size+1,embed_size,embeddings_initializer=weights)
embed_layer.trainable = True

embed = embed_layer(inputs)

bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(embed)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
bigram_branch = Dropout(dropout_rate)(bigram_branch)

trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(embed)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
trigram_branch = Dropout(dropout_rate)(trigram_branch)

fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(embed)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
fourgram_branch = Dropout(dropout_rate)(fourgram_branch)


merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)
merged = BatchNormalization()(merged)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
outmerge = Dense(5,activation='softmax')(merged)

In [39]:
tcnn_model = Model(inputs=tcnn_inputs,outputs=outmerge)
tcnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 200)          0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     18000300    masking_1[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 199, 100)     60100       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (

In [40]:
opt = adam(lr=learning_rate)

In [41]:
tcnn_model.compile(loss='categorical_crossentropy',
                        optimizer=opt,
                        metrics=['accuracy'])

In [None]:
history_cnn = tcnn_model.fit(train_x,train_y,batch_size=batch_size,
                             epochs=epochs,
                              callbacks=callbacks,
                             validation_data=(val_x,val_y),
                             verbose=1)

Train on 209195 samples, validate on 26149 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50

In [38]:
score,acc = tcnn_model.evaluate(test_x,test_y,
                                    batch_size=batch_size)
print('Test score:', score)

print('Test accuracy', acc)

Test score: 1.9522584938501537
Test accuracy 0.5313575525824016


In [39]:
tcnn_model.save_weights('./checkpoints/keras_tcnn_w.h5')

## code for sentimental analysis(construct model tensorflow)

In [157]:
import tensorflow as tf

In [158]:
tf.reset_default_graph()

In [159]:
keep_prob_rate = 0.8

### inputs

In [160]:
sentiment_graph = tf.Graph()

with sentiment_graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None,None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None,None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### embedding

In [161]:
with sentiment_graph.as_default():
    embedding = tf.get_variable(name='embedding',shape=pretrianed_wv.shape,
                                initializer=tf.constant_initializer(pretrianed_wv),
                               trainable=False)
    embed_ = tf.nn.embedding_lookup(embedding,inputs_)

### LSTM cell

In [162]:
with sentiment_graph.as_default():
    def build_cell(lstm_size,keep_prob):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        return drop
    
    cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size,keep_prob) for _ in range(lstm_layers)])
    
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    outputs , final_state = tf.nn.dynamic_rnn(cell,embed_,
                                             initial_state=initial_state)

### output

In [163]:
with sentiment_graph.as_default():
    logits = tf.contrib.layers.fully_connected(outputs[:,-1],5,activation_fn=tf.sigmoid)
    
    predictions = tf.nn.softmax(logits)
    
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,labels=labels_))
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### validation accuracy

In [164]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [165]:
with sentiment_graph.as_default():
    correct_pred = tf.equal(tf.argmax(predictions,1),tf.argmax(labels_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### Training

In [166]:
with sentiment_graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph=sentiment_graph) as sess:
    tf.global_variables_initializer().run()
    iteration = 1
    for e in range(epochs):
        state =  sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_sequence_batches(train_x, train_y, batch_size),1):
            feed = {inputs_:x,
                   labels_:y,
                   keep_prob:0.8,
                   initial_state:state}
            loss, state, _, acc = sess.run([cost,final_state,optimizer,accuracy],feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                     "Iterarion:{}".format(iteration),
                     "Train loss: {:.3f}".format(loss),
                     "Accuracy:{:.4f}".format(acc))
                
            if iteration%25==0:
                val_loss = []
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x,y in get_sequence_batches(val_x, val_y, batch_size):
                    feed = {inputs_:x,
                           labels_:y,
                           keep_prob:1.0,
                           initial_state:val_state}
                    batch_loss,val_state,batch_acc = sess.run([cost,final_state,accuracy],feed_dict=feed)
                    val_acc.append(batch_acc)
                    val_loss.append(batch_loss)
                    
                print("Val loss: {:.3f}".format(np.mean(val_loss)),
                     "Val acc: {:.4f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/20 Iterarion:5 Train loss: 1.521 Accuracy:0.3340
Epoch: 0/20 Iterarion:10 Train loss: 1.520 Accuracy:0.2891
Epoch: 0/20 Iterarion:15 Train loss: 1.522 Accuracy:0.3164
Epoch: 0/20 Iterarion:20 Train loss: 1.508 Accuracy:0.3379
Epoch: 0/20 Iterarion:25 Train loss: 1.518 Accuracy:0.2871
Val loss: 0.323 Val acc: 0.3105
Epoch: 0/20 Iterarion:30 Train loss: 1.505 Accuracy:0.3477
Epoch: 0/20 Iterarion:35 Train loss: 1.497 Accuracy:0.3477
Epoch: 0/20 Iterarion:40 Train loss: 1.496 Accuracy:0.3203
Epoch: 0/20 Iterarion:45 Train loss: 1.512 Accuracy:0.3398
Epoch: 0/20 Iterarion:50 Train loss: 1.504 Accuracy:0.3262
Val loss: 0.323 Val acc: 0.3105
Epoch: 0/20 Iterarion:55 Train loss: 1.521 Accuracy:0.2969
Epoch: 0/20 Iterarion:60 Train loss: 1.515 Accuracy:0.3223
Epoch: 0/20 Iterarion:65 Train loss: 1.498 Accuracy:0.3457
Epoch: 0/20 Iterarion:70 Train loss: 1.536 Accuracy:0.2969
Epoch: 0/20 Iterarion:75 Train loss: 1.518 Accuracy:0.3008
Val loss: 0.323 Val acc: 0.3105
Epoch: 0/20 Iterario

KeyboardInterrupt: 

### Testing

In [None]:
test_loss = []
test_acc= []
with tf.Session(graph=sentiment_graph) as sess:
    saver.restore(sess,tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size,tf.float32))
    for ii, (x, y) in enumerate(get_sequence_batches(test_x, test_y, batch_size),1):
        feed = {inputs_:x,
               labels_:y,
               keep_prob:1,
               initial_state:test_state}
        
        batch_loss,test_state,batch_acc = sess.run([cost,final_state,accuracy],feed_dict=feed)
        
        test_loss.append(batch_loss)
        test_acc.append(batch_acc)
        
    print("Test loss: {:.3f}".format(np.mean(test_loss)),
         "Test accuracy {:.4f}".format(np.mean(test_acc)))