In [2]:
# 首先加载必用的库
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the pretrained Chinese word embeddings with gensim. The file is read in
# text (non-binary) word2vec format and undecodable bytes are ignored.
# Loading may take one to two minutes.
cn_model = KeyedVectors.load_word2vec_format(
    'embeddings/sgns.zhihu.bigram',
    binary=False,
    unicode_errors="ignore",
)

In [4]:
# Use only the first `num_words` (50,000) entries of the pretrained vocabulary;
# each word vector has `embedding_dim` (300) components.
num_words = 50000
embedding_dim = 300


# Build the [num_words, embedding_dim] (50000 x 300) weight matrix that is later
# passed to the Keras Embedding layer. Row i holds the vector of the word whose
# vocabulary index is i.
# The matrix is allocated as float32 up front (np.zeros defaults to float64), so
# no float64 intermediate and no extra astype() copy are needed.
embedding_matrix = np.zeros((num_words, embedding_dim), dtype='float32')
for i in range(num_words):
    embedding_matrix[i, :] = cn_model[cn_model.index2word[i]]

In [5]:
# Convert one sequence of token indices back into text.
def reverse_tokens_(tokens):
    """Rebuild a string from a sequence of vocabulary indices.

    Every non-zero index is replaced by ``cn_model.index2word[index]``; index 0
    is rendered as a single space. The pieces are concatenated in order with
    no separator between them.
    """
    return ''.join(
        ' ' if index == 0 else cn_model.index2word[index]
        for index in tokens
    )
def reverse_tokens(tokens_list):
    """Convert a batch of token-index sequences back into strings.

    Applies ``reverse_tokens_`` to each element of ``tokens_list`` and returns
    the resulting strings as a list, in the same order as the input.
    """
    return [reverse_tokens_(tokens) for tokens in tokens_list]

In [6]:
# Word segmentation and tokenization.
def to_tokens(train_texts_orig):
    """Convert raw texts into lists of vocabulary indices.

    For every text in ``train_texts_orig`` the function strips whitespace and
    the listed ASCII / full-width punctuation, segments the remainder with
    ``jieba.cut`` and maps each segment to ``cn_model.vocab[word].index``.
    Segments that are not in the vocabulary are mapped to 0.

    Returns a list with one inner list of integer indices per input text, in
    input order.
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\s, \., \!, \/), which emit a warning on current Python
    # versions and are slated to become a syntax error. The compiled pattern
    # matches exactly the same characters. It is compiled once, outside the loop.
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")
    vocab = cn_model.vocab

    train_tokens = []
    for text in train_texts_orig:
        # Remove punctuation and whitespace before segmentation.
        text = punctuation.sub("", text)
        # jieba.cut yields the segments lazily; consume them directly and
        # translate each word to its vocabulary index (0 when unknown).
        train_tokens.append([
            vocab[word].index if word in vocab else 0
            for word in jieba.cut(text)
        ])
    return train_tokens

In [7]:
# ls = to_tokens(['我喜欢这个女孩'])
# print(ls)
# ls = reverse_tokens(ls)
# print(ls)

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the labelled corpus: one record per line, each providing a 'content'
# text and a 'label'.
with open(r'测试六类情感与两类情感的映射关系数据emotion/train.txt', encoding='utf8') as fp:
    lines = fp.readlines()

# NOTE(review): eval() executes whatever expression a line contains. This is
# only acceptable for a trusted data file; if the lines are plain dict literals,
# consider switching to ast.literal_eval (or json.loads for JSON) after
# confirming the file format.
data_x, data_y = [], []
for raw_line in lines:
    record = eval(raw_line)
    data_x.append(record['content'])
    data_y.append(record['label'])

# Tokenize every text and record how many tokens each one produced.
train_tokens = to_tokens(data_x)
lenght = [len(tokens) for tokens in train_tokens]

# Sequence length used for padding: mean + 2 standard deviations of the token
# counts, truncated to an integer.
max_tokens = int(np.mean(lenght) + 2 * np.std(lenght))

# Pad / truncate at the front so every sequence has exactly max_tokens entries.
train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                          padding='pre', truncating='pre')
# Indices at or beyond num_words have no row in the embedding matrix; map them
# to 0 like unknown words.
out_of_range = train_pad >= num_words
train_pad[out_of_range] = 0
train_pad[0]



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.173 seconds.
Prefix dict has been built succesfully.


array([    8,    53,   145,  1862,     8, 48860,  7546,   112,  6981,
         193,     1,    42,   254,   938,  4083,    89,   609, 48860,
           4,   167,    24,   502,     1, 48860,  1862,  2345,    42,
           1,   400,   107,     4,    16,     0, 13131,     4,   167,
          24,   502,     1, 13131,   400,  1002,   667,    47,     1,
        1737,  7546,    27, 38895,  4160,    11,    20,     0,     6,
         130,  8742, 34809,     8,  1737,   195,     4,  3800,   377,
       49791,     1,    42,    16,   319,  1261,  3384,  1046])

In [10]:
# 我们使用tensorflow的keras接口来建模
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [12]:
path_checkpoint = 'sentiment_checkpoint.keras'

# Binary sentiment classifier built on the pretrained embeddings:
# frozen Embedding -> Bidirectional LSTM(64) -> LSTM(32) -> Dense(16, relu)
# -> Dropout(0.5) -> Dense(1, sigmoid).
model = Sequential()
# First layer: embedding lookup initialised from embedding_matrix and frozen
# (trainable=False) so the pretrained vectors are not updated.
model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    # input_length=max_tokens,
                    trainable=False))
# return_sequences=True so the next LSTM receives the full sequence.
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(LSTM(units=32, return_sequences=False))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: one value in [0, 1] per input sequence.
model.add(Dense(1, activation='sigmoid'))


# Best-effort restore of previously trained weights. Nothing in this notebook
# trains the model, so a failed load must be clearly visible: the predictions
# computed afterwards would come from untrained weights.
try:
    model.load_weights(path_checkpoint)
except Exception as e:
    print('WARNING: could not load weights from {!r}: {}'.format(path_checkpoint, e))
    print('WARNING: the model is using untrained weights; '
          'predictions made with it are not meaningful.')

In [13]:
# Compile for binary classification and track accuracy. No optimizer is passed,
# so Keras falls back to its default optimizer.
model.compile(loss='binary_crossentropy',
              metrics=['accuracy'])
# Show the model structure. The recorded summary below reports 208,033
# trainable parameters; the 15,000,000 embedding weights are frozen.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         186880    
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 15,208,033
Trainable params: 208,033
Non-trainable params: 15,000,000
____________________________________

In [14]:
# Run the model on every padded sequence. The final layer is a single sigmoid
# unit, so `result` holds one value per sample (see the recorded output below).
result = model.predict(train_pad)
result

array([[0.06481326],
       [0.51183647],
       [0.34630623],
       ...,
       [0.46879902],
       [0.07427481],
       [0.57198983]], dtype=float32)

In [16]:
# Rebuild text from the padded index sequences so the tokenized form of each
# sample can be compared with its original sentence.
data_x_reverse = reverse_tokens(train_pad)


In [27]:
# Inspect one sample: its label, the original sentence, the text rebuilt from
# its padded tokens and the model output for it.
i = 208
for caption, value in (
    ('标签:', data_y[i]),
    ('原句:', data_x[i]),
    ('复原:', data_x_reverse[i]),
    ('概率:', result[i, 0]),
):
    print(caption, value)
# Total number of samples.
len(data_x)

标签: surprise
原句: 刚才同事给男朋友打电话说：今天愚人节，咱们晚上去吃鱼吧！
复原:                                                         刚才同事给男朋友打电话说：今天愚人节咱们晚上去吃鱼吧
概率: 0.31129318


31728

In [26]:
# Distinct labels present in the corpus.
unique_label = set(data_y)

# Map every label to the ascending list of sample indices carrying it.
# A single pass over data_y replaces the original rescan of the whole list
# once per label; the resulting dictionary is identical.
dic = {k: [] for k in unique_label}
for i, v in enumerate(data_y):
    dic[v].append(i)
print(dic['surprise'])

[22, 31, 33, 101, 183, 205, 208, 216, 220, 283, 350, 448, 498, 605, 612, 619, 665, 683, 729, 739, 788, 795, 837, 881, 904, 911, 940, 1027, 1108, 1137, 1233, 1346, 1404, 1414, 1426, 1430, 1443, 1546, 1650, 1677, 1689, 1710, 1820, 1825, 1831, 1843, 1848, 1863, 1896, 1913, 1929, 2040, 2062, 2113, 2117, 2179, 2229, 2381, 2392, 2411, 2435, 2469, 2487, 2488, 2506, 2508, 2514, 2656, 2707, 2833, 2836, 2879, 2900, 2905, 2912, 2981, 2995, 2999, 3010, 3014, 3077, 3086, 3127, 3142, 3158, 3175, 3217, 3222, 3238, 3252, 3263, 3346, 3440, 3496, 3528, 3586, 3641, 3716, 3756, 3770, 3810, 3832, 3839, 3899, 3974, 3990, 4020, 4137, 4142, 4155, 4198, 4221, 4226, 4228, 4232, 4298, 4344, 4349, 4374, 4564, 4606, 4623, 4628, 4678, 4727, 4837, 4884, 4941, 5000, 5037, 5102, 5119, 5143, 5151, 5222, 5234, 5275, 5368, 5444, 5457, 5488, 5538, 5648, 5722, 5758, 5763, 5782, 5784, 5838, 5852, 5870, 5893, 5917, 5999, 6013, 6067, 6083, 6094, 6169, 6175, 6204, 6205, 6248, 6260, 6267, 6287, 6292, 6298, 6366, 6400, 6413, 643

In [22]:
# For every label, compute the share of its samples whose model output is
# >= 0.5 ("pos") and the complementary share ("neg").
dic_pos = {}
dic_neg = {}
for label in unique_label:
    sample_ids = dic[label]
    label_probs = result[sample_ids]
    positive_count = np.count_nonzero(label_probs >= 0.5)
    pos_rate = positive_count / len(sample_ids)
    dic_pos[label] = pos_rate
    dic_neg[label] = 1 - pos_rate

# Report both shares per label as percentages rounded to two decimals.
for label, pos_rate in dic_pos.items():
    print()
    pos_pct = np.round(pos_rate * 100, 2)
    neg_pct = np.round(dic_neg[label] * 100, 2)
    print('{}\n    pos: {}%\n    neg: {}%'.format(label, pos_pct, neg_pct))



surprise
    pos: 26.86%
    neg: 73.14%

like
    pos: 42.44%
    neg: 57.56%

happiness
    pos: 49.74%
    neg: 50.26%

sadness
    pos: 21.58%
    neg: 78.42%

anger
    pos: 16.57%
    neg: 83.43%

disgust
    pos: 22.13%
    neg: 77.87%

fear
    pos: 19.81%
    neg: 80.19%
