In [44]:
### 在10w中文微博数据上实现情感的二分类任务
###  2019 - 11-19
## LIu Yazhou

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
# 用来解压
import bz2
# 我们使用tensorflow的keras接口来建模
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

In [2]:
def get_cn_model(): 
    cn_model = KeyedVectors.load_word2vec_format('embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")
    
    return cn_model


In [3]:
def max_token(train_tokens):
    num_tokens = [ len(tokens) for tokens in train_tokens ]
    num_tokens = np.array(num_tokens)
    max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
    max_tokens = int(max_tokens)
    return max_tokens

In [4]:
def vocab_size(train_tokens):
    vocab_tokens = []
    i = 0
    for tokens in train_tokens:
        for token in tokens:
            i+=1
            if token not in vocab_tokens:
                vocab_tokens.append(token)
            else:
                pass
    if i%1e6 == 0:
        print("处理前%d的词花费的时间是:%0.2f"%(i,(time.time() - t1)/60),'mins')
    print("词汇表大小是：",len(vocab_tokens))
    return len(vocab_tokens)

In [5]:
def get_embedding_matrix(embedding_dim,num_words,cn_model):
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = cn_model[cn_model.index2word[i]]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix

In [6]:
def add_padding(train_tokens,train_target,max_tokens,num_words):
    train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                            padding='pre', truncating='pre')
    train_pad[ train_pad>=num_words ] = 0
    train_targets = np.array(train_target)
    return train_pad,train_targets

In [7]:
def model(epochs,batch_size):
    model = Sequential()
    model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_tokens,
                    trainable=False))
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(LSTM(units=16, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    model.summary()
    path_checkpoint = './main_model/sentiment_checkpoint10.keras'
    checkpoint = ModelCheckpoint(filepath=path_checkpoint, monitor='val_loss',
                                      verbose=1, save_weights_only=True,
                                      save_best_only=True)
    try:
        model.load_weights(path_checkpoint)
    except Exception as e:
        print(e)
        
    earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.1, min_lr=1e-8, patience=0,
                                       verbose=1)
    callbacks = [earlystopping, checkpoint,lr_reduction]
    model.fit(X_train, y_train,validation_split=0.2, epochs=epochs,
          batch_size=batch_size,
          callbacks=callbacks)
    
    from tensorflow.python.keras.models import save_model
    save_model(model,'./main_model/sentiment_checkpoint10.h5')
    result = model.evaluate(X_test, y_test)
    print(' Accuracy is :{0:.2%}'.format(result[1]))
    return result[1]*100

### 1. 10W 数据的预处理从这里开始

In [19]:
### 1.1 样本乱序
### 对原来的数据样本乱序 ### 
import random
def shuffles(inputs,outputs):
    contents = []
    file1 = open(inputs, 'r', encoding='utf-8') 
    file2 = open(outputs, 'w', encoding='utf-8')
    for line in file1.readlines():
        contents.append(line)
    
    random.shuffle(contents)
    for content in contents:
        file2.write(content)
    
    file1.close()
    file2.close()

path = './data/weibo_senti_100k/weibo_senti_100k.txt'
output = './data/weibo_senti_100k/weibo_senti_100k_random.txt'
#shuffles(path,output)

In [8]:
def re_sub(inputs):
    ## inputs is one list and outputs is also
    text  = inputs
    texts = re.sub("\//@[a-zA-Z\W+]+", "",text)
    texts = re.sub("\@[a-zA-Z\w+]+", "",texts)
    texts = re.sub("[\-\#+\//@.\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、""【】~@#￥%……&*（）]+","",texts)
    texts = re.sub('[A-Za-z0-9\!\%\[\]\,\。\:\::\?\“\”\”“\～+\:?\;;\>>]','',texts)
    texts = re.sub('\：：?','',texts)
    return texts

def get_train_data(train_texts_orig,train_target):
    path = './data/weibo_senti_100k/weibo_senti_100k_random.txt'
    with open(path,'r',encoding = 'utf-8') as file:
        for line in file.readlines():
            items = line.split(',')
            train_texts_orig.append(items[1])

            text = items[1]
            text = re_sub(text)
            if len(text)>LENS and text!='\n':
                if items[0] == '1':
                    train_target.append(1)
                else:
                    train_target.append(0)  ##### train_target 这里一定要存放数字，不能是字符串'1'/'0'
    return train_target,train_texts_orig

def get_train_tokens(train_tokens,train_texts_orig,cn_model):
    for text in train_texts_orig:
        text = re_sub(text)
        if len(text)>LENS and text!='\n' and text!='\t':
            cut = jieba.cut(text)
            cut_list = [i for i in cut]
            for i, word in enumerate(cut_list):
                try:
                    cut_list[i] = cn_model.vocab[word].index
                except KeyError:
                    cut_list[i] = 0
            train_tokens.append(cut_list)
    return train_tokens

### 2.运行主函数

In [9]:
embedding_dim = 300
epochs     = 20      #迭代次数
batch_size = 512*2  #每个batch大小
LENS =1
train_texts_orig = []
train_target     = []
train_tokens     = []
if __name__ == '__main__':
    cn_model                      = get_cn_model()
    train_target,train_texts_orig = get_train_data(train_texts_orig,train_target)
    train_tokens                  = get_train_tokens(train_tokens,train_texts_orig,cn_model)
    #num_words                    = vocab_size(train_tokens) # 这个计算比较花费时间
    num_words                     = 76370
    max_tokens                    = max_token(train_tokens)
    embedding_matrix              = get_embedding_matrix(embedding_dim,num_words,cn_model)
    train_pad,train_targets       = add_padding(train_tokens,train_target,max_tokens,num_words)
    X_train, X_test, y_train, y_test = train_test_split(train_pad, # array
                                                    train_target, # array
                                                    test_size=0.2,
                                                    random_state=660)
    accuracy = model(epochs,batch_size)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\H155809\AppData\Local\Temp\jieba.cache
Loading model cost 0.713 seconds.
Prefix dict has been built succesfully.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 56, 300)           22911000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         186880    
_________________________________________________________________
lstm_2 (LSTM)                (None, 16)                9280      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 23,107,177
Trainable params: 196,177
Non-trainable params: 22,911,000
_________________________________________________________________
Unable to open file (Unable to open file: name = './main_model/sentiment_checkpoint10.keras', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)
Train on 76493 samples, validate on 19124 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20

In [10]:
print("上面模型预测准确率是:%0.2f"%accuracy,'%')

上面模型预测准确率是:95.60 %


In [11]:
train_pad.shape,train_targets.shape  #样本大小 - 大约12w的数据

((119522, 56), (119522,))

### 3. 模型预测新的样例

In [12]:
def get_cn_model(): 
    cn_model = KeyedVectors.load_word2vec_format('embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")
    
    return cn_model
def re_sub(inputs):
    ## inputs is one list and outputs is also
    text  = inputs
    texts = re.sub("\//@[a-zA-Z\W+]+", "",text)
    texts = re.sub("\@[a-zA-Z\w+]+", "",texts)
    texts = re.sub("[\-\#+\//@.\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、""【】~@#￥%……&*（）]+","",texts)
    texts = re.sub('[A-Za-z0-9\!\%\[\]\,\。\:\::\?\“\”\”“\～+\:?\;;\>>]','',texts)
    texts = re.sub('\：：?','',texts)
    return texts

def prediction(text,LENS =1):
    cn_model             = get_cn_model()
    text = re_sub(text)
    if len(text)>LENS and text!='\n' and text!='\t':
        cut_list = [i for i in jieba.cut(text)]
        for i, word in enumerate(cut_list):
            try:
                cut_list[i] = cn_model.vocab[word].index
                if cut_list[i]>=76370:  #nums word
                    cut_list[i] = 0
            except KeyError:
                cut_list[i] = 0
            
    #padding
    max_tokens = 56  #get from above model parameters
    train_pad = pad_sequences([cut_list], maxlen=max_tokens,
                            padding='pre', truncating='pre')
    
    # loading model
    from tensorflow.python.keras.models import load_model
    model_path = './main_model/sentiment_checkpoint10.h5'
    model = load_model(model_path)
    
    result = model.predict(x=train_pad)
    
    coefs = result[0][0]
    return coefs*100  # 返回判断的阈值

In [26]:
text_list= [
    '我今天为什么要穿白长裙？[泪]还有几站该下车了！雨依然哗哗的！[抓狂] //@败家de小妞子:裙子已经湿了[泪]',
    '糟糕透顶了，刚买的新手机就丢厕所里面了,想骂人呀，谁也别招惹我。',
    '今天是个好日子，天气特别美好，我的心情也很好',
    '新发的工资,还意外的领到了红包',
    '朋友出车祸了',
    '今天去出差，住豪华酒店',
    '今天第一次陪朋友去逛街',
    '这不科学……应该是一堆狗~//@蜘蛛3号: //@我的朋友是个呆B:QAQ//@进击的巨人官网:QAQ//@我的同事是个婊子: QAQ//@',
    '我刚问了老公这个问题，宝宝出生后他会说什么，他不加思索地来了一句：“八喜，欢迎来到地球！”[汗][晕]'
]

results = []
for text in text_list:
    res = prediction(text)
    results.append(res)

In [27]:
cte = ['正面情绪','负面情绪']
for i in range(len(text_list)):
    
    if results[i]>50:
        cte_j = 0
    else:
        cte_j = 1
    print("第%d条微博是:%s"%(i+1,text_list[i]) )
    print("模型判断这是一条: ##%s##,预测阈值是:%0.2f"%(cte[cte_j],results[i]) )
    print('\n')

第1条微博是:我今天为什么要穿白长裙？[泪]还有几站该下车了！雨依然哗哗的！[抓狂] //@败家de小妞子:裙子已经湿了[泪]
模型判断这是一条: ##负面情绪##,预测阈值是:0.14


第2条微博是:糟糕透顶了，刚买的新手机就丢厕所里面了,想骂人呀，谁也别招惹我。
模型判断这是一条: ##负面情绪##,预测阈值是:1.03


第3条微博是:今天是个好日子，天气特别美好，我的心情也很好
模型判断这是一条: ##正面情绪##,预测阈值是:59.76


第4条微博是:新发的工资,还意外的领到了红包
模型判断这是一条: ##正面情绪##,预测阈值是:50.34


第5条微博是:朋友出车祸了
模型判断这是一条: ##负面情绪##,预测阈值是:8.78


第6条微博是:今天去出差，住豪华酒店
模型判断这是一条: ##负面情绪##,预测阈值是:3.31


第7条微博是:今天第一次陪朋友去逛街
模型判断这是一条: ##正面情绪##,预测阈值是:55.14


第8条微博是:这不科学……应该是一堆狗~//@蜘蛛3号: //@我的朋友是个呆B:QAQ//@进击的巨人官网:QAQ//@我的同事是个婊子: QAQ//@
模型判断这是一条: ##负面情绪##,预测阈值是:0.84


第9条微博是:我刚问了老公这个问题，宝宝出生后他会说什么，他不加思索地来了一句：“八喜，欢迎来到地球！”[汗][晕]
模型判断这是一条: ##负面情绪##,预测阈值是:0.14


