In [44]:
### 在10w中文微博数据上实现情感的二分类任务
###  2019 - 11-19

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
# 用来解压
import bz2
# 我们使用tensorflow的keras接口来建模
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

In [3]:
def get_cn_model(): 
    cn_model = KeyedVectors.load_word2vec_format('embeddings/sgns.zhihu.bigram', 
                                             binary=False, unicode_errors="ignore")
    
    return cn_model

In [4]:
def max_token(train_tokens):
    num_tokens = [ len(tokens) for tokens in train_tokens ]
    num_tokens = np.array(num_tokens)
    max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
    max_tokens = int(max_tokens)
    return max_tokens

In [5]:
def vocab_size(train_tokens):
    vocab_tokens = []
    i = 0
    for tokens in train_tokens:
        for token in tokens:
            i+=1
            if token not in vocab_tokens:
                vocab_tokens.append(token)
            else:
                pass
    if i%1e6 == 0:
        print("处理前%d的词花费的时间是:%0.2f"%(i,(time.time() - t1)/60),'mins')
    print("词汇表大小是：",len(vocab_tokens))
    return len(vocab_tokens)

In [6]:
def get_embedding_matrix(embedding_dim,num_words,cn_model):
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = cn_model[cn_model.index2word[i]]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix

In [7]:
def add_padding(train_tokens,train_target,max_tokens,num_words):
    train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                            padding='pre', truncating='pre')
    train_pad[ train_pad>=num_words ] = 0
    train_targets = np.array(train_target)
    return train_pad,train_targets

In [8]:
def model(epochs,batch_size):
    model = Sequential()
    model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_tokens,
                    trainable=False))
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(LSTM(units=16, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    model.summary()
    path_checkpoint = './main_model/sentiment_checkpoint.keras'
    checkpoint = ModelCheckpoint(filepath=path_checkpoint, monitor='val_loss',
                                      verbose=1, save_weights_only=True,
                                      save_best_only=True)
    try:
        model.load_weights(path_checkpoint)
    except Exception as e:
        print(e)
        
    earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.1, min_lr=1e-8, patience=0,
                                       verbose=1)
    callbacks = [earlystopping, checkpoint,lr_reduction]
    model.fit(X_train, y_train,validation_split=0.1, epochs=epochs,
          batch_size=batch_size,
          callbacks=callbacks)
    result = model.evaluate(X_test, y_test)
    print('Accuracy is :{0:.2%}'.format(result[1]))
    return result[1]*100

### 10W 数据开始

In [10]:
def re_sub(inputs):
    ## inputs is one list and outputs is also
    text  = inputs
    texts = re.sub("\//@[a-zA-Z\W+]+", "",text)
    texts = re.sub("\@[a-zA-Z\w+]+", "",texts)
    texts = re.sub("[\-\#+\//@.\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、""【】~@#￥%……&*（）]+","",texts)
    texts = re.sub('[A-Za-z0-9\!\%\[\]\,\。\:\::\?\“\”\”“\～+\:?\;;\>>]','',texts)
    texts = re.sub('\：：?','',texts)
    return texts

def get_train_data(train_texts_orig,train_target):
    path = './git下载的数据集合/weibo_senti_100k/weibo_senti_100k.txt'
    with open(path,'r',encoding = 'utf-8') as file:
        for line in file.readlines():
            items = line.split(',')
            train_texts_orig.append(items[1])

            text = items[1]
            text = re_sub(text)
            if len(text)>LENS and text!='\n':
                if items[0] == '1':
                    train_target.append(1)
                else:
                    train_target.append(0)  ##### train_target 这里一定要存放数字，不能是字符串'1'/'0'
    return train_target,train_texts_orig

def get_train_tokens(train_tokens,train_texts_orig,cn_model):
    for text in train_texts_orig:
        text = re_sub(text)
        if len(text)>LENS and text!='\n' and text!='\t':
            cut = jieba.cut(text)
            cut_list = [i for i in cut]
            for i, word in enumerate(cut_list):
                try:
                    cut_list[i] = cn_model.vocab[word].index
                except KeyError:
                    cut_list[i] = 0
            train_tokens.append(cut_list)
    return train_tokens

In [11]:
LENS =1
train_texts_orig = []
train_tokens =[]
cn_model = get_cn_model()
train_target,train_texts_orig = get_train_data(train_texts_orig,train_target)
train_tokens = get_train_tokens(train_tokens,train_texts_orig,cn_model)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\H155809\AppData\Local\Temp\jieba.cache
Loading model cost 0.720 seconds.
Prefix dict has been built succesfully.


In [42]:
import random
nums  = random.randint(0,100)
train_texts_orig[nums],train_target[nums],train_tokens[nums]

('咱们结伴去布达佩斯当土豪吧！[嘻嘻]先备个攻略>>http://t.cn/zRoKlPj\n',
 1,
 [10682, 90, 1310, 37345, 1862, 3538, 610])

In [12]:
embedding_dim = 300
epochs     = 6      #迭代次数
batch_size = 512*2  #每个batch大小
LENS =1
train_texts_orig = []
train_target     = []
train_tokens     = []
if __name__ == '__main__':
    cn_model                      = get_cn_model()
    train_target,train_texts_orig = get_train_data(train_texts_orig,train_target)
    train_tokens                  = get_train_tokens(train_tokens,train_texts_orig,cn_model)
    #num_words                    = vocab_size(train_tokens) # 这个计算比较花费时间
    num_words                     = 76370
    max_tokens                    = max_token(train_tokens)
    embedding_matrix              = get_embedding_matrix(embedding_dim,num_words,cn_model)
    train_pad,train_targets       = add_padding(train_tokens,train_target,max_tokens,num_words)
    X_train, X_test, y_train, y_test = train_test_split(train_pad, # array
                                                    train_target, # array
                                                    test_size=0.1,
                                                    random_state=12)
    accuracy = model(epochs,batch_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 56, 300)           22911000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         186880    
_________________________________________________________________
lstm_2 (LSTM)                (None, 16)                9280      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 23,107,177
Trainable params: 196,177
Non-trainable params: 22,911,000
_________________________________________________________________
Train on 96812 samples, validate on 10757 samples
Epoch 1/6
Epoch 2/6

Epoch 00001: reducing learning rate to 0.00010000000474974513.
Epoch 3/6

Epoch 00002: reducing learning rate to 1.0000000474974514e-05.
Epoch 4/6

Epoch 00003: reducing learning rate t

In [13]:
print("上面模型预测准确率是:%0.2f"%accuracy,'%')

上面模型预测准确率是:95.94 %


In [14]:
train_pad.shape,train_targets.shape

((119522, 56), (119522,))