In [1]:
import os 
import pandas as pd 
import time 
import datetime 
import logging
import json 
import random
import math

import warnings
warnings.filterwarnings("ignore")

from tqdm.autonotebook import tqdm
from collections import Counter
import gensim
import numpy as np 
import tensorflow as tf 
from collections import Counter

In [2]:
from utils import *

# 定义参数配置类

In [3]:
class Config(dict):
    """Dictionary-backed container for the experiment's hyperparameters and paths.

    Every setting is stored as a dict key (e.g. ``config['batch_size']``).
    ``threshold`` is additionally exposed as the instance attribute
    ``config.threshold`` because the original code only stored it that way;
    it is now also available as ``config['threshold']`` so that it can be
    read like every other setting.

    Args:
        path: Location of the training data file; stored under ``'dataSource'``.
    """

    def __init__(self, path=None):
        super().__init__()
        ## Training parameters
        self['num_epochs'] = 5
        self['evaluateEvery'] = 100      # run an evaluation pass every N global steps
        self['checkpointEvery'] = 100    # save a checkpoint every N global steps
        self['learningRate'] = 0.001

        ## Model parameters
        self['embeddingSize'] = 200

        self['hiddenSizes'] = [256, 128]  # one entry per stacked Bi-LSTM layer
        # NOTE: the model feeds this value as the *keep* probability of its
        # dropout layers (output_keep_prob / tf.nn.dropout), despite the name.
        self['dropoutProb'] = 0.5
        self['l2RegLambda'] = 0.0

        ## Basic parameters
        self['sequenceLength'] = 200
        self['batch_size'] = 64
        self['dataSource'] = path
        self['stopWordSource'] = "../data/english"
        self['numClasses'] = 1     # 1 -> sigmoid output; >1 -> softmax output
        self['train_size'] = 0.8   # train / test split ratio
        # Keep the key and the attribute in sync: the original only set the
        # attribute, which made config['threshold'] raise KeyError.
        self['threshold'] = 0.5
        self.threshold = self['threshold']

        ## Model persistence parameters
        self['checkpoint_dir'] = "../model/BiLSTM_Attention/imdb/checkpoint"
        self['summary_dir'] = "../model/BiLSTM_Attention/imdb/summary"
        self['max_to_keep'] = 5

# 定义模型

## 定义模型类

In [12]:
class BiLSTMAttention(BaseModel):
    """Stacked bidirectional-LSTM text classifier with attention pooling (TF 1.x graph mode).

    The graph is: embedding lookup -> one Bi-LSTM layer per entry of
    ``config['hiddenSizes']`` -> forward/backward outputs summed -> attention
    over time steps -> dense output layer.

    Args:
        config: Mapping that provides ``sequenceLength``, ``hiddenSizes``,
            ``numClasses``, ``l2RegLambda``, ``learningRate`` and ``max_to_keep``.
        wordEmbedding: Pre-trained embedding matrix used to initialise the
            embedding variable (cast to float32).

    Raises:
        ValueError: if ``config['numClasses']`` is smaller than 1.
    """

    def __init__(self, config, wordEmbedding):
        super(BiLSTMAttention, self).__init__(config)
        self.wordEmbedding = wordEmbedding
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Build placeholders, the network, the loss and the training op."""
        numClasses = self.config['numClasses']
        if numClasses < 1:
            # Previously this fell through both branches and failed later with
            # an unrelated NameError/AttributeError.
            raise ValueError("config['numClasses'] must be >= 1, got {}".format(numClasses))

        # Model inputs.
        # NOTE(review): the placeholder name "inpuX" looks like a typo of "inputX";
        # it is kept unchanged in case other code looks the tensor up by name.
        self.inputX = tf.placeholder(tf.int32, [None, self.config["sequenceLength"]], name="inpuX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")

        # Fed as the *keep* probability of every dropout layer below.
        self.dropoutProb = tf.placeholder(tf.float32, name="dropoutProb")

        # Accumulator for the L2 regularisation term.
        l2Loss = tf.constant(0.0)

        # Embedding layer.
        with tf.name_scope("embedding"):
            # Initialise the embedding matrix from the pre-trained word vectors.
            self.W = tf.Variable(tf.cast(self.wordEmbedding, dtype=tf.float32, name="word2vec"),
                                 name="W")
            # Map token ids to vectors: [batch_size, sequence_length, embedding_size].
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)

        # Stacked bidirectional LSTM layers, one per entry of hiddenSizes.
        with tf.name_scope("Bi-LSTM"):
            for idx, hiddenSize in enumerate(self.config["hiddenSizes"]):
                with tf.name_scope(f"Bi-LSTM{idx}"):
                    # Forward LSTM cell with dropout on its outputs.
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutProb)
                    # Backward LSTM cell with dropout on its outputs.
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutProb)

                    # outputs is (output_fw, output_bw), each of shape
                    # [batch_size, max_time, hidden_size].
                    # self.current_state is overwritten on every iteration, so it
                    # ends up holding the state of the last layer only.
                    outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(
                        lstmFwCell,
                        lstmBwCell,
                        self.embeddedWords,
                        dtype=tf.float32,
                        scope=f"bilstm-{idx}")
                    # Concatenate fw and bw results -> [batch_size, time_step, hidden_size * 2]
                    # and feed them to the next Bi-LSTM layer.
                    self.embeddedWords = tf.concat(outputs, 2)

        # Split the last layer's output back into its forward and backward halves
        # (arguments: tensor to split, number of pieces, axis).
        outputs = tf.split(self.embeddedWords, 2, -1)

        # As in the Bi-LSTM + Attention paper, sum the forward and backward outputs.
        with tf.name_scope("Attention"):
            # Shape: [batch_size, max_time, hiddenSizes[-1]].
            H = outputs[0] + outputs[1]
            # Attention-pooled sentence representation.
            output = self.attention(H)

        # Fully connected output layer.
        with tf.name_scope("output"):
            self.logits = tf.layers.dense(output, numClasses, name="dense",
                                          kernel_initializer=tf.truncated_normal_initializer(stddev=0.1, seed=2019),
                                          bias_initializer=tf.constant_initializer(0.1))

            # Fetch the dense layer's kernel so it can be L2-regularised.
            with tf.variable_scope("dense", reuse=True):
                outputW = tf.get_variable("kernel")
            l2Loss += tf.nn.l2_loss(outputW)

            if numClasses == 1:
                # Fixed: the original called the non-existent `tf.sigmod`.
                self.predictions = tf.sigmoid(self.logits)
            else:
                # `axis` replaces the deprecated `dim` keyword.
                self.predictions = tf.nn.softmax(self.logits, axis=1)

        # Cross-entropy loss (sigmoid for a single output unit, sparse softmax otherwise).
        with tf.name_scope("loss"):
            if numClasses == 1:
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.logits,
                    labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32))
            else:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                        labels=self.inputY)

            self.loss = tf.reduce_mean(losses) + self.config["l2RegLambda"] * l2Loss

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                # global_step_tensor is not defined in this class; presumably it is
                # created by BaseModel -- verify against utils.
                self.train_op = tf.train.AdamOptimizer(
                    self.config['learningRate']).minimize(self.loss, global_step=self.global_step_tensor)

    def attention(self, H):
        '''
        Compute a sentence vector from the Bi-LSTM outputs using attention.

        Args:
            H: Tensor of shape [batch_size, max_time, hiddenSizes[-1]]; max_time
               must equal config["sequenceLength"] because of the reshape below.

        Returns:
            Tensor of shape [batch_size, hiddenSizes[-1]] after tanh and dropout.
            The attention weights are also stored in ``self.alpha``.
        '''
        hiddenSize = self.config["hiddenSizes"][-1]
        # Trainable weight vector that scores every time step.
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

        # Non-linear transform of the Bi-LSTM outputs.
        M = tf.tanh(H)
        # M is [batch_size, max_time, hidden_size] and W is [hidden_size];
        # flatten M so that a single matmul yields one score per time step.
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # Restore the scores to [batch_size, max_time].
        restoreM = tf.reshape(newM, [-1, self.config["sequenceLength"]])

        # Normalise the scores over the time axis.
        self.alpha = tf.nn.softmax(restoreM)

        # Weighted sum of H using alpha, done as a batched matmul:
        # [batch_size, hidden_size, max_time] x [batch_size, max_time, 1].
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.expand_dims(self.alpha, 2))

        # Drop the trailing singleton axis -> [batch_size, hidden_size].
        sequeezeR = tf.reshape(r, [-1, hiddenSize])
        sentenceRepren = tf.tanh(sequeezeR)

        # Dropout on the attention output (second argument is the keep probability).
        output = tf.nn.dropout(sentenceRepren, self.dropoutProb)
        return output

    def init_saver(self):
        '''
        Create the saver object used to checkpoint the model.
        '''
        self.saver = tf.train.Saver(max_to_keep=self.config["max_to_keep"])

## 定义训练类

In [13]:
class Trainer(BaseTrain):
    """Training loop for the model: runs training steps, periodic evaluation and checkpointing.

    Args:
        sess: TensorFlow session used to run the graph.
        model: Model exposing ``inputX``, ``inputY``, ``dropoutProb``, ``loss``,
            ``predictions``, ``train_op``, ``global_step_tensor`` and ``save``.
        data: Sequence whose first element is the training data generator and
            whose second element is the evaluation data generator.
        config: Mapping with ``batch_size``, ``evaluateEvery``,
            ``checkpointEvery`` and ``dropoutProb``.
        logger: Object with a ``summarize`` method used to write summaries.
    """

    def __init__(self, sess, model, data, config, logger):
        super(Trainer, self).__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]

    def train_epoch(self):
        """Run one epoch: ``train.length // batch_size`` training steps.

        Every ``evaluateEvery`` global steps the whole evaluation set is scored,
        and every ``checkpointEvery`` global steps the model is saved.
        """
        num_iter_per_epoch = self.train.length // self.config["batch_size"]

        for _ in tqdm(range(num_iter_per_epoch)):
            ## Run a single optimisation step.
            loss, metrics, step = self.train_step()
            train_acc = metrics["accuracy"]
            train_f_score = metrics["f_score"]

            ## Write the training loss and metrics to the summary.
            summaries_dict = {"loss": loss,
                              "acc": np.array(train_acc),
                              "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary",
                                  summaries_dict=summaries_dict)

            if step % self.config['evaluateEvery'] == 0:
                print("Train —— Step: {} | Loss: {} | Acc: {} | F1_Score: {}".format(
                    step, loss, train_acc, train_f_score))
                ## Evaluate on the held-out set.
                print("\nEvaluation: \n")
                eval_losses = []
                eval_true = []
                eval_pred = []

                for batchEval in self.eval.iter_all(self.config["batch_size"]):
                    # Use dedicated names so the training loss above is not shadowed.
                    eval_loss, predictions = self.eval_step(batchEval[0], batchEval[1])
                    eval_losses.append(eval_loss)
                    eval_true.extend(batchEval[-1])
                    eval_pred.extend(predictions)
                getMetric = Metric(np.array(eval_pred), np.array(eval_true), self.config)
                eval_metrics = getMetric.get_metrics()

                # Fixed: np.round without `decimals` rounded precision/recall to 0 or 1.
                prec_mean = np.round(eval_metrics['precision'], 5)
                recall_mean = np.round(eval_metrics['recall'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)
                # Fixed: the old format lacked the ':' between minutes and seconds and
                # appended AM/PM to a 24-hour clock (producing e.g. "17:0858 PM").
                time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                print("{} | Loss: {} | Precision: {} | Recall: {}".format(time_str,
                                                                          loss_mean,
                                                                          prec_mean,
                                                                          recall_mean))

                summaries_dict = {"loss": np.array(loss_mean),
                                  "precision": np.array(prec_mean),
                                  "recall": np.array(recall_mean)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                      summaries_dict=summaries_dict)

            if step % self.config["checkpointEvery"] == 0:
                self.model.save(self.sess)

    def train_step(self):
        """Run one optimisation step on the next training batch.

        Returns:
            Tuple ``(loss, metrics, step)`` where ``metrics`` is the result of
            ``Metric(...).get_metrics()`` on this batch and ``step`` is the
            global step value fetched in the same ``sess.run`` call.
        """
        # NOTE(review): a fresh next_batch(...) iterator is created on every call and
        # only its first item is used; this assumes next_batch yields a different
        # batch each time it is restarted -- confirm against DataGenerator.
        batch_x, batch_y = next(self.train.next_batch(self.config["batch_size"]))
        feed_dict = {self.model.inputX: batch_x,
                     self.model.inputY: batch_y,
                     self.model.dropoutProb: self.config["dropoutProb"]}

        _, loss, predictions, step = self.sess.run([self.model.train_op,
                                                    self.model.loss,
                                                    self.model.predictions,
                                                    self.model.global_step_tensor],
                                                   feed_dict=feed_dict)
        getMetric = Metric(predictions, batch_y, self.config)
        metrics = getMetric.get_metrics()

        return loss, metrics, step

    def eval_step(self, batch_x, batch_y):
        '''
        Score one evaluation batch without updating the model.

        The dropout placeholder is fed 1.0, i.e. nothing is dropped.

        Returns:
            Tuple ``(loss, predictions)`` for the batch.
        '''
        feed_dict = {self.model.inputX: batch_x, self.model.inputY: batch_y,
                     self.model.dropoutProb: 1.0}
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                          feed_dict=feed_dict)

        return loss, predictions

# 使用数据集进行训练和预测

## 使用IMDB数据集进行训练和预测

In [16]:
def main():
    """Train and evaluate the BiLSTM-Attention model on the IMDB data set.

    Builds the configuration, prepares the data, creates the TensorFlow
    session and model, and runs the full training loop. The session is
    closed when training finishes or fails.
    """
    # Instantiate the configuration with the training data file.
    path = "../data/imdb/labeldTrain.csv"
    config = Config(path)

    create_dirs([config["summary_dir"], config["checkpoint_dir"]])
    data = Dataset(config)

    ## Generate the training data; per the original note, the first argument
    ## is the folder that contains the word-embedding file.
    data.dataGen("../data/imdb", prefix="imdb")

    train_X, train_y = data.trainReviews, data.trainLabels
    eval_X, eval_y = data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()
    ## Session configuration.
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    # Use the session as a context manager so its resources are released even
    # if training raises or is interrupted (the original never closed it).
    with tf.Session(config=session_conf) as sess:
        ## Build the model graph.
        model = BiLSTMAttention(config, wordEmbedding)

        logger = Logger(sess, config)

        trainer = Trainer(sess, model, pack_data, config, logger)

        trainer.train_all()

In [17]:
# Run the IMDB training pipeline defined by main() in the cell above.
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 100 | Loss: 0.4060550034046173 | Acc: 0.85938 | F1_Score: 0.87671

Evaluation: 

2019-08-09 17:0858 PM | Loss: 0.3330399990081787 | Precision: 0.83999 | Recall: 0.89307
Saving model...
Model saved
Train —— Step: 200 | Loss: 0.36316460371017456 | Acc: 0.82812 | F1_Score: 0.8254

Evaluation: 

2019-08-09 17:0944 PM | Loss: 0.3258199989795685 | Precision: 0.81149 | Recall: 0.94902
Saving model...
Model saved
Train —— Step: 300 | Loss: 0.3440439701080322 | Acc: 0.82812 | F1_Score: 0.84507

Evaluation: 

2019-08-09 17:1026 PM | Loss: 0.3162600100040436 | Precision: 0.82799 | Recall: 0.94431
Saving model...
Model saved


当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 400 | Loss: 0.20181700587272644 | Acc: 0.95312 | F1_Score: 0.94737

Evaluation: 

2019-08-09 17:1105 PM | Loss: 0.2894200086593628 | Precision: 0.91463 | Recall: 0.83423
Saving model...
Model saved
Train —— Step: 500 | Loss: 0.1569727063179016 | Acc: 0.95312 | F1_Score: 0.94737

Evaluation: 

2019-08-09 17:1145 PM | Loss: 0.36614999175071716 | Precision: 0.84197 | Recall: 0.93512
Saving model...
Model saved
Train —— Step: 600 | Loss: 0.29701024293899536 | Acc: 0.875 | F1_Score: 0.85185

Evaluation: 

2019-08-09 17:1223 PM | Loss: 0.3371700048446655 | Precision: 0.91406 | Recall: 0.8234
Saving model...
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Model saved


当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 700 | Loss: 0.15801744163036346 | Acc: 0.95312 | F1_Score: 0.94545

Evaluation: 

2019-08-09 17:1302 PM | Loss: 0.3246600031852722 | Precision: 0.85958 | Recall: 0.91539
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.1503821313381195 | Acc: 0.96875 | F1_Score: 0.97059

Evaluation: 

2019-08-09 17:1341 PM | Loss: 0.3363800048828125 | Precision: 0.88679 | Recall: 0.88327
Saving model...
Model saved
Train —— Step: 900 | Loss: 0.14376845955848694 | Acc: 0.95312 | F1_Score: 0.95652

Evaluation: 

2019-08-09 17:1419 PM | Loss: 0.4718399941921234 | Precision: 0.93944 | Recall: 0.74085
Saving model...
Model saved


当前正处于第4次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1000 | Loss: 0.04328688234090805 | Acc: 0.98438 | F1_Score: 0.97872

Evaluation: 

2019-08-09 17:1458 PM | Loss: 0.4139600098133087 | Precision: 0.90191 | Recall: 0.84872
Saving model...
Model saved
Train —— Step: 1100 | Loss: 0.05956492945551872 | Acc: 0.96875 | F1_Score: 0.96429

Evaluation: 

2019-08-09 17:1537 PM | Loss: 0.3738499879837036 | Precision: 0.89881 | Recall: 0.85989
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.05035770684480667 | Acc: 0.98438 | F1_Score: 0.98734

Evaluation: 

2019-08-09 17:1617 PM | Loss: 0.3781299889087677 | Precision: 0.88545 | Recall: 0.87843
Saving model...
Model saved


当前正处于第5次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1300 | Loss: 0.02755913697183132 | Acc: 1.0 | F1_Score: 1.0

Evaluation: 

2019-08-09 17:1657 PM | Loss: 0.4448400139808655 | Precision: 0.89958 | Recall: 0.84814
Saving model...
Model saved
Train —— Step: 1400 | Loss: 0.06449589133262634 | Acc: 0.96875 | F1_Score: 0.95833

Evaluation: 

2019-08-09 17:1737 PM | Loss: 0.44767001271247864 | Precision: 0.86049 | Recall: 0.91544
Saving model...
Model saved
Train —— Step: 1500 | Loss: 0.03325489163398743 | Acc: 0.98438 | F1_Score: 0.98508

Evaluation: 

2019-08-09 17:1816 PM | Loss: 0.4826500117778778 | Precision: 0.87725 | Recall: 0.88369
Saving model...
Model saved



- 最终结果 —— Precision: 0.87725, Recall: 0.88369

## 使用Yelp数据集进行训练和预测

In [22]:
def main():
    """Train and evaluate the BiLSTM-Attention model on the Yelp data set.

    Same pipeline as the IMDB run, but with Yelp-specific data, summary and
    checkpoint paths and a larger evaluation/checkpoint interval. The session
    is closed when training finishes or fails.

    NOTE: this definition reuses the name ``main`` and therefore shadows the
    IMDB ``main`` defined in an earlier cell.
    """
    path = "../data/yelps/yelps_test.csv"
    config = Config(path)
    config["summary_dir"] = "../model/BiLSTM_Attention/yelps/summary"
    config["checkpoint_dir"] = "../model/BiLSTM_Attention/yelps/checkpoint"
    # Evaluate and checkpoint every 2000 steps instead of the default 100.
    config['evaluateEvery'] = 2000
    config['checkpointEvery'] = 2000

    create_dirs([config["summary_dir"], config["checkpoint_dir"]])

    data = Dataset(config)

    ## Generate the training data.
    data.dataGen("../data/yelps/", prefix="yelps")

    train_X, train_y = data.trainReviews, data.trainLabels
    eval_X, eval_y = data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()

    # Session configuration.
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    # Use the session as a context manager so its resources are released even
    # if training raises or is interrupted (the original never closed it).
    with tf.Session(config=session_conf) as sess:
        ## Build the model graph.
        model = BiLSTMAttention(config, wordEmbedding)

        logger = Logger(sess, config)
        trainer = Trainer(sess, model, pack_data, config, logger)

        trainer.train_all()

In [23]:
# Run the Yelp training pipeline defined by main() in the cell above.
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=8939), HTML(value='')))

Train —— Step: 2000 | Loss: 0.3246549963951111 | Acc: 0.84375 | F1_Score: 0.89362

Evaluation: 

2019-08-09 17:5334 PM | Loss: 0.22502000629901886 | Precision: 0.91156 | Recall: 0.95389
Saving model...
Model saved
Train —— Step: 4000 | Loss: 0.1653369516134262 | Acc: 0.95312 | F1_Score: 0.96703

Evaluation: 

2019-08-09 18:0850 PM | Loss: 0.22026999294757843 | Precision: 0.92169 | Recall: 0.94862
Saving model...
Model saved
Train —— Step: 6000 | Loss: 0.129671111702919 | Acc: 0.95312 | F1_Score: 0.96386

Evaluation: 

2019-08-09 18:2351 PM | Loss: 0.21250000596046448 | Precision: 0.92785 | Recall: 0.94586
Saving model...
Model saved
Train —— Step: 8000 | Loss: 0.19084835052490234 | Acc: 0.90625 | F1_Score: 0.925

Evaluation: 

2019-08-09 18:3850 PM | Loss: 0.21105000376701355 | Precision: 0.92301 | Recall: 0.95253
Saving model...
Model saved

当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=8939), HTML(value='')))

Train —— Step: 10000 | Loss: 0.15338148176670074 | Acc: 0.92188 | F1_Score: 0.93976

Evaluation: 

2019-08-09 18:5353 PM | Loss: 0.2106499969959259 | Precision: 0.93676 | Recall: 0.93668
Saving model...
Model saved
Train —— Step: 12000 | Loss: 0.10625214874744415 | Acc: 0.96875 | F1_Score: 0.97727

Evaluation: 

2019-08-09 19:0903 PM | Loss: 0.2207300066947937 | Precision: 0.93263 | Recall: 0.93988
Saving model...
Model saved
Train —— Step: 14000 | Loss: 0.22055131196975708 | Acc: 0.92188 | F1_Score: 0.93827

Evaluation: 

2019-08-09 19:2407 PM | Loss: 0.2228900045156479 | Precision: 0.92721 | Recall: 0.9465
Saving model...
Model saved
Train —— Step: 16000 | Loss: 0.13444609940052032 | Acc: 0.90625 | F1_Score: 0.92857

Evaluation: 

2019-08-09 19:3847 PM | Loss: 0.2180500030517578 | Precision: 0.91951 | Recall: 0.95648
Saving model...
Model saved

当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=8939), HTML(value='')))

KeyboardInterrupt: 

- 最佳结果（第2次迭代 Step 10000，训练在第3次迭代时被中断）—— Precision: 0.93676, Recall: 0.93668