In [2]:
import os 
import time 
import datetime 
import json 
import math 
import logging 
import random
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np 
from tqdm.autonotebook import tqdm
from collections import Counter
import gensim 
import tensorflow as tf 


In [1]:
from utils import *

# 定义配置类

## 基本配置类

In [3]:
class Config(dict):
    def __init__(self, path=None):
        super().__init__()
        ## 定义训练参数
        self['num_epochs'] = 5 
        self['evaluateEvery'] = 100 
        self['checkpointEvery'] = 100 
        self['learningRate'] = 0.001 
        
        ## 定义模型参数
        self['embeddingSize'] = 200 
        self['hiddenSizes'] = 128   ## LSTM网络神经元个数
        self['dropoutProb'] = 0.5  
        self['l2RegLambda'] = 0.0 
        self['epsilon'] = 5 
        
        ## 定义基础参数
        self['sequenceLength'] = 200 
        self['batch_size'] = 64 
        self['dataSource'] = path 
        self['stopWordSource'] = "../data/english"
        self['numClasses'] = 1 
        self['train_size'] = 0.8   ## 训练集和测试集的比例
        self.threshold = 0.5 
        
        ## 保存模型参数
        self['checkpoint_dir'] = "../model/AdversarialLSTM/imdb/checkpoint"
        self['summary_dir'] = "../model/AdversarialLSTM/imdb/summary"
        self['max_to_keep'] = 5 

## 重写数据加载类

In [20]:
class Dataloader(Dataset):
    def __init__(self, config):
        super(Dataloader, self).__init__(config)
        
        self.indexFreqs = []    # 统计词空间中每个词出现在多少个不同文档中
        
    
    def _genVocabulary(self, reviews, labels, path, prefix=""):
        '''
        生成向量和词汇-索引映射字典
        '''

        save_path = "../data/wordJson"
        target_word_dir = os.path.join(save_path, prefix + "_word2idx.json")
        target_label_dir = os.path.join(save_path, prefix + "_label2idx.json")

  
        allWords = [word for review in reviews for word in review]
        # 去掉停用词
        subWords = [word for word in allWords if word not in self.stopWordDict]
        wordCount = Counter(subWords)  # 统计各个词的词频
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)

        # 去除低频词
        words = [item[0] for item in sortWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words, path)
        self.wordEmbedding = wordEmbedding

        # print(len(vocab), vocab[10])
        word2idx = dict(zip(vocab, range(len(vocab))))

        ##------------------------------------------------
        # 得到逆词频
        self._getWordIndexFreq(vocab, reviews, word2idx)
        ##------------------------------------------------

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))

        # 将词汇表-索引映射表保存为json数据，之后inference时直接加载处理数据

        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(target_word_dir, "w", encoding="utf8") as f:
            json.dump(word2idx, f)

        with open(target_label_dir, "w", encoding="utf8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx
        
        
    def _getWordIndexFreq(self, vocab, reviews, word2idx):
        '''
        统计词汇空间中每个词出现在多少个不同的文本中
        '''
        print("正在计算逆词频...")
        indexFreqs = [0] * len(vocab)
        for word in tqdm(vocab):
            count = 0 
            for review in reviews:
                if word in set(review):
                    count += 1 
            indexFreqs[word2idx[word]] = count
        
        print("逆词频计算结束...")
        self.indexFreqs = indexFreqs
        

# 定义模型

## 定义模型结构

In [77]:
class AdversarialLSTM(BaseModel):
    def __init__(self, config, wordEmbedding, indexFreqs):
        super(AdversarialLSTM, self).__init__(config)
        self.wordEmbedding = wordEmbedding
        # 表示每个单词的逆词频
        ## 第一个词表示PAD，第二个词表示UNK，需要赋默认值
        indexFreqs[0], indexFreqs[1] = 20000, 10000 
        self.indexFreqs = indexFreqs
        ## 根据逆词频计算权重
        self.wordWeights = tf.cast(tf.reshape(indexFreqs/tf.reduce_sum(indexFreqs),
                                             [1, len(indexFreqs)]),
                                  dtype=tf.float32)
        
        
        self.build_model()
        self.init_saver()
        
    def build_model(self):
        # 定义模型输入
        self.inputX = tf.placeholder(tf.int32, [None, self.config['sequenceLength']],
                                    name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        
        self.dropoutProb = tf.placeholder(tf.float32, name="dropoutProb")
        
        # 词嵌入层
        with tf.name_scope("embedding"):
            ## 利用词频计算新的词嵌入矩阵
            normWordEmbedding = self._normalize(tf.cast(self.wordEmbedding, 
                                                       dtype=tf.float32, 
                                                       name="word2vec"), self.wordWeights)
            ## 利用词嵌入矩阵将输入数据中的词转换为词向量，[batch_size, sequence_length, embed_size]
            self.embeddedWords = tf.nn.embedding_lookup(normWordEmbedding, self.inputX)
            
            
        # 计算二元交叉熵损失
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.logits = self._Bi_LSTMAttention(self.embeddedWords)
                
                if self.config['numClasses'] == 1: 
                    self.predictions = tf.nn.sigmoid(self.logits)
                    losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                                    labels=tf.cast(
                                                                        tf.reshape(self.inputY, [-1, 1]),
                                                                        dtype=tf.float32))
                elif self.config['numClasses'] > 1: 
                    self.predictions = tf.nn.softmax(self.logits, dim=1)
                    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                           labels=self.inputY)
                loss = tf.reduce_mean(losses)
                
        with tf.name_scope("perturLoss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(logits=perturPredictions,
                                                                      labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                                    dtype=tf.float32))
                perturLoss = tf.reduce_mean(perturLosses)
                
        self.loss = loss + perturLoss
        
        # 对所有节点进行更新
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.train_op = tf.train.AdamOptimizer(
                    self.config["learningRate"]).minimize(self.loss, 
                                                          global_step=self.global_step_tensor)
        
        
    def _Bi_LSTMAttention(self, embeddedWords):
        '''
        Bi-LSTM + Attention结构
        '''
        
        # 定义双向的LSTM
        with tf.name_scope("Bi-LSTM"):
            ## 定义前向的LSTM结构
            lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=self.config['hiddenSizes'],
                                                                              state_is_tuple=True),
                                                      output_keep_prob=self.dropoutProb)
            lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=self.config['hiddenSizes'],
                                                                              state_is_tuple=True),
                                                      output_keep_prob=self.dropoutProb)
            
            ## 采用动态RNN，可以动态的输入序列的长度，没有输入则取序列全场
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(lstmFwCell,
                                                                         lstmBwCell,
                                                                         embeddedWords,
                                                                         dtype=tf.float32, 
                                                                         scope="bi-lstm")
        
        # 将前向和后向的输出相加
        with tf.name_scope("Attention"):
            ## [batch, max_time, hidden_size]
            H = outputs[0] + outputs[1]
            ## 得到Attention的输出
            output = self._attention(H)
           
        # 全连接层输出
        with tf.name_scope("output"):
            pred = tf.layers.dense(output, self.config['numClasses'],name="dense",
                                         kernel_initializer=tf.truncated_normal_initializer(stddev=0.1, seed=2019),
                                         bias_initializer=tf.constant_initializer(0.1))
        
        return pred
    
    def _attention(self, H):
        '''
        利用Attention机制得到句子的向量表示
        '''
        # 获得最后一层LSTM的神经元数量
        hiddenSize = self.config["hiddenSizes"]
        
        # 初始化一个查询向量query
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))
        # 对Bi-LSTM的输出用激活函数做非线性变换
        M = tf.tanh(H) 
        
        # 对M和W做矩阵运算，得到每一个时间步的权重，newM的大小 [batch_size, time_step, 1]
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))
        
        # 定义newM做维度转换 [batch_size, time_step]
        restoreM = tf.reshape(newM, [-1, self.config['sequenceLength']])
     
        # 对权重进行归一化处理
        self.alpha = tf.nn.softmax(restoreM)
        
        # 利用求得的alpha的值对H进行加权求和
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, 
                                                             [-1, self.config["sequenceLength"], 1]))
        # 将三维压缩成二维 [batch_size, hidden_size]
        sequeezeR = tf.squeeze(r, axis=2)
        sentenceRepren = tf.tanh(sequeezeR)
        
        # 对Attention的输出可以做dropout处理
        output = tf.nn.dropout(sentenceRepren, self.dropoutProb)
        
        return output
    
    def _normalize(self, wordEmbedding, weights):
        '''
        对word embedding结合权重做标准化处理
        '''
        # 对所有词向量求加权均值
        mean = tf.matmul(weights, wordEmbedding)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2)
        
        var = tf.matmul(weights, powWordEmbedding)
        stddev = tf.sqrt(1e-6+var)
        
        return (wordEmbedding - mean) / stddev
    
    def _addPerturbation(self, embeddedWords, loss):
        '''
        对此向量添加波动
        embeddedWords: 这里表示加权后的词向量, [batch, max_time, embed_size]
        '''
        grad, = tf.gradients(loss, embeddedWords, 
                            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        ## 返回值和原值相等，只不过截断向前的梯度
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, self.config['epsilon'])
        return embeddedWords + perturb
    
    def _scaleL2(self, x, norm_length):
        '''
        x中每个batch的元素都除以这个batch中经过l2 not稳定之后的最大值
        l2norm(x) = a * l2norm(x/a)
        x: 大小为[batch, max_time, embed_size]
        '''
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keepdims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(tf.reduce_sum(tf.pow(x/alpha, 2), (1, 2), 
                                               keepdims=True)+1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit
    
    def init_saver(self):
        '''
        初始化用于保存模型的对象
        '''
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

## 定义训练类

In [25]:
class Trainer(BaseTrain):
    def __init__(self, sess, model, data, config, logger):
        super(Trainer, self).__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]
        
    def train_epoch(self):
        num_iter_per_epoch = self.train.length // self.config['batch_size']
        
        for _ in tqdm(range(num_iter_per_epoch)):
            ## 获取训练过程的结果
            loss, metrics, step = self.train_step()
            train_acc = metrics["accuracy"]
            train_f_score = metrics["f_score"]
            
            ## 将训练过程中的损失写入
            summaries_dict = {"loss": loss, 
                             "acc": np.array(train_acc), 
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary",
                                 summaries_dict=summaries_dict)
            
            if step % self.config['evaluateEvery'] == 0:
                print("Train —— Step: {} | Loss: {} | Acc: {} | F1_Score: {}".format(
                    step, loss, train_acc, train_f_score))
                ## 对测试集进行评估
                print("\nEvaluation: \n")
                eval_losses = []
                eval_true = []
                eval_pred = []
                
                for batchEval in self.eval.iter_all(self.config["batch_size"]):
                    loss, predictions = self.eval_step(batchEval[0], batchEval[1])
                    eval_losses.append(loss)
                    eval_true.extend(batchEval[-1])
                    eval_pred.extend(predictions)
                
                getMetric = Metric(np.array(eval_pred), np.array(eval_true), self.config)
                metrics = getMetric.get_metrics()
                loss_mean = np.round(np.mean(eval_losses), 5)
                prec_mean = np.round(metrics["precision"])
                recall_mean = np.round(metrics["recall"])
                time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M%S %p")
                
                print("{} | Loss: {} | Precision: {} | Recall: {}".format(time_str,
                                                                         loss_mean,
                                                                         prec_mean,
                                                                         recall_mean))
                
                summaries_dict = {"loss": np.array(loss_mean),
                                 "precision": np.array(prec_mean),
                                 "recall": np.array(recall_mean)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                     summaries_dict=summaries_dict)
                
            if step % self.config["checkpointEvery"] == 0: 
                self.model.save(self.sess)
        
    def train_step(self):
        batch_x, batch_y = next(self.train.next_batch(self.config['batch_size']))
        feed_dict = {self.model.inputX: batch_x, 
                    self.model.inputY: batch_y, 
                    self.model.dropoutProb: self.config['dropoutProb']}
        
        _, loss, predictions, step = self.sess.run([self.model.train_op,
                                                   self.model.loss, 
                                                   self.model.predictions,
                                                   self.model.global_step_tensor],
                                                  feed_dict=feed_dict)
        getMetric = Metric(predictions, batch_y, self.config)
        metrics = getMetric.get_metrics()
            
        return loss, metrics, step
    
    def eval_step(self, batch_x, batch_y):
        '''
        使用验证集数据进行测试
        '''
        feed_dict = {self.model.inputX: batch_x,
                    self.model.inputY: batch_y,
                    self.model.dropoutProb: 1.0}
        loss, predictions = self.sess.run([self.model.loss,  self.model.predictions],
                                         feed_dict=feed_dict)
        
        return loss, predictions

# 使用数据集进行训练和预测

## 使用IMDB数据集进行训练和预测

In [21]:
# 实例化配置参数，指定训练数据的文件名
path = "../data/imdb/labeldTrain.csv"
config = Config(path)
create_dirs([config['summary_dir'], config['checkpoint_dir']])

data = Dataloader(config)
# 生成训练数据，第一个参数表示wordEmbedding文件所在的文件夹
data.dataGen("../data/imdb/", prefix="imdb")

train_X, train_y, eval_X, eval_y = data.trainReviews, data.trainLabels, data.evalReviews, data.evalLabels
wordEmbedding, labels = data.wordEmbedding, data.labelList
indexFreqs = data.indexFreqs

train_data = DataGenerator(train_X, train_y)
eval_data = DataGenerator(eval_X, eval_y)
pack_data = [train_data, eval_data]

正在计算逆词频...


HBox(children=(IntProgress(value=0, max=26679), HTML(value='')))


逆词频计算结束...


In [78]:
def main():
    tf.reset_default_graph()
    # 设置计算图配置
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9 
    
    sess = tf.Session(config=session_conf)
    
    ## 创建一个实例
    model = AdversarialLSTM(config, wordEmbedding, indexFreqs)
    
    logger = Logger(sess, config)
    
    trainer = Trainer(sess, model, pack_data, config, logger)
    trainer.train_all()

In [79]:
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 100 | Loss: 0.8709119558334351 | Acc: 0.875 | F1_Score: 0.86207

Evaluation: 

2019-08-14 19:3506 PM | Loss: 0.9097700119018555 | Precision: 0.8886 | Recall: 0.8294
Saving model...
Model saved
Train —— Step: 200 | Loss: 0.4036068916320801 | Acc: 0.95312 | F1_Score: 0.95774

Evaluation: 

2019-08-14 19:3555 PM | Loss: 0.5567100048065186 | Precision: 0.86655 | Recall: 0.88886
Saving model...
Model saved
Train —— Step: 300 | Loss: 0.5169576406478882 | Acc: 0.84375 | F1_Score: 0.83871

Evaluation: 

2019-08-14 19:3643 PM | Loss: 0.41095998883247375 | Precision: 0.88746 | Recall: 0.86877
Saving model...
Model saved


当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 400 | Loss: 0.24110829830169678 | Acc: 0.90625 | F1_Score: 0.9

Evaluation: 

2019-08-14 19:3731 PM | Loss: 0.37959998846054077 | Precision: 0.87972 | Recall: 0.88928
Saving model...
Model saved
Train —— Step: 500 | Loss: 0.346789687871933 | Acc: 0.90625 | F1_Score: 0.91176

Evaluation: 

2019-08-14 19:3819 PM | Loss: 0.40165001153945923 | Precision: 0.92577 | Recall: 0.80821
Saving model...
Model saved
Train —— Step: 600 | Loss: 0.1802903711795807 | Acc: 0.96875 | F1_Score: 0.97222

Evaluation: 

2019-08-14 19:3906 PM | Loss: 0.3541400134563446 | Precision: 0.89149 | Recall: 0.87223
Saving model...
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Model saved


当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 700 | Loss: 0.14265277981758118 | Acc: 0.95312 | F1_Score: 0.95238

Evaluation: 

2019-08-14 19:3954 PM | Loss: 0.35947999358177185 | Precision: 0.8829 | Recall: 0.89023
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.3021504878997803 | Acc: 0.92188 | F1_Score: 0.92754

Evaluation: 

2019-08-14 19:4042 PM | Loss: 0.36469998955726624 | Precision: 0.88327 | Recall: 0.89741
Saving model...
Model saved
Train —— Step: 900 | Loss: 0.18523581326007843 | Acc: 0.9375 | F1_Score: 0.9375

Evaluation: 

2019-08-14 19:4129 PM | Loss: 0.3753899931907654 | Precision: 0.89133 | Recall: 0.87786
Saving model...
Model saved


当前正处于第4次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1000 | Loss: 0.3152977228164673 | Acc: 0.89062 | F1_Score: 0.87273

Evaluation: 

2019-08-14 19:4217 PM | Loss: 0.4073899984359741 | Precision: 0.8881 | Recall: 0.88833
Saving model...
Model saved
Train —— Step: 1100 | Loss: 0.24868708848953247 | Acc: 0.9375 | F1_Score: 0.92

Evaluation: 

2019-08-14 19:4304 PM | Loss: 0.4018099904060364 | Precision: 0.90764 | Recall: 0.83727
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.2673550248146057 | Acc: 0.90625 | F1_Score: 0.90625

Evaluation: 

2019-08-14 19:4352 PM | Loss: 0.42267999053001404 | Precision: 0.90059 | Recall: 0.8507
Saving model...
Model saved


当前正处于第5次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1300 | Loss: 0.2000146508216858 | Acc: 0.96875 | F1_Score: 0.96774

Evaluation: 

2019-08-14 19:4440 PM | Loss: 0.4203700125217438 | Precision: 0.87252 | Recall: 0.89767
Saving model...
Model saved
Train —— Step: 1400 | Loss: 0.24702349305152893 | Acc: 0.95312 | F1_Score: 0.95082

Evaluation: 

2019-08-14 19:4527 PM | Loss: 0.4314900040626526 | Precision: 0.88518 | Recall: 0.86626
Saving model...
Model saved
Train —— Step: 1500 | Loss: 0.18581444025039673 | Acc: 0.95312 | F1_Score: 0.94915

Evaluation: 

2019-08-14 19:4615 PM | Loss: 0.445609986782074 | Precision: 0.88772 | Recall: 0.87582
Saving model...
Model saved

