In [1]:
import numpy as np
import pandas as pd 
import json 

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf 
from collections import Counter
from tqdm.autonotebook import tqdm 

In [2]:
from utils import *

# 1. 定义基础的配置类

In [31]:
class Config(dict):
    """Dictionary of hyper-parameters and paths for the Transformer text classifier.

    Every setting is stored as a dict item so it can be read with
    ``config['name']``. ``threshold`` is additionally exposed as an instance
    attribute for code that reads ``config.threshold``.

    Args:
        path: Location of the labelled dataset; stored under ``'dataSource'``.
    """

    def __init__(self, path=None):
        super().__init__()

        # Training parameters
        self['num_epochs'] = 2
        self['batch_size'] = 64
        self['sequenceLength'] = 200
        self['evaluateEvery'] = 100      # evaluate whenever step % evaluateEvery == 0
        self['checkpointEvery'] = 100    # save the model whenever step % checkpointEvery == 0

        # Learning-rate decay
        self['learningRate'] = 0.01
        self['decay_steps'] = 100   # number of steps between two learning-rate decays
        self['decay_rate'] = 0.9    # factor applied to the learning rate at each decay
        self['grad_clip'] = 4.0     # norm used to clip each gradient

        # Model parameters
        self['embeddingSize'] = 200
        # Number of filters of the inner 1-D convolution; the outer convolution
        # must use embeddingSize filters because of the shortcut connection.
        self["filters"] = 128
        self['numHeads'] = 8      # number of attention heads
        self['numBlocks'] = 1     # number of Transformer blocks
        self['epsilon'] = 1e-8    # small constant added to the variance in LayerNorm
        self['attention_keepProb'] = 0.9  # keep probability of the multi-head attention dropout
        # Fed as the *keep* probability of the dropout placed before the output layer.
        self['dropoutProb'] = 0.5
        self['l2RegLambda'] = 0.000

        # General settings
        self['dataSource'] = path
        self['stopWordSource'] = "../data/english"
        self['numClasses'] = 1
        self['train_size'] = 0.8   # fraction of the data used for training
        # Stored as a dict item like every other setting, and mirrored as an
        # attribute so existing ``config.threshold`` readers keep working.
        # NOTE(review): presumably the decision threshold used by Metric -- confirm.
        self['threshold'] = 0.5
        self.threshold = self['threshold']

        # Where the model is saved
        self['checkpoint_dir'] = "../model/Transformer/imdb/checkpoint"
        self['summary_dir'] = "../model/Transformer/imdb/summary"
        self['max_to_keep'] = 5

# 2. 定义模型类和训练类

## 2.1 定义模型类

In [23]:
def get_siusoid_encoding_table(config, padding_idx=0):
    """Build the fixed sinusoidal position-encoding table.

    Args:
        config: Mapping providing ``'sequenceLength'`` (number of positions)
            and ``'embeddingSize'`` (width of each position vector).
        padding_idx: Row that is overwritten with zeros; pass ``None`` to keep
            every row.

    Returns:
        Array of shape ``(sequenceLength + 1, embeddingSize)``. Entry
        ``[p, j]`` is ``sin(p / 10000 ** (2 * (j // 2) / embeddingSize))`` for
        even ``j`` and the cosine of the same angle for odd ``j``.
    """
    num_rows = config['sequenceLength'] + 1
    hidden_size = config['embeddingSize']

    # One divisor per embedding column; columns 2i and 2i+1 share the same one.
    divisors = np.array([np.power(10000, 2 * (col // 2) / hidden_size)
                         for col in range(hidden_size)])

    # Broadcasting a column of positions against the row of divisors yields
    # the angle of every (position, column) pair at once.
    row_positions = np.arange(num_rows).reshape(-1, 1)
    table = row_positions / divisors

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns: dim 2i
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns: dim 2i+1

    if padding_idx is not None:
        table[padding_idx] = 0.

    return table

In [24]:
def get_position_input(batch_x, config):
    """Turn a batch of token ids into a batch of position indices.

    Each row receives the positions ``1..k`` followed by zeros, where ``k`` is
    the number of non-zero ids in that row and the row is filled up to
    ``config['sequenceLength']`` entries.

    Args:
        batch_x: Token ids of shape ``[batch, seq_len]``; id 0 is counted as
            padding.
        config: Mapping providing ``'sequenceLength'``.

    Returns:
        ``np.ndarray`` of position indices, one row per input row.
    """
    seq_len = config['sequenceLength']
    # Number of non-zero ids per sentence (1-D array, one entry per row).
    token_counts = np.sum(~np.equal(batch_x, 0), axis=1)

    rows = []
    for count in token_counts:
        indices = list(range(1, count + 1))
        indices.extend([0] * (seq_len - count))
        rows.append(indices)
    return np.array(rows)

In [25]:
class Transfomer(BaseModel):
    """Transformer-encoder text classifier built as a TensorFlow 1.x graph.

    Word and position embeddings are summed, passed through
    ``config['numBlocks']`` blocks of multi-head self-attention followed by a
    position-wise feed-forward layer, flattened, and fed to one dense output
    layer.

    ``self.config`` and ``self.global_step_tensor`` are expected to be set up
    by ``BaseModel`` (not shown here -- verify against utils).

    Args:
        config: Hyper-parameter mapping (see ``Config``).
        wordEmbedding: Pre-trained word-embedding matrix.
        posEmbedding: Position-embedding matrix.
    """

    def __init__(self, config, wordEmbedding, posEmbedding):
        super().__init__(config)
        self.wordEmbedding = wordEmbedding
        self.posEmbedding = posEmbedding
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Create placeholders, the network, the loss and the training op."""
        # Input layer
        self.inputX = tf.placeholder(tf.int32, [None, self.config['sequenceLength']], name="inputX")
        self.inputPos = tf.placeholder(tf.int32, [None, self.config['sequenceLength']], name="inputPos")
        self.inputY = tf.placeholder(tf.float32, [None], name="inputY")

        self.atten_keep_prob = tf.placeholder(tf.float32, name="atten_keep_prob")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="doprout_keep_prob")

        self._init_weights()

        with tf.name_scope("embedding"):
            self.word_embeded = tf.nn.embedding_lookup(self.word_embeddings, self.inputX)
            self.pos_embeded = tf.nn.embedding_lookup(self.pos_embeddings, self.inputPos)
            # Shape [batch, seq_len, embed_size]
            self.embeded = tf.add(self.word_embeded, self.pos_embeded)

        with tf.name_scope("transformer"):
            # Stack the blocks; each block consumes the previous block's output.
            for i in range(self.config["numBlocks"]):
                with tf.name_scope(f"block_{i}"):
                    # Shape [batch, seq_len, embed_size]
                    multiHeadAtten = self._multiheadAttention(rawKeys=self.inputX,
                                                              queries=self.embeded,
                                                              keys=self.embeded)
                    self.embeded = self._feedForward(multiHeadAtten,
                                                     [self.config['filters'], self.config['embeddingSize']])
            outputs = tf.reshape(self.embeded,
                                 [-1, self.config['sequenceLength'] * self.config['embeddingSize']])
        outputSize = outputs.get_shape().as_list()[-1]

        with tf.name_scope("dropout"):
            outputs = tf.nn.dropout(outputs, keep_prob=self.dropout_keep_prob)

        # Fully connected output layer
        with tf.name_scope("output"):
            outputW = tf.get_variable("outputW",
                                      shape=[outputSize, self.config['numClasses']],
                                      initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.1, shape=[self.config['numClasses']]), name="outputB")
            l2Loss = tf.nn.l2_loss(outputW)
            self.logits = tf.add(tf.matmul(outputs, outputW), outputB, name="logits")

        if self.config['numClasses'] == 1:
            self.predictions = tf.nn.sigmoid(self.logits)
        elif self.config['numClasses'] > 1:
            self.predictions = tf.nn.softmax(self.logits, axis=-1)

        # Loss
        with tf.name_scope("loss"):
            if self.config['numClasses'] == 1:
                # Binary case: a single logit per example.
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32),
                    logits=self.logits)
            else:
                # Multi-class case: pair the softmax predictions with a softmax
                # cross-entropy instead of a per-logit sigmoid loss.
                # inputY holds one value per example, so it is treated as the
                # class index -- TODO confirm against the label encoding.
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=tf.cast(self.inputY, dtype=tf.int32),
                    logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            if self.config['l2RegLambda'] > 0:
                self.loss += self.config['l2RegLambda'] * l2Loss

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(self.config['learningRate'],
                                                       self.global_step_tensor,
                                                       self.config['decay_steps'],
                                                       self.config['decay_rate'],
                                                       staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)

            # Clip every gradient by norm to guard against exploding gradients.
            grads_and_vars = optimizer.compute_gradients(self.loss)
            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (tf.clip_by_norm(grad, self.config['grad_clip']), var)
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step_tensor)

    def _init_weights(self):
        """Create the frozen (non-trainable) word and position embedding variables."""
        with tf.name_scope("weights"):
            self.word_embeddings = tf.Variable(tf.cast(self.wordEmbedding, dtype=tf.float32),
                                               name="wordEmbedding", trainable=False)
            self.pos_embeddings = tf.Variable(tf.cast(self.posEmbedding, dtype=tf.float32),
                                              name="posEmbedding", trainable=False)

    def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None, causality=False,
                            scope="multi-headAttention"):
        """Multi-head scaled dot-product attention with residual and layer norm.

        Args:
            rawKeys: Token ids ``[batch, seq_len]``; positions equal to 0 are
                masked out as padding.
            queries: Query tensor ``[batch, seq_len, embed_size]``.
            keys: Key/value tensor ``[batch, seq_len, embed_size]``.
            numUnits: Total width of all heads; defaults to the last dimension
                of ``queries``.
            causality: When True, each position may only attend to itself and
                earlier positions (decoder-style masking).
            scope: Name scope for the created ops.

        Returns:
            Tensor ``[batch, seq_len, embed_size]``.
        """
        numHeads = self.config['numHeads']

        # A large finite negative number instead of -inf: a row whose keys are
        # all masked (an all-padding sequence) then softmaxes to a uniform
        # distribution rather than NaN, which would poison the loss.
        maskValue = -2.0 ** 32 + 1.0

        # Default to the embedding size when no explicit width is given.
        if numUnits is None:
            numUnits = queries.get_shape().as_list()[-1]

        with tf.name_scope(scope):
            # Non-linear projections producing the values of all heads at once.
            Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
            K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
            V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)

            # Split the last axis into numHeads pieces and stack them on the
            # batch axis: [batch*numHeads, seq_len, embed_size/numHeads]
            Q_ = tf.concat(tf.split(Q, numHeads, axis=-1), axis=0)
            K_ = tf.concat(tf.split(K, numHeads, axis=-1), axis=0)
            V_ = tf.concat(tf.split(V, numHeads, axis=-1), axis=0)

            # Dot product between queries and keys: [batch*heads, seq_len, seq_len]
            similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))

            # Scale the dot products
            scaledSimilarity = similary / (K_.get_shape().as_list()[-1] ** 0.5)

            # Padding mask: repeat the ids per head -> [batch*heads, seq_len]
            keyMasks = tf.tile(rawKeys, [numHeads, 1])
            # Add a query axis and repeat -> [batch*heads, seq_len, seq_len]
            keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])
            # Same shape as scaledSimilarity, filled with the mask value
            paddings = tf.ones_like(scaledSimilarity) * maskValue

            # Where the key id is 0 take the mask value, otherwise keep the score.
            maskedSimilarity = tf.where(tf.equal(keyMasks, 0), paddings, scaledSimilarity)

            # Decoder-style masking: hide everything after the current position.
            if causality:
                # [seq_len, seq_len]
                diagVals = tf.ones_like(maskedSimilarity[0, :, :])
                # Lower-triangular matrix, [seq_len, seq_len]
                tril = tf.linalg.LinearOperatorLowerTriangular(diagVals).to_dense()
                masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(maskedSimilarity)[0], 1, 1])
                paddings = tf.ones_like(masks) * maskValue
                maskedSimilarity = tf.where(tf.equal(masks, 0), paddings, maskedSimilarity)

            # Attention weights
            weights = tf.nn.softmax(maskedSimilarity)
            # Weighted sum of the values
            outputs = tf.matmul(weights, V_)
            # Merge the heads back: [batch_size, seq_len, embed_size]
            outputs = tf.concat(tf.split(outputs, numHeads, axis=0), axis=2)
            outputs = tf.nn.dropout(outputs, self.atten_keep_prob)

            # Residual connection around the sub-layer
            outputs = tf.add(outputs, queries)
            # Layer normalisation
            outputs = self._layerNormalization(outputs)
            return outputs

    def _layerNormalization(self, inputs, scope="layerNorm"):
        """Normalise ``inputs`` over the last axis with learned scale and shift."""
        with tf.name_scope(scope):
            # Input shape: [batch, seq_len, embed_size]
            inputsShape = inputs.get_shape()
            paramsShape = inputsShape[-1:]

            # Mean and variance are taken over the last axis;
            # both have shape [batch, seq_len, 1].
            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            beta = tf.Variable(tf.zeros(paramsShape))
            gamma = tf.Variable(tf.ones(paramsShape))
            normalized = (inputs - mean) / ((variance + self.config['epsilon']) ** 0.5)
            outputs = gamma * normalized + beta
            return outputs

    def _feedForward(self, inputs, filters, scope="feedForward"):
        """Position-wise feed-forward layer made of two kernel-size-1 convolutions.

        Args:
            inputs: Tensor ``[batch, seq_len, embed_size]``.
            filters: ``[inner_filters, outer_filters]``; the outer count must
                equal the last dimension of ``inputs`` for the residual add.
            scope: Name scope for the created ops.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        with tf.name_scope(scope):
            # Inner layer
            params = {"inputs": inputs, "filters": filters[0], "kernel_size": 1,
                      "activation": tf.nn.relu, "use_bias": True}

            outputs = tf.layers.conv1d(**params)

            # Outer layer
            params = {"inputs": outputs, "filters": filters[1], "kernel_size": 1,
                      "activation": None, "use_bias": True}
            outputs = tf.layers.conv1d(**params)

            # Residual connection
            outputs += inputs

            # Layer normalisation
            outputs = self._layerNormalization(outputs)

            return outputs

    def init_saver(self):
        """Create the checkpoint saver, keeping ``config['max_to_keep']`` checkpoints."""
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

## 2.2 定义训练类

In [26]:
class Trainer(BaseTrain):
    """Runs training steps and periodic evaluation for the Transformer model.

    ``data`` is a pair ``[train_generator, eval_generator]``. ``self.sess``,
    ``self.model``, ``self.config`` and ``self.logger`` are expected to be set
    by ``BaseTrain`` (not shown here -- verify against utils).
    """

    def __init__(self, sess, model, data, config, logger):
        super().__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]

    def train_epoch(self):
        """Run one epoch, logging, evaluating and checkpointing on the configured steps."""
        batch_size = self.config['batch_size']
        steps_in_epoch = self.train.length // batch_size

        for _ in tqdm(range(steps_in_epoch)):
            # One optimisation step
            train_loss, train_metrics, step = self.train_step()
            train_acc = train_metrics['accuracy']
            train_f_score = train_metrics['f_score']

            # Record the training loss and metrics
            train_summary = {"loss": train_loss,
                             "acc": np.array(train_acc),
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary",
                                  summaries_dict=train_summary)

            if step % self.config['evaluateEvery'] == 0:
                print("Train —— Step: {} | Loss: {} | Acc: {} : F1_Score: {}".format(
                    step, train_loss, train_acc, train_f_score))

                # Evaluate on the whole evaluation set
                eval_losses, eval_pred, eval_true = [], [], []
                for eval_batch in self.eval.iter_all(batch_size):
                    batch_loss, batch_pred = self.eval_step(eval_batch[0], eval_batch[1])
                    eval_losses.append(batch_loss)
                    eval_pred.extend(batch_pred)
                    eval_true.extend(eval_batch[-1])

                eval_metrics = Metric(np.array(eval_pred), np.array(eval_true),
                                      self.config).get_metrics()
                eval_prec = np.round(eval_metrics['precision'], 5)
                eval_recall = np.round(eval_metrics['recall'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)
                print("Evaluation —— Loss: {} | Precision: {} | Recall: {}".format(
                    loss_mean, eval_prec, eval_recall))

                eval_summary = {"loss": np.array(loss_mean),
                                "precision": np.array(eval_prec),
                                "recall": np.array(eval_recall)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                      summaries_dict=eval_summary)

            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)

    def train_step(self):
        """Fetch one training batch, apply one update and score the batch.

        Returns:
            Tuple ``(loss, metrics, step)`` where ``metrics`` is the result of
            ``Metric.get_metrics()`` on this batch.
        """
        model = self.model
        batch_x, batch_y = next(self.train.next_batch(self.config['batch_size']))
        feed_dict = {
            model.inputX: batch_x,
            model.inputPos: get_position_input(batch_x, self.config),
            model.inputY: batch_y,
            model.dropout_keep_prob: self.config['dropoutProb'],
            model.atten_keep_prob: self.config['attention_keepProb'],
        }
        fetches = [model.train_op, model.loss, model.predictions, model.global_step_tensor]
        _, loss, predictions, step = self.sess.run(fetches, feed_dict=feed_dict)
        metrics = Metric(predictions, batch_y, self.config).get_metrics()
        return loss, metrics, step

    def eval_step(self, batch_x, batch_y):
        """Compute loss and predictions for one batch with dropout disabled.

        Returns:
            Tuple ``(loss, predictions)``.
        """
        model = self.model
        feed_dict = {
            model.inputX: batch_x,
            model.inputPos: get_position_input(batch_x, self.config),
            model.inputY: batch_y,
            # Keep probabilities of 1.0 switch both dropouts off.
            model.dropout_keep_prob: 1.0,
            model.atten_keep_prob: 1.0,
        }
        loss, predictions = self.sess.run([model.loss, model.predictions],
                                          feed_dict=feed_dict)
        return loss, predictions

# 3. 使用数据进行训练

## 3.1 使用IMDB数据集

In [34]:
def main():
    """Train and periodically evaluate the Transformer classifier on the IMDB data.

    Builds the config, prepares the dataset and embeddings, creates a fresh
    graph and session, then runs the full training loop. The session is closed
    when training ends or fails, so a later run in the same kernel does not
    inherit its device memory.
    """
    path = "../data/imdb/labeldTrain.csv"
    config = Config(path)
    create_dirs([config["summary_dir"], config['checkpoint_dir']])
    data = Dataset(config)

    # Generate the training data; the first argument is the folder that
    # contains the wordEmbedding file.
    data.dataGen("../data/imdb", prefix="imdb")

    train_X, train_y = data.trainReviews, data.trainLabels
    eval_X, eval_y = data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding
    posEmbedding = get_siusoid_encoding_table(config)

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()

    # Session configuration
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    sess = tf.Session(config=session_conf)
    try:
        # Build the model
        model = Transfomer(config, wordEmbedding, posEmbedding)
        logger = Logger(sess, config)

        trainer = Trainer(sess, model, pack_data, config, logger)
        trainer.train_all()
    finally:
        # Release the session's resources even when training raises.
        sess.close()

In [35]:
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 100 | Loss: 1.3383203744888306 | Acc: 0.53968 : F1_Score: 0.61333
Evaluation —— Loss: 1.3322299718856812 | Precision: 0.0 | Recall: 0.0
Saving model...
Model saved
Train —— Step: 200 | Loss: 0.742250919342041 | Acc: 0.52381 : F1_Score: 0.28571
Evaluation —— Loss: 0.7258599996566772 | Precision: 0.50673 | Recall: 0.99723
Saving model...
Model saved
Train —— Step: 300 | Loss: 0.7877440452575684 | Acc: 0.39683 : F1_Score: 0.32143
Evaluation —— Loss: 0.6904000043869019 | Precision: 0.51759 | Recall: 0.98339
Saving model...
Model saved

当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 400 | Loss: 0.6745485663414001 | Acc: 0.61905 : F1_Score: 0.42857
Evaluation —— Loss: 0.5920699834823608 | Precision: 0.62135 | Recall: 0.96915
Saving model...
Model saved
Train —— Step: 500 | Loss: 0.45134687423706055 | Acc: 0.77778 : F1_Score: 0.76667
Evaluation —— Loss: 0.41802000999450684 | Precision: 0.79611 | Recall: 0.84177
Saving model...
Model saved
Train —— Step: 600 | Loss: 0.3355503976345062 | Acc: 0.87302 : F1_Score: 0.85185
Evaluation —— Loss: 0.4621100127696991 | Precision: 0.7388 | Recall: 0.92642
Saving model...
Model saved


## 3.2 使用Yelps数据集

In [32]:
def main():
    """Train and periodically evaluate the Transformer classifier on the Yelp data.

    Same pipeline as the IMDB run, with Yelp-specific output directories and
    evaluation/checkpoint intervals of 400 steps. The session is closed when
    training ends or fails, so a later run in the same kernel does not inherit
    its device memory.
    """
    path = "../data/yelps/yelps.csv"
    config = Config(path)
    config['summary_dir'] = "../model/Transformer/yelps/summary"
    config['checkpoint_dir'] = "../model/Transformer/yelps/checkpoint"
    config['evaluateEvery'] = 400
    config['checkpointEvery'] = 400
    create_dirs([config["summary_dir"], config['checkpoint_dir']])
    data = Dataset(config)

    # Generate the training data; the first argument is the folder that
    # contains the wordEmbedding file.
    data.dataGen("../data/yelps", prefix="yelps")

    train_X, train_y = data.trainReviews, data.trainLabels
    eval_X, eval_y = data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding
    posEmbedding = get_siusoid_encoding_table(config)

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()

    # Session configuration
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    sess = tf.Session(config=session_conf)
    try:
        # Build the model
        model = Transfomer(config, wordEmbedding, posEmbedding)
        logger = Logger(sess, config)

        trainer = Trainer(sess, model, pack_data, config, logger)
        trainer.train_all()
    finally:
        # Release the session's resources even when training raises.
        sess.close()

In [33]:
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 400 | Loss: 0.7525515556335449 | Acc: 0.46032 : F1_Score: 0.55263
Evaluation —— Loss: 0.6349300146102905 | Precision: 0.70607 | Recall: 0.81712
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.3563399016857147 | Acc: 0.84127 : F1_Score: 0.875
Evaluation —— Loss: 0.3035399913787842 | Precision: 0.87832 | Recall: 0.93405
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.300914466381073 | Acc: 0.90476 : F1_Score: 0.93023
Evaluation —— Loss: 0.28415998816490173 | Precision: 0.88684 | Recall: 0.9401
Saving model...
Model saved

当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 1600 | Loss: 0.4448325037956238 | Acc: 0.80952 : F1_Score: 0.84615
Evaluation —— Loss: 0.2841300070285797 | Precision: 0.89375 | Recall: 0.93602
Saving model...
Model saved
Train —— Step: 2000 | Loss: 0.2741104066371918 | Acc: 0.84127 : F1_Score: 0.88372
Evaluation —— Loss: 0.2786099910736084 | Precision: 0.91397 | Recall: 0.90119
Saving model...
Model saved
Train —— Step: 2400 | Loss: 0.16847646236419678 | Acc: 0.88889 : F1_Score: 0.91954
Evaluation —— Loss: 0.26903000473976135 | Precision: 0.90682 | Recall: 0.92114
Saving model...
Model saved


结论：在yelps数据集下能够得到还不错的效果，在IMDB数据集下效果相对一般；使用Transformer进行文本分类，超参数调节是一个比较重要的点，不然会出现梯度消失的情况，关键点是block不能太多，因为文本分类是一个比较简单的任务，不需要过于复杂的网络