In [1]:
from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, dump_token_embeddings, Batcher

from tqdm.autonotebook import tqdm 



In [2]:
import tensorflow as tf 
import numpy as np 
import pandas as pd 

from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [3]:
from utils import *

# 1. 配置和加载数据

In [4]:
class Config(dict):
    def __init__(self, path):
        # 训练参数
        self['num_epochs'] = 10 
        self['learningRate'] = 0.01 
        self['decay_steps'] = 200
        self['decay_rate'] = 0.9 
        self['grad_clip'] = 5.0 
        self['evaluateEvery'] = 200 
        self['checkpointEvery'] = 200 

        # 模型参数
        self['embeddingSize'] = 256   # 也就是elmo最终输出的参数
        self['hiddenSizes'] = [128]     # LSTM神经元的个数
        self['dropoutProb'] = 0.5 
        self['l2RegLambda'] = 0.00 
        self['sequenceLength'] = 200
        self['batch_size'] = 64 

        # 基础参数
        self['dataSource'] = path
        self['stopWordSource'] = "../data/english"
        self['optionFile'] = "elmo_parameters/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        self['weightFile'] = "elmo_parameters/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        self['vocabFile'] = "elmo_parameters/vocab.txt"
        self['tokenEmbeddingFile'] = "elmo_parameters/elmo_token_embeddings.hdf5"
        self['numClasses'] = 1
        self['train_size'] = 0.8   # 训练集数据占比
        self.threshold = 0.5 
        
        # 保存模型参数
        self['checkpoint_dir'] = "../model/ELMO/imdb/checkpoint"
        self['summary_dir'] = "../model/ELMO/imdb/summary"
        self['max_to_keep'] = 5 

## 1.1 读取数据

In [5]:
class Dataset:
    def __init__(self, config):
        self._dataSource = config['dataSource']
        self._stopWordSource = config['stopWordSource']
        self._optionFile = config['optionFile']
        self._weightFile = config['weightFile']
        self._vocabFile = config['vocabFile']
        self._tokenEmbeddingFile = config['tokenEmbeddingFile']
        
        self._sequenceLength = config['sequenceLength']  # 每条输入的序列处理为定长
        self._embeddingSize = config['embeddingSize']
        self._bathSize = config['batch_size']
        self._rate = config['train_size']
        
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        
    def _readData(self, filePath):
        '''
        从csv文件中读取数据集
        '''
        print(1)
        df = pd.read_csv(filePath)
        review = df["review"].tolist()
        labels = None
        if "sentiment" in df.columns:
            labels = df['sentiment'].tolist()
        elif "emotion" in df.columns:
            labels = df['emotion'].tolist()
        
        reviews = [line.strip().split() for line in review]
        return reviews, labels
    
    def _genVocabFile(self, reviews):
        '''
        用训练数据生成一个词汇文件，并加入3个特殊字符
        '''
        print(2)
        allWords = [word for review in reviews for word in review]
        ## 统计词频
        wordCount = Counter(allWords)
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortWordCount if item[1] >= 3]
        allTokens = ["<S>", "</S>", "<UNK>"] + words
        with open(self._vocabFile, "w") as fout:
            fout.write("\n".join(allTokens))
    
    def _fixedSeq(self, reviews):
        '''
        将长度超过200的句子进行截断
        '''
        print(3)
        return [review[:self._sequenceLength] for review in reviews]
    
    def _genElmoEmbedding(self):
        '''
        调用ELMO源码中的dump_token_embeddings方法，基于字符的表示生成词向量表示
        并保存为hdf5文件，文件中embedding键对应的value就是词汇表文件中各词汇的向量表示
        这些词汇的向量表示之后会作为BiLM的初始化输入
        '''
        dump_token_embeddings(self._vocabFile, self._optionFile, self._weightFile,
                             self._tokenEmbeddingFile)
        
    def _genTrainEvalData(self, x, y, rate):
        '''
        生成训练集和验证集
        '''
        print(4)
        y = [[item] for item in y]
        trainIndex = int(len(x) * rate)
        trainReviews = x[:trainIndex]
        trainLabels = y[:trainIndex]
        evalReviews = x[trainIndex:]
        evalLabels = y[trainIndex:]
        
        return trainReviews, trainLabels, evalReviews, evalLabels
    
    def dataGen(self):
        '''
        初始化训练集和验证集
        '''
        # 初始化数据集
        reviews, labels = self._readData(self._dataSource)
        ## 生成词汇表文件
        self._genVocabFile(reviews)
        self._genElmoEmbedding()  ## 生成embedding文件
        
        reviews = self._fixedSeq(reviews)
        ## 初始化训练集和测试集
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviews, labels, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels

# 2. 构建模型

In [6]:
class BiLSTMAttention(BaseModel):
    def __init__(self, config):
        super().__init__(config)
        self._init_elmo()
        self.build_model()
        self.init_saver()
        
    def build_model(self):
        # 输入层
        self.inputX = tf.placeholder(tf.float32, [None, self.config['sequenceLength'], self.config['embeddingSize']], name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None], name="inputY")
        
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        

        # 定义embedding层
        with tf.name_scope("Embedding"):
            embeddingW = tf.get_variable("embeddingW", 
                                        shape=[self.config['embeddingSize'], self.config['embeddingSize']],
                                        initializer=tf.initializers.glorot_normal())
            reshapeInputX = tf.reshape(self.inputX, shape=[-1, self.config['embeddingSize']])
            self.embededWords = tf.reshape(tf.matmul(reshapeInputX, embeddingW), 
                                           shape=[-1, self.config['sequenceLength'], self.config['embeddingSize']])
        
            self.embededWords = tf.nn.dropout(self.embededWords, self.dropout_keep_prob)
        
        # 定义双向LSTM模型
        with tf.name_scope("Bi-LSTM"):
            for idx, hiddenSize in enumerate(self.config['hiddenSizes']):
                with tf.name_scope(f"Layer_{idx}"):
                    ## 定义前向的LSTM结构
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize,
                                                                                  state_is_tuple=True),
                                                              output_keep_prob=self.dropout_keep_prob)
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize,
                                                                                  state_is_tuple=True),
                                                              output_keep_prob=self.dropout_keep_prob)
                    
                    # 采用动态RNN
                    outputs_, current_state = tf.nn.bidirectional_dynamic_rnn(lstmFwCell, lstmBwCell,
                                                                             self.embededWords,
                                                                             dtype=tf.float32,
                                                                             scope=f"bi-lstm_{idx}")
                    # 对outputs的fw和bw结果进行拼接
                    ## [batch_size, seq_len, hidden_size*2]
                    self.embededWords = tf.concat(outputs_, 2)
                    
        ## 按照最后一个维度进行切分
        outputs  = tf.split(self.embededWords, 2, -1)
        with tf.name_scope("Attention"):
            ## 维度是 [batch, seq_len, hidden_size]
            H = outputs[0] + outputs[1]
            ## 得到Attention的输出
            output = self._attention(H)
            outputSize = self.config['hiddenSizes'][-1]
            
        # 输出层
        with tf.name_scope("output"):
            self.logits = tf.layers.dense(output, self.config['numClasses'],
                                         kernel_initializer=tf.initializers.glorot_normal(),
                                         bias_initializer=tf.initializers.constant(0.1))
            self.predictions = tf.nn.sigmoid(self.logits)
            
        # 损失的计算
        l2_loss = tf.constant(0.0)
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                           dtype=tf.float32),
                                                            logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            if self.config['l2RegLambda']:
                with tf.variable_scope("dense", reuse=True):
                    outputW = tf.get_variable("kernel")
                    l2_loss += tf.nn.l2_loss(outputW)
                    
            self.loss += self.config['l2RegLambda'] * l2_loss
        
        
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(self.config['learningRate'],
                                                      self.global_step_tensor,
                                                      self.config['decay_steps'],
                                                      self.config['decay_rate'],
                                                      staircase=True)
            ## 使用梯度削减防止梯度爆炸
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (tf.clip_by_norm(grad, self.config['grad_clip']), var)
                    
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step_tensor)
            
    # 定义注意力结构
    def _attention(self, H):
        # 获得最后一层LSTM的神经元数量
        hiddenSize = self.config['hiddenSizes'][-1]
        # 初始化key
        key = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1), name="key")
        # 对Bi-LSTM的结果进行激活
        M = tf.tanh(H)
        ## 形状 [batch*seq_len, 1]
        restoreM = tf.tensordot(M, key, axes=((2), (0)))
        #restoreM = tf.squeeze(newM, 2)
        # 用归一化除以得到 [batch, seq_len]
        self.alpha = tf.nn.softmax(restoreM)
        
        # 利用求得的权重对H进行加权求和
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.expand_dims(self.alpha, 2))
        output = tf.tanh(tf.squeeze(r, 2))
        # 进行dropout处理
        output = tf.nn.dropout(output, self.dropout_keep_prob)
        return output
    
    
    def _init_elmo(self):
        ## elmo层的输入
        self.inputData = tf.placeholder(tf.int32, [None, None], name="input_index")
        with tf.variable_scope("bilm", reuse=True):
            bilm = BidirectionalLanguageModel(self.config['optionFile'],
                                             self.config['weightFile'],
                                             use_character_inputs=False,
                                             embedding_weight_file=self.config['tokenEmbeddingFile'])
            #batcher = TokenBatcher(self.config['vocabFile'])
            # 生成batch数据
            #inputDataIndex = batcher.batch_sentences(self.inputX)
            inputEmbeddingOp = bilm(self.inputData)
            self.elmoEmbeddings = weight_layers('input', inputEmbeddingOp, l2_coef=0.0)
        
    
    def init_saver(self):
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

# ３. 训练模型

In [7]:
class Trainer(BaseTrain):
    def __init__(self, sess, model, data, config, logger):
        super().__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]
        
        
    def train_epoch(self):
        num_iter_per_epoch = self.train.length // self.config['batch_size']
        for i in tqdm(range(num_iter_per_epoch)):
            ## 获取训练结果
            loss, metrics, step = self.train_step()
            train_acc = metrics['accuracy']
            train_f_score = metrics['f_score']
            
            # 将训练过程的损失写入
            summaries_dict = {"loss": loss, 
                             "acc": np.array(train_acc),
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary",
                                 summaries_dict=summaries_dict)
            if step % self.config['evaluateEvery'] == 0: 
                print("Train —— Step: {} | Loss: {} | Acc: {} : F1_Score: {}".format(
                    step, loss, train_acc, train_f_score))
                # 对测试集进行评估
                eval_losses = []
                eval_pred = []
                eval_true = []
                for batchEval in self.eval.iter_all(self.config['batch_size']):
                    loss, predictions = self.eval_step(batchEval)
                    eval_losses.append(loss)
                    eval_pred.extend(predictions)
                    eval_true.extend(batchEval[-1])
                getMetric = Metric(np.array(eval_pred), np.array(eval_true),
                                  self.config)
                metrics = getMetric.get_metrics()
                eval_prec = np.round(metrics['precision'], 5)
                eval_recall = np.round(metrics['recall'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)
                print("Evaluation —— Loss: {} | Precision: {} | Recall: {}".format(
                    loss_mean, eval_prec, eval_recall))
                summaries_dict = {"loss": np.array(loss_mean),
                                 "precision": np.array(eval_prec), 
                                 "recall": np.array(eval_recall)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                     summaries_dict=summaries_dict)
            if step % self.config['checkpointEvery'] == 0: 
                self.model.save(self.sess)
            
            
    def train_step(self):
        batch_x,  batch_y = next(self.train.next_batch(self.config['batch_size']))
        feed_dict = {self.model.inputX: elmo(batch_x),
                     #self.model.inputL: batch_len, 
                    self.model.inputY: batch_y,
                    self.model.dropout_keep_prob: self.config['dropoutProb']}
    
        _, loss, predictions, step = self.sess.run([self.model.train_op,
                                                   self.model.loss,
                                                   self.model.predictions, 
                                                   self.model.global_step_tensor],
                                                  feed_dict=feed_dict)
        getMetric = Metric(predictions, batch_y, self.config)
        metrics = getMetric.get_metrics()
        return loss, metrics, step
    
    def eval_step(self, *batch):
        feed_dict = {self.model.inputX: elmo(batch[0]),
                     #self.model.inputL: batch[1],
                    self.model.inputY: batch[-1],
                    self.model.dropout_keep_prob: 1.0}
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                         feed_dict=feed_dict)
        return loss, predictions
    
    def elmo(self, reviews):
        batcher = TokenBatcher(self.config['vocabFile'])
        # 生成batch数据
        inputDataIndex = batcher.batch_sentences(reviews)
        # 计算ELMO向量表示
        elmoVec = self.sess.run(self.model.elmoEmbeddings, 
                                feed_dict={self.model.inputData: inputDataIndex})["weighted_op"]
        return elmoVec

# 4. 使用数据集训练

## 4.1 使用imdb数据集

In [8]:
path = "../data/imdb/labeldTrain.csv"
config = Config(path)
data = Dataset(config)

In [9]:
tf.reset_default_graph()
data.dataGen()

1
2
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use the `axis` argument instead
USING SKIP CONNECTIONS
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
3
4


In [10]:
train_X, train_y, eval_X, eval_y = np.array(data.trainReviews), np.array(data.trainLabels), np.array(data.evalReviews), np.array(data.evalLabels)

In [11]:
train_data = DataGenerator(train_y, train_X)
eval_data = DataGenerator(eval_y, eval_X)
pack_data = [train_data, eval_data]

## 设置计算图的配置
session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
session_conf.gpu_options.allow_growth = True
session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9 

sess = tf.Session(config=session_conf)

# 创建模型
model = BiLSTMAttention(config)

KeyError: 'Unable to open object (component not found)'

In [59]:
logger = Logger(sess, config)

In [60]:
trainer = Trainer(sess, model, pack_data, config, logger)

In [61]:
trainer.train_all()


当前正处于第1次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第2次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第3次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第4次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第5次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第6次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第7次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第8次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第9次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



当前正处于第10次迭代


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


