In [1]:
import numpy as np 
import pandas as pd 

import json 
import warnings

warnings.filterwarnings("ignore")

In [2]:
from utils  import * 
from collections import Counter 
from tqdm.autonotebook import tqdm 

In [3]:
import tensorflow as tf 

# 1. 定义基本的配置类

In [4]:
class Config(dict):
    """Dictionary of hyper-parameters and paths for one DPCNN training run.

    Every setting is stored as an ordinary dict item (``config['batch_size']``),
    except ``threshold``, which is set as an instance attribute
    (``config.threshold``) and is therefore not one of the dict keys.

    Args:
        path: Value stored under ``'dataSource'``; defaults to ``None``.
    """

    def __init__(self, path=None):
        super().__init__()

        # Training schedule and optimisation settings.
        training = {
            'num_epochs': 5,
            'evaluateEvery': 100,
            'checkpointEvery': 100,
            'learningRate': 0.03,
            'decay_steps': 100,   # number of steps between learning-rate decays
            'decay_rate': 0.9,    # factor applied at each learning-rate decay
            'grad_clip': 4.0,     # gradient clipping value
        }

        # Model architecture settings.
        model = {
            'embeddingSize': 200,
            'kernelSize': 3,      # region-convolution size, comparable to an n-gram width
            'num_filters': 250,   # number of filters in the region-embedding layer
            'dropoutProb': 0.7,   # fed to the model's dropout_keep_prob placeholder while training
            'l2RegLambda': 0.0001,
        }

        # Data settings.
        basics = {
            'sequenceLength': 200,
            'batch_size': 64,
            'dataSource': path,
            'stopWordSource': "../data/english",
            'numClasses': 1,
            'train_size': 0.8,    # fraction of the data used for training
        }

        # Checkpoint / summary output settings.
        saving = {
            'checkpoint_dir': "../model/DPCNN/imdb/checkpoint",
            'summary_dir': "../model/DPCNN/imdb/summary",
            'max_to_keep': 5,
        }

        for group in (training, model, basics):
            self.update(group)

        # Kept as an attribute (not a dict key), exactly like the other
        # settings' consumers may expect via ``config.threshold``.
        self.threshold = 0.5

        self.update(saving)

# 2. 定义模型类和训练类

## 2.1 定义模型类

In [5]:
class DPCNN(BaseModel):
    """Deep Pyramid CNN (DPCNN) text classifier built as a TensorFlow 1.x graph.

    Graph layout, as constructed in ``build_model``: frozen word-embedding
    lookup -> region embedding (one convolution spanning ``kernelSize`` words
    and the full embedding width) -> two width-preserving convolutions with a
    residual shortcut -> repeated down-sampling blocks (``_block``) until the
    sequence axis has length 1 -> dropout -> linear projection -> sigmoid.

    ``self.config`` and ``self.global_step_tensor`` are not defined in this
    class; they are expected to be provided by ``BaseModel``.

    Args:
        config: Mapping of hyper-parameters; the keys read here are
            ``sequenceLength``, ``embeddingSize``, ``kernelSize``,
            ``num_filters``, ``numClasses``, ``l2RegLambda``, ``learningRate``,
            ``decay_steps``, ``decay_rate``, ``grad_clip`` and ``max_to_keep``.
        wordEmbedding: Pre-trained embedding matrix used to initialise the
            non-trainable embedding variable (presumably shaped
            ``[vocab_size, embeddingSize]`` -- TODO confirm against the data
            pipeline).
    """

    def __init__(self, config, wordEmbedding):
        super().__init__(config)
        self.wordEmbedding = wordEmbedding
        self.initializer = tf.contrib.layers.xavier_initializer()
        self.build_model()
        self.init_saver()

    def build_model(self):
        """Create placeholders, the forward pass, the loss and the train op."""
        # Input placeholders.
        self.inputX= tf.placeholder(tf.int32, [None, self.config['sequenceLength']], name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None], name="inputY")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # Create the embedding, region-convolution and projection variables.
        self.init_weights()

        # Embedding layer.
        self.embeded_words = tf.nn.embedding_lookup(self.embeddings, self.inputX, name="look_up")
        # Add a channel axis so the tensor fits conv2d: [batch, seq, embedding, 1].
        self.embeded_words = tf.expand_dims(self.embeded_words, axis=-1)
        # Region embedding (comparable to n-grams). With VALID padding the
        # output is [batch, seq - kernelSize + 1, 1, num_filters].
        region_embedding = tf.nn.conv2d(self.embeded_words, self.region_w, strides=[1,1,1,1],
                                       padding="VALID")
        region_activation = tf.nn.relu(region_embedding, name="region_relu")

        # Two stacked convolutions; SAME padding keeps the shape unchanged.
        # First layer.
        conv3 = self.conv3(0, region_activation)
        conv3 = tf.layers.batch_normalization(conv3, training=self.is_training)
        pre_activation = tf.nn.relu(conv3, name="pre_activation")
        # Second layer.
        conv3 = self.conv3(1, pre_activation)
        conv3 = tf.layers.batch_normalization(conv3, training=self.is_training)
        # Residual shortcut: add the region embedding taken before its ReLU.
        conv3 = conv3 + region_embedding

        # Pooling + convolution blocks: keep halving the sequence axis until
        # its length is 1. ``k`` tracks the index of the last conv variable.
        k = 1
        while conv3.get_shape().as_list()[1] >= 2:
            conv3, k = self._block(conv3, k)
        # Drop the two singleton axes -> [batch, num_filters].
        conv3 = tf.squeeze(conv3, [1, 2])
        conv3 = tf.nn.dropout(conv3, self.dropout_keep_prob)

        with tf.name_scope("output"):
            self.logits = tf.matmul(conv3, self.w_projection) + self.b_projection
            self.predictions = tf.nn.sigmoid(self.logits)

        # Loss.
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32),
                                                             logits=self.logits)
            self.loss = tf.reduce_mean(losses)
            # L2 penalty on the weights; bias and embedding variables are
            # skipped. The projection bias is named "b_projection", which the
            # substring "bias" does not match, so it is excluded explicitly.
            # NOTE(review): the variables created by the batch-normalization
            # layers are still penalised -- confirm that this is intended.
            if self.config['l2RegLambda'] > 0:
                l2_excluded = ("bias", "b_projection", "Embedding")
                l2_loss = tf.add_n([tf.nn.l2_loss(cand_var)
                                    for cand_var in tf.trainable_variables()
                                    if not any(tag in cand_var.name for tag in l2_excluded)])
                self.loss += self.config['l2RegLambda'] * l2_loss

        # Make the train op depend on the ops collected in UPDATE_OPS so they
        # run on every training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            learning_rate = tf.train.exponential_decay(self.config['learningRate'], self.global_step_tensor,
                                                      self.config['decay_steps'],
                                                      self.config['decay_rate'],
                                                      staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)

            # Clip each gradient by norm to guard against exploding gradients.
            grads_and_vars = optimizer.compute_gradients(self.loss)
            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (tf.clip_by_norm(grad, self.config['grad_clip']), var)
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step_tensor)

    def init_weights(self):
        """Create the embedding, region-embedding and projection variables."""
        with tf.name_scope("weights"):
            # Frozen (trainable=False) embedding table initialised from the
            # matrix passed to the constructor.
            self.embeddings = tf.Variable(tf.cast(self.wordEmbedding, dtype=tf.float32, name="word2vec"),
                                         name="wordEmbedding", trainable=False)

            # Region-embedding convolution kernel, laid out as
            # [filter_height, filter_width, in_channels, out_channels].
            self.region_w = tf.get_variable("W_region", [self.config['kernelSize'], self.config['embeddingSize'],
                                                         1, self.config['num_filters']],
                                           initializer=self.initializer, dtype=tf.float32)
            self.w_projection = tf.get_variable("W_projection", [self.config['num_filters'], self.config['numClasses']],
                                               initializer=self.initializer, dtype=tf.float32)
            self.b_projection = tf.Variable(tf.constant(0.0, shape=[self.config['numClasses']]),
                                            dtype=tf.float32, name="b_projection")

    def conv3(self, k, input_):
        """Apply the ``k``-th width-preserving convolution to ``input_``.

        Args:
            k: Index used to name the kernel variable (``W_conv{k}``).
            input_: Tensor with ``num_filters`` channels on its last axis.

        Returns:
            The convolved tensor; SAME padding with stride 1 keeps the input
            shape.
        """
        conv3_w = tf.get_variable(f"W_conv{k}",
                                 [self.config['kernelSize'], 1, self.config['num_filters'], self.config['num_filters']],
                                 initializer=self.initializer, dtype=tf.float32)
        conv = tf.nn.conv2d(input_, conv3_w, strides=[1,1,1,1], padding="SAME")
        return conv

    def _block(self, x, k):
        """Down-sample ``x`` and apply two convolutions with a residual shortcut.

        Args:
            x: Input tensor; axis 1 is the sequence axis that gets pooled.
            k: Index of the last convolution variable created so far.

        Returns:
            Tuple ``(output, k)`` where ``k`` has been advanced by two.
        """
        # ``paddings`` has one [before, after] pair per axis of ``x``: only
        # axis 1 is padded, with a single zero appended at its end.
        x = tf.pad(x, paddings=[[0, 0], [0, 1], [0, 0], [0, 0]])

        # Max pooling over windows of 3 along axis 1 with stride 2.
        px = tf.nn.max_pool(x, [1, 3, 1, 1], strides=[1, 2, 1, 1], padding="VALID")

        # Two convolution layers follow the pooling.
        # First layer.
        k += 1
        x = tf.nn.relu(px)
        x = self.conv3(k, x)
        x = tf.layers.batch_normalization(x, training=self.is_training)

        # Second layer.
        k += 1
        x = tf.nn.relu(x)
        x = self.conv3(k, x)
        x = tf.layers.batch_normalization(x, training=self.is_training)
        # Residual shortcut from the pooled tensor.
        x = x + px
        return x, k

    def init_saver(self):
        """Create the checkpoint saver, keeping at most ``max_to_keep`` files."""
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

## 2.2 定义训练类

In [6]:
class Trainer(BaseTrain):
    """Training driver with periodic evaluation, summaries and checkpoints.

    Args:
        sess: Session used to run the model's ops.
        model: Model exposing the tensors/ops read below (``inputX``,
            ``inputY``, ``dropout_keep_prob``, ``is_training``, ``train_op``,
            ``loss``, ``predictions``, ``global_step_tensor``) and ``save``.
        data: Indexable pair; item 0 is the training data source and item 1
            the evaluation data source.
        config: Mapping providing ``batch_size``, ``evaluateEvery``,
            ``checkpointEvery`` and ``dropoutProb``.
        logger: Object whose ``summarize`` method records the metrics.
    """

    def __init__(self, sess, model, data, config, logger):
        super().__init__(sess, model, data, config, logger)
        self.train, self.eval = data[0], data[1]

    def train_epoch(self):
        """Run ``train.length // batch_size`` training steps.

        Each step is logged to the train summary. Whenever the global step is
        a multiple of ``evaluateEvery`` the whole evaluation set is scored and
        logged; whenever it is a multiple of ``checkpointEvery`` the model is
        saved.
        """
        batch_size = self.config["batch_size"]
        steps_in_epoch = self.train.length // batch_size

        for _ in tqdm(range(steps_in_epoch)):
            # One optimisation step and its batch metrics.
            train_loss, train_metrics, step = self.train_step()
            train_acc = train_metrics["accuracy"]
            train_f_score = train_metrics['f_score']

            # Record the training loss and metrics for this step.
            self.logger.summarize(
                step,
                summarizer="train",
                scope="train_summary",
                summaries_dict={
                    "loss": train_loss,
                    "acc": np.array(train_acc),
                    "f_score": np.array(train_f_score),
                },
            )

            if step % self.config['evaluateEvery'] == 0:
                print("Train —— Step: {} | Loss: {} | Acc: {} | F1_Score: {}".format(
                    step, train_loss, train_acc, train_f_score))

                # Score every batch of the evaluation set.
                eval_losses, eval_pred, eval_true = [], [], []
                for eval_batch in self.eval.iter_all(batch_size):
                    batch_loss, batch_pred = self.eval_step(eval_batch[0], eval_batch[1])
                    eval_losses.append(batch_loss)
                    eval_pred.extend(batch_pred)
                    eval_true.extend(eval_batch[-1])

                eval_metrics = Metric(np.array(eval_pred), np.array(eval_true),
                                      self.config).get_metrics()
                eval_prec = np.round(eval_metrics['precision'], 5)
                eval_recall = np.round(eval_metrics['recall'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)

                print("Evaluation —— Loss: {} | Precision: {} | Recall: {}".format(
                    loss_mean, eval_prec, eval_recall))
                self.logger.summarize(
                    step,
                    summarizer="test",
                    scope="test_summary",
                    summaries_dict={
                        "loss": np.array(loss_mean),
                        "precision": np.array(eval_prec),
                        "recall": np.array(eval_recall),
                    },
                )

            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)

    def train_step(self):
        """Run one training step on the next training batch.

        Returns:
            Tuple ``(loss, metrics, step)``: the batch loss, the metrics
            computed by ``Metric`` for the batch, and the global step value
            fetched in the same run.
        """
        batch_x, batch_y = next(self.train.next_batch(self.config["batch_size"]))
        fetches = [
            self.model.train_op,
            self.model.loss,
            self.model.predictions,
            self.model.global_step_tensor,
        ]
        feed_dict = {
            self.model.inputX: batch_x,
            self.model.inputY: batch_y,
            self.model.dropout_keep_prob: self.config['dropoutProb'],
            self.model.is_training: True,
        }
        _, loss, predictions, step = self.sess.run(fetches, feed_dict=feed_dict)
        metrics = Metric(predictions, batch_y, self.config).get_metrics()
        return loss, metrics, step

    def eval_step(self, batch_x, batch_y):
        """Score one batch with keep probability 1.0 and ``is_training`` False.

        Args:
            batch_x: Inputs fed to ``model.inputX``.
            batch_y: Labels fed to ``model.inputY``.

        Returns:
            Tuple ``(loss, predictions)`` for the batch.
        """
        feed_dict = {
            self.model.inputX: batch_x,
            self.model.inputY: batch_y,
            self.model.dropout_keep_prob: 1.0,
            self.model.is_training: False,
        }
        loss, predictions = self.sess.run(
            [self.model.loss, self.model.predictions], feed_dict=feed_dict)
        return loss, predictions

# 3. 使用数据进行训练

## 3.1 使用IMDB数据集

In [7]:
def main():
    """Train and evaluate DPCNN on the IMDB data set.

    Builds the configuration, prepares the data through ``Dataset``, creates a
    TensorFlow session and the ``DPCNN`` graph, then runs
    ``trainer.train_all()``. The session is closed when training finishes or
    raises, so its resources are not left held by the kernel.
    """
    path = "../data/imdb/labeldTrain.csv"
    config = Config(path)
    create_dirs([config["summary_dir"], config['checkpoint_dir']])
    data = Dataset(config)

    # Generate the training data; the first argument is the folder that holds
    # the word-embedding file.
    data.dataGen("../data/imdb", prefix="imdb")

    train_X, train_y, eval_X, eval_y = data.trainReviews, data.trainLabels, data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()
    # Session configuration.
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    sess = tf.Session(config = session_conf)
    try:
        # Build the model graph.
        model = DPCNN(config, wordEmbedding)

        logger = Logger(sess, config)

        trainer = Trainer(sess, model, pack_data, config, logger)

        trainer.train_all()
    finally:
        # Release the session even if graph construction or training fails.
        sess.close()

In [8]:
%%time
# Run the IMDB `main()` defined in the preceding cell; `%%time` reports the
# CPU and wall time of the whole cell.
main()

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 100 | Loss: 0.8796542882919312 | Acc: 0.71429 | F1_Score: 0.72727
Evaluation —— Loss: 1.6098599433898926 | Precision: 0.54303 | Recall: 0.99842
Saving model...
Model saved
Train —— Step: 200 | Loss: 0.3381098210811615 | Acc: 0.87302 | F1_Score: 0.86207
Evaluation —— Loss: 1.128600001335144 | Precision: 0.57256 | Recall: 0.99881
Saving model...
Model saved
Train —— Step: 300 | Loss: 0.29366499185562134 | Acc: 0.84127 | F1_Score: 0.82143
Evaluation —— Loss: 0.711870014667511 | Precision: 0.64238 | Recall: 0.99051
Saving model...
Model saved


当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 400 | Loss: 0.37266865372657776 | Acc: 0.88889 | F1_Score: 0.88889
Evaluation —— Loss: 0.3429099917411804 | Precision: 0.80134 | Recall: 0.94462
Saving model...
Model saved
Train —— Step: 500 | Loss: 0.2201843112707138 | Acc: 0.85714 | F1_Score: 0.85714
Evaluation —— Loss: 1.3241499662399292 | Precision: 0.59971 | Recall: 0.99565
Saving model...
Model saved
Train —— Step: 600 | Loss: 0.2051265835762024 | Acc: 0.95238 | F1_Score: 0.95082
Evaluation —— Loss: 0.3986800014972687 | Precision: 0.81871 | Recall: 0.94146
Saving model...
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Model saved


当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 700 | Loss: 0.16066674888134003 | Acc: 0.95238 | F1_Score: 0.95082
Evaluation —— Loss: 0.3450799882411957 | Precision: 0.91648 | Recall: 0.78995
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.4091871976852417 | Acc: 0.88889 | F1_Score: 0.88525
Evaluation —— Loss: 0.36278998851776123 | Precision: 0.87406 | Recall: 0.87302
Saving model...
Model saved
Train —— Step: 900 | Loss: 0.209562748670578 | Acc: 0.88889 | F1_Score: 0.89231
Evaluation —— Loss: 0.48730000853538513 | Precision: 0.78408 | Recall: 0.95095
Saving model...
Model saved


当前正处于第4次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1000 | Loss: 0.07596436142921448 | Acc: 0.98413 | F1_Score: 0.98361
Evaluation —— Loss: 0.35756999254226685 | Precision: 0.86394 | Recall: 0.87658
Saving model...
Model saved
Train —— Step: 1100 | Loss: 0.0929764062166214 | Acc: 0.98413 | F1_Score: 0.98361
Evaluation —— Loss: 0.3626900017261505 | Precision: 0.8766 | Recall: 0.86828
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.09591948986053467 | Acc: 0.98413 | F1_Score: 0.98361
Evaluation —— Loss: 0.45041000843048096 | Precision: 0.90166 | Recall: 0.83782
Saving model...
Model saved


当前正处于第5次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1300 | Loss: 0.09038379788398743 | Acc: 0.96825 | F1_Score: 0.96774
Evaluation —— Loss: 0.3905400037765503 | Precision: 0.8663 | Recall: 0.87658
Saving model...
Model saved
Train —— Step: 1400 | Loss: 0.11743038892745972 | Acc: 0.96825 | F1_Score: 0.96875
Evaluation —— Loss: 0.6920199990272522 | Precision: 0.74977 | Recall: 0.96242
Saving model...
Model saved
Train —— Step: 1500 | Loss: 0.11857536435127258 | Acc: 0.98413 | F1_Score: 0.98413
Evaluation —— Loss: 0.4328399896621704 | Precision: 0.85862 | Recall: 0.88884
Saving model...
Model saved

CPU times: user 1min 20s, sys: 18.1 s, total: 1min 38s
Wall time: 1min 51s


## 3.2 使用Yelps数据集

In [9]:
def main():
    """Train and evaluate DPCNN on the Yelps data set.

    Same flow as the IMDB run, with the summary/checkpoint directories pointed
    at the Yelps folders and evaluation/checkpointing every 400 steps. This
    definition replaces any earlier ``main`` in the kernel. The session is
    closed when training finishes or raises, so its resources are not left
    held by the kernel.
    """
    path = "../data/yelps/yelps.csv"
    config = Config(path)
    config['summary_dir'] = "../model/DPCNN/yelps/summary"
    config['checkpoint_dir'] = "../model/DPCNN/yelps/checkpoint"
    config['evaluateEvery'] = 400
    config['checkpointEvery'] = 400
    create_dirs([config["summary_dir"], config['checkpoint_dir']])

    data = Dataset(config)

    # Generate the training data; the first argument is the folder that holds
    # the word-embedding file.
    data.dataGen("../data/yelps/", prefix="yelps")

    train_X, train_y, eval_X, eval_y = data.trainReviews, data.trainLabels, data.evalReviews, data.evalLabels
    wordEmbedding = data.wordEmbedding

    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]

    tf.reset_default_graph()
    # Session configuration.
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9

    sess = tf.Session(config = session_conf)
    try:
        # Build the model graph.
        model = DPCNN(config, wordEmbedding)

        logger = Logger(sess, config)

        trainer = Trainer(sess, model, pack_data, config, logger)

        trainer.train_all()
    finally:
        # Release the session even if graph construction or training fails.
        sess.close()

In [10]:
%%time
# Run the Yelps `main()` defined in the preceding cell (it shadows the IMDB
# one); `%%time` reports the CPU and wall time of the whole cell.
main()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 400 | Loss: 0.33918890357017517 | Acc: 0.87302 | F1_Score: 0.90909
Evaluation —— Loss: 0.27199000120162964 | Precision: 0.89403 | Recall: 0.94893
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.12130420655012131 | Acc: 0.95238 | F1_Score: 0.96386
Evaluation —— Loss: 0.2712399959564209 | Precision: 0.93009 | Recall: 0.8984
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.19015341997146606 | Acc: 0.92063 | F1_Score: 0.94118
Evaluation —— Loss: 0.2515299916267395 | Precision: 0.92287 | Recall: 0.91917
Saving model...
Model saved


当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 1600 | Loss: 0.40384572744369507 | Acc: 0.96825 | F1_Score: 0.97674
Evaluation —— Loss: 0.2439900040626526 | Precision: 0.92951 | Recall: 0.91842
Saving model...
Model saved
Train —— Step: 2000 | Loss: 0.13859966397285461 | Acc: 0.90476 | F1_Score: 0.92857
Evaluation —— Loss: 0.2459000051021576 | Precision: 0.89305 | Recall: 0.96261
Saving model...
Model saved
Train —— Step: 2400 | Loss: 0.11533189564943314 | Acc: 0.95238 | F1_Score: 0.96386
Evaluation —— Loss: 0.25227001309394836 | Precision: 0.93108 | Recall: 0.91026
Saving model...
Model saved


当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 2800 | Loss: 0.2902209758758545 | Acc: 0.90476 | F1_Score: 0.93023
Evaluation —— Loss: 0.24240000545978546 | Precision: 0.92781 | Recall: 0.92136
Saving model...
Model saved
Train —— Step: 3200 | Loss: 0.11075884103775024 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.25328999757766724 | Precision: 0.91905 | Recall: 0.93564
Saving model...
Model saved
Train —— Step: 3600 | Loss: 0.13583506643772125 | Acc: 0.93651 | F1_Score: 0.95238
Evaluation —— Loss: 0.25349000096321106 | Precision: 0.92595 | Recall: 0.92756
Saving model...
Model saved


当前正处于第4次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 4000 | Loss: 0.17429818212985992 | Acc: 0.95238 | F1_Score: 0.96552
Evaluation —— Loss: 0.2647300064563751 | Precision: 0.9153 | Recall: 0.9404
Saving model...
Model saved
Train —— Step: 4400 | Loss: 0.05103440582752228 | Acc: 0.98413 | F1_Score: 0.98795
Evaluation —— Loss: 0.2655700147151947 | Precision: 0.91883 | Recall: 0.93458
Saving model...
Model saved
Train —— Step: 4800 | Loss: 0.10332784056663513 | Acc: 0.95238 | F1_Score: 0.96552
Evaluation —— Loss: 0.2702000141143799 | Precision: 0.91969 | Recall: 0.92997
Saving model...
Model saved


当前正处于第5次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train —— Step: 5200 | Loss: 0.10885898768901825 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.2759700119495392 | Precision: 0.91292 | Recall: 0.94161
Saving model...
Model saved
Train —— Step: 5600 | Loss: 0.03490706905722618 | Acc: 1.0 | F1_Score: 1.0
Evaluation —— Loss: 0.2803899943828583 | Precision: 0.91811 | Recall: 0.93247
Saving model...
Model saved
Train —— Step: 6000 | Loss: 0.11550605297088623 | Acc: 0.93651 | F1_Score: 0.95349
Evaluation —— Loss: 0.2799200117588043 | Precision: 0.9161 | Recall: 0.93534
Saving model...
Model saved

CPU times: user 7min 50s, sys: 1min 55s, total: 9min 45s
Wall time: 11min 36s
