In [1]:
import os 
import pandas as pd 
import time 
import datetime 
import logging
import json 
import random
import math

import warnings
warnings.filterwarnings("ignore")

from tqdm.autonotebook import tqdm
from collections import Counter
import gensim
import numpy as np 
import tensorflow as tf 


# 定义参数配置类

In [16]:
class Config(dict):
    def __init__(self, path=None):
        super().__init__()
        ## 定义训练参数
        self['num_epochs'] = 5 
        self['evaluateEvery'] = 100 
        self['checkpointEvery'] = 100 
        self['learningRate'] = 0.001 
        
        ## 定义模型参数
        self['embeddingSize'] = 200 
        self['numFilters'] = 128 
        self['filterSizes'] = [2, 3, 4, 5]
        self['dropoutProb'] = 0.5 
        self['l2RegLambda'] = 0.0 
        
        ## 定义基础参数
        self['sequenceLength'] = 200 
        self['batch_size'] = 64 
        self['dataSource'] = path
        self['stopWordSource'] = "../data/english"
        self['numClasses'] = 1  
        self['train_size'] = 0.8   # 训练集和测试集比例
        
        ## 保存模型参数
        self['checkpoint_dir'] = "../model/textCNN/checkpoint"
        self['summary_dir'] = "../model/textCNN/summary"
        self['max_to_keep'] = 5

In [3]:
# 定义数据预处理类
class Dataset(object):
    def __init__(self, config):
        self.config = config
        self._dataSource = config['dataSource']
        self._stopWordSource = config['stopWordSource']
        
        self._sequenceLength = config['sequenceLength'] # 设置序列的输入藏毒
        self._embeddingSize = config['embeddingSize']
        self._batchSize = config['batch_size']
        self._trainRate = config['train_size']
        
        self._stopWordDict = {}
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        
        self.wordEmbedding = None
        self.labelList = []
        
    def _readData(self, filePath):
        '''
        从csv文件中读取数据集
        '''
        df = pd.read_csv(filePath)
        if self.config['numClasses'] == 1:          
            if "sentiment" in df.columns:
                labels = df["sentiment"].tolist()
            if "emotion" in df.columns:
                labels = df["emotion"].tolist()
        
        elif self.config['numClasses'] > 1: 
            labels = df["rate"].tolist()
        
        review = df['review'].tolist()
        reviews = [line.strip().split() for line in review]
        
        return reviews, labels
    
    def _laeblToIndex(self, labels, label2idx):
        '''
        将标签转换为索引表示
        '''
        labelIds = [label2idx[label] for label in labels]
        return labelIds
    
    def _wordToIndex(self, reviews, word2idx):
        '''
        将词转换为索引表示
        '''
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds
    
    def _genTrainEvalData(self, x, y, word2idx, rate):
        '''
        生成训练集和验证集
        '''
        reviews = []
        for review in x: 
            if len(review) >= self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [word2idx["PAD"]] * (self._sequenceLength - len(review)))
        
        trainIndex = int(len(x) * rate)
        
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")
        
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")
        
        return trainReviews, trainLabels, evalReviews, evalLabels
    
    def _genVocabulary(self, reviews, labels, path, prefix=""):
        '''
        生成向量和词汇-索引映射字典
        '''
        allWords = [word for review in reviews for word in review]
        # 去掉停用词
        subWords = [word for word in allWords if word not in self.stopWordDict]
        wordCount = Counter(subWords)  # 统计各个词的词频
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        
        # 去除低频词
        words = [item[0] for item in sortWordCount if item[1] >= 5]
        
        
        vocab, wordEmbedding = self._getWordEmbedding(words, path)
        self.wordEmbedding = wordEmbedding
        
        #print(len(vocab), vocab[10])
        word2idx = dict(zip(vocab, range(len(vocab))))
        
        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))
        
        # 将词汇表-索引映射表保存为json数据，之后inference时直接加载处理数据
        save_path = "../data/wordJson"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(os.path.join(save_path, prefix+"word2idx.json"), "w", encoding="utf8") as f: 
            json.dump(word2idx, f)
        
        with open(os.path.join(save_path, prefix+"label2idx.json"), "w", encoding="utf8") as f: 
            json.dump(label2idx, f)
        
        return word2idx, label2idx
    
    def _getWordEmbedding(self, words, path):
        '''
        按照数据集中的单词去除训练好的词向量
        '''
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(path, "wordvector.bin"),
                                                                 binary=True)
        
        vocab = []
        wordEmbedding = []
        
        # 添加"pad"和"UNK"
        vocab.append("PAD")
        vocab.append("UNK")
        
        wordEmbedding.append(np.zeros(self._embeddingSize))  # 表示对"PAD"用全0向量表示
        wordEmbedding.append(np.random.randn(self._embeddingSize))  # 对"UNK"用随机向量表示
        
        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                pass
        
        return vocab, np.array(wordEmbedding)
    
    def _readStopword(self, stopWordPath):
        '''
        读取停用词
        '''
        with open(stopWordPath, "r") as f: 
            stopWords = f.read()
            stopWordList = stopWords.splitlines()
            # 转换成字典的形式，使用hash查找效率更高
            self.stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))
            
    def dataGen(self, path, prefix=""):
        '''
        初始化训练集和验证集 
        prefix: 表示生成单词到索引的文件的前缀
        path: 表示wordvector文件的位置
        '''
        # 初始化停用词
        self._readStopword(self._stopWordSource)
        # 初始化数据集
        reviews, labels = self._readData(self._dataSource)
        # 初始化词汇-索引映射表和词向量矩阵
        word2idx, label2idx = self._genVocabulary(reviews, labels, path, prefix)
        # 将标签和句子数值化
        labelIds = self._laeblToIndex(labels, label2idx)
        reviewsIds = self._wordToIndex(reviews, word2idx)
        
        # 初始化训练集和测试集
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewsIds, 
                                                                                  labelIds,
                                                                                  word2idx, 
                                                                                  self._trainRate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels

# 构建模型

## 定义数据的迭代类

In [4]:

class DataGenerator:
    def __init__(self, x, y):
        self.x = x 
        self.y = y
        self.length = len(y)

    def next_batch(self, batch_size):
        '''
        生成每一个batch的数据集
        '''
        idx = np.random.choice(self.length, batch_size)
        yield self.x[idx], self.y[idx]
        
    def iter_all(self, batch_size):
        '''
        按照batch迭代所有数据 
        '''        
        numBatches = self.length // batch_size
        for i in range(numBatches):
            start = i * batch_size
            end = start + batch_size
            batchX = np.array(self.x[start:end], dtype='int64')
            batchY = np.array(self.y[start:end], dtype="float32")
            yield batchX, batchY

## 定义每个模型都要继承的基类

In [5]:
class BaseModel:
    def __init__(self, config):
        self.config = config 
        self.init_global_step()
        self.init_cur_epoch()
        
    def save(self, sess):
        print("Saving model...")
        self.saver.save(sess, self.config['checkpoint_dir']+"/my_model", self.global_step_tensor)
        print("Model saved")
        
    def load(self, sess):
        ## 获取最近的chekpoint
        latest_checkpoint = tf.train.latest_checkpoint(self.config['checkpoint_dir'])
        if latest_checkpoint:
            print("Loading model checkpoint {} ... \n".format(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)
            print("Model loaded")
    
    # 表示每执行一个epoch，对应的变量+1
    def init_cur_epoch(self):
        with tf.variable_scope("cur_epoch"):
            self.cur_epoch_tensor = tf.Variable(0, trainable=False, name="cur_epoch")
            self.increment_cur_epoch_tensor = tf.assign(self.cur_epoch_tensor, self.cur_epoch_tensor+1)
    
    def init_global_step(self):
        # 表示当前模型一共迭代的step
        ## 每次执行都需要放到trainer里面
        with tf.variable_scope("global_step"):
            self.global_step_tensor = tf.Variable(0, trainable=False, name="global_step")
    
    def init_saver(self):
        # 通常只需要在子类中拷贝下面的语句即可
        # self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])
        raise NotImplementedError
        
    def build_model(self):
        raise NotImplementedError

In [6]:
class BaseTrain:
    def __init__(self, sess, model, data, config, logger):
        self.model = model 
        self.logger = logger
        self.config = config
        self.data = data 
        self.sess = sess
        self.init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.sess.run(self.init)
        
    def train_all(self):
        for cur_epoch in range(self.model.cur_epoch_tensor.eval(self.sess), self.config['num_epochs']+1, 1):
            print(f"\n当前正处于第{cur_epoch+1}次迭代")
            self.train_epoch()
            ## 将对应的epoch+1
            self.sess.run(self.model.increment_cur_epoch_tensor)
    
    def train_epoch(self):
        '''
        实现一个epoch训练的代码
        - 在config规定的迭代次数上迭代，调用train_step
        - 添加summary
        '''
        raise NotImplementedError
    
    def train_step(self):
        '''
        实现单步训练的逻辑代码
        '''
        raise NotImplementedError

## 定义记录训练过程中一些信息的类

In [7]:
class Logger:
    def __init__(self, sess, config):
        self.sess = sess 
        self.config = config 
        self.summary_placeholders = {}
        self.summary_ops = {}
        self.train_sumary_writer = tf.summary.FileWriter(os.path.join(self.config['summary_dir'], "train"),
                                                        self.sess.graph)
        self.test_summary_writer = tf.summary.FileWriter(os.path.join(self.config['summary_dir'], "test"))
        
    # 保存scalars和images
    def summarize(self, step, summarizer="train", scope="", summaries_dict=None):
        '''
        step: 表示summary的时间步
        summarizer: 表示使用 train 还是 test
        scope: 表示变量空间名 
        summaries_dict: 表示要summaries的值，格式是(tag, value)
        '''
        summary_writer = self.train_sumary_writer if summarizer == "train" else self.test_summary_writer
        with tf.variable_scope(scope):
            if summaries_dict is not None:
                summary_list = []
                for tag, value in summaries_dict.items():
                    if tag not in self.summary_ops:
                        if len(value.shape) <= 1:
                            self.summary_placeholders[tag] = tf.placeholder(tf.float32,shape=value.shape, name=tag)
                        else:
                            self.summary_placeholders[tag] = tf.placeholder("float32", 
                                                                            [None]+list(value.shape[1:]), 
                                                                           name=tag)                                     
                        if len(value.shape) <= 1:
                            ## 添加标量
                            self.summary_ops[tag] = tf.summary.scalar(tag, self.summary_placeholders[tag])
                        else:
                            ## 添加为图片
                            self.summary_ops[tag] = tf.summary.image(tag, self.summary_placeholders[tag])

                    summary_list.append(self.sess.run(self.summary_ops[tag], 
                                                      {self.summary_placeholders[tag]: value}))
                for summary in summary_list:
                    summary_writer.add_summary(summary, step)
                summary_writer.flush()

## 定义模型评估指标

In [8]:
class Metric(object):
    def __init__(self, pred_y, true_y, labels=None):
        self.pred_y = pred_y
        self.true_y = true_y
        self.labels = labels
    
    @classmethod
    def mean(cls, item: list) -> float:
        '''
        定义计算列表元素均值的函数
        '''
        res = sum(item) / len(item) if len(item) > 0 else 0
        return round(res, 5) 
    
    def accuracy(self):
        '''
        计算二类和多类的准确率
        '''
        p = self.pred_y
        t = self.true_y
        if isinstance(p[0], list):
            p = [item[0] for item in p]
        corr = 0 
        for i in range(len(p)):
            if p[i] == t[i]:
                corr += 1 
        acc = corr / len(p) if len(p) > 0 else 0 
        return round(acc, 5)

    def binary_precision(self, positive=1):
        '''
        二类精确率的计算 
        '''
        p = self.pred_y
        t = self.true_y
        if isinstance(p[0], list):
            p = [item[0] for item in p]
        corr = 0 
        pred_corr = 0 
        for i in range(len(p)):
            if p[i] == positive:
                pred_corr += 1 
                if p[i] == t[i]:
                    corr += 1 
        prec = corr / pred_corr if pred_corr > 0 else 0 
        return round(prec, 5)

    def binary_recall(self, positive=1):
        '''
        二类召回率的计算 
        '''
        p = self.pred_y
        t = self.true_y
        if isinstance(p[0], list):
            p = [item[0] for item in p]
        corr = 0 
        true_corr = 0
        for i in range(len(p)):
            if t[i] == positive:
                true_corr += 1 
                if p[i] == t[i]:
                    corr += 1 
        rec = corr / true_corr if true_corr > 0 else 0 
        return round(rec, 5)

    def binary_f_beta(self, beta=1.0, positive=1):
        '''
        二类的f_beta的计算
        '''
        precision = self.binary_precision(positive)
        recall = self.binary_recall(positive)
        try:
            f_b = (1+ beta*beta) * precision * recall / (beta*beta*precision + recall)
        except:
            f_b = 0 
        return round(f_b, 5)
    
    def multi_precision(self):
        '''
        多类精确率的计算
        '''
        precisions = [self.binary_precision(label) for label in self.labels]
        prec = mean(precisions)
        return round(prec, 5)
    
    def multi_recall(self):
        '''
        多类召回率的计算 
        '''
        recalls = [self.binary_recall(label) for label in self.labels]
        rec = mean(recalls)
        return round(rec, 5)
    
    def multi_f_beta(self, beta=1.0):
        '''
        多类f_beta的计算
        '''
        f_betas = [self.binary_f_beta(beta, label) for label in labels]
        f_beta = mean(f_betas)
        return round(f_beta, 5)
    
    def get_binary_metrics(self, f_beta=1.0):
        '''
        得到二类的性能指标 
        '''
        metrics = {"accuracy": self.accuracy(), "recall": self.binary_recall(),
                  "precision": self.binary_precision(), "f_beta": self.binary_f_beta(f_beta)}
        return metrics
    
    def get_multi_metrics(self, f_beta=1.0):
        '''
        得到多类的性能指标 
        '''
        metrics = {"accuracy": self.accuracy(), "recall": self.multi_recall(), 
                  "precision": self.multi_precision(), "f_beta": self.multi_f_beta(f_beta)}
        return metrics

## 定义模型类

In [9]:
class TextCNN(BaseModel):
    def __init__(self, config, wordEmbedding):
        super(TextCNN, self).__init__(config)
        self.wordEmbedding = wordEmbedding
        self.build_model()
        self.init_saver()
        
    def build_model(self):
        # 定义模型的输入
        self.inputX = tf.placeholder(tf.int32, [None, self.config['sequenceLength']], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        
        self.dropoutProb = tf.placeholder(tf.float32, name="dropoutProb")
        
        
        # 定义L2损失值
        l2Loss = tf.constant(0.0)
        
        # 词嵌入层
        with tf.name_scope("embedding"):
            ## 利用预训练的词向量，设置trainable=True，表示不冻结可以训练
            self.W = tf.Variable(tf.cast(self.wordEmbedding, dtype=tf.float32, name="word2Vec"), 
                                 name="W", trainable=False)
            ## 利用词嵌入矩阵将输入的数据中的词转换成词向量，输出为 [batch, seq_len, embed_size]
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)
            ## 卷积的输入形状是[batch, heigth, width, channel]，所以需要扩维
            self.embeddedWordsExpand = tf.expand_dims(self.embeddedWords, -1)
            
        # 创建卷积层和池化层
        pooledOutputs = []
        ## 根据自己定义的不同的filter_size，将输出进行融合
        for i, filter_size, in enumerate(self.config['filterSizes']):
            ## 卷积层，卷积核尺寸为 filter_size * embeded_size
            filter_shape = [filter_size, self.config['embeddingSize'], 1, self.config['numFilters']]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[self.config['numFilters']]), name="b")
            conv = tf.nn.conv2d(self.embeddedWordsExpand, W, strides=[1,1,1,1],
                               padding="VALID", name="conv")
            ## 利用relu进行非线性映射
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            
            ## 池化层，进行最大池化之后得到一个值
            ## 其中ksize的形状表示 [batch, height, width, channels]
            pooled = tf.nn.max_pool(h, ksize=[1, self.config['sequenceLength']-filter_size+1, 1, 1],
                                   strides=[1,1,1,1], padding="VALID", name="pool")
            pooledOutputs.append(pooled)
            
        # 得到CNN网络的输出长度
        numFiltersTotal = self.config['numFilters'] * len(self.config['filterSizes'])
        ## 池化之后维度为[batch, 1, 1, channels]，按照最后一维进行concat
        self.hPool = tf.concat(pooledOutputs, 3)
        ## 摊平成二维数据输入到全连接层
        self.hPoolFlat = tf.reshape(self.hPool, [-1, numFiltersTotal])
        
        # Dropout层
        with tf.name_scope("dropout"):
            self.hDrop = tf.nn.dropout(self.hPoolFlat, self.dropoutProb)
        
        # 全连接层
        with tf.name_scope("output"):

            self.logits = tf.layers.dense(self.hDrop, self.config['numClasses'], name="dense",
                                         kernel_initializer=tf.truncated_normal_initializer(stddev=0.1, seed=2019),
                                         bias_initializer=tf.constant_initializer(0.1))
            ## 获取该层的权重
            with tf.variable_scope("dense", reuse=True):
                outputW = tf.get_variable("kernel")
            l2Loss += tf.nn.l2_loss(outputW)
            
            if self.config['numClasses'] == 1: 
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.int32, name="predictions")
            elif self.config['numClasses'] > 1: 
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")
            
        # 计算二元交叉熵损失
        with tf.name_scope("loss"):
            if self.config['numClasses'] == 1: 
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                                labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                              dtype=tf.float32))
            elif self.config['numClasses'] > 1: 
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                       labels=self.inputY)
            
            self.loss = tf.reduce_mean(losses) + self.config["l2RegLambda"] * l2Loss           
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self.train_step = tf.train.AdamOptimizer(
                    self.config['learningRate']).minimize(self.loss, global_step=self.global_step_tensor)
            

    def init_saver(self):
        '''
        初始化用于保存模型的对象
        '''
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

## 定义训练的类

In [10]:
class Trainer(BaseTrain):
    def __init__(self, sess, model, data, config, logger, labels):
        '''
        这里的data要求是元组的形式，data[0]表示train对象，data[1]表示eval对象
        '''
        super(Trainer, self).__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]
        self.labels = labels
        #print("初始化结束...")
    
    def train_epoch(self):
        num_iter_per_epoch = self.train.length // self.config["batch_size"]

        for _ in tqdm(range(num_iter_per_epoch)):
            loss, metrics, step = self.train_step()
            train_acc = metrics['accuracy']
            train_f_score = metrics['f_beta']
            
            ## 将训练过程中的损失写入
            summaries_dict = {"loss": loss, 
                              "acc": np.array(train_acc), 
                              "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary", summaries_dict=summaries_dict)
            
            if step % self.config['evaluateEvery'] == 0: 
                print("Train —— Step: {} | Loss: {} | Acc: {} | F1_Score: {}".format(
                        step, loss, train_acc, train_f_score))
                ## 对测试集进行评估
                print("\nEvaluation: \n")
                eval_losses = []
                eval_precs = []
                eval_recalls = []
                for batchEval in self.eval.iter_all(self.config["batch_size"]):
                    loss, metrics = self.eval_step(batchEval[0], batchEval[1])
                    eval_losses.append(loss)
                    eval_precs.append(metrics['precision'])
                    eval_recalls.append(metrics["recall"])
                loss_mean = np.round(np.mean(eval_losses), 5)
                prec_mean = np.round(np.mean(eval_precs), 5)
                recall_mean = np.round(np.mean(eval_recalls),5)
                time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S %p")
                
                print("{} | Loss: {} | Precision: {} | Recall: {}".format(time_str,
                                                                     loss_mean,
                                                                     prec_mean, recall_mean))
                summaries_dict = {"loss": np.array(loss_mean), 
                                  "precision": np.array(prec_mean), 
                                  "recall": np.array(recall_mean)}
                self.logger.summarize(step, summarizer="test", scope="test_summary", summaries_dict=summaries_dict)
            
            if step % self.config["checkpointEvery"] == 0: 
                self.model.save(self.sess)
        
    
    def train_step(self):
        batch_x, batch_y = next(self.train.next_batch(self.config["batch_size"]))
        feed_dict = {self.model.inputX: batch_x, self.model.inputY: batch_y,
                    self.model.dropoutProb: self.config['dropoutProb']}
        
        _, loss, predicitons, step = self.sess.run([self.model.train_step, self.model.loss, 
                                                   self.model.predictions, self.model.global_step_tensor],
                                                  feed_dict=feed_dict)
        
        getMetric = Metric(predicitons, batch_y, labels=self.labels)
        if self.config['numClasses'] == 1: 
            metrics = getMetric.get_binary_metrics()
        elif self.config['numClasses'] > 1: 
            metrics = getMetric.get_multi_metrics()
        
        
        return loss, metrics, step
    
    def eval_step(self, batch_x, batch_y):
        '''
        使用验证集进行测试
        '''
        feed_dict = {self.model.inputX: batch_x, self.model.inputY: batch_y,
                     self.model.dropoutProb: 1.0}
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                          feed_dict=feed_dict)
        
        getMetric = Metric(predictions, batch_y, labels=self.labels)
        if self.config['numClasses'] == 1: 
            metrics = getMetric.get_binary_metrics()
        elif self.config['numClasses'] > 1: 
            metrics = getMetric.get_multi_metrics()
        
        return loss, metrics
    

In [11]:
def create_dirs(dirs):
    try:
        for dir_ in dirs: 
            if not os.path.exists(dir_):
                os.makedirs(dir_)
        return 0 
    except Exception as e: 
        print("Creating directories error: {}".format(e))
        exit(-1)

In [12]:
def main():
    # 实例化配置参数对象
    ## 指定训练数据的文件名
    path = "../data/imdb/labeldTrain.csv"
    config = Config(path)
    
    
    create_dirs([config["summary_dir"], config["checkpoint_dir"]])
    
    data = Dataset(config)
    data.dataGen("../data/imdb", prefix="imdb")
    
    train_X, train_y, eval_X, eval_y = data.trainReviews, data.trainLabels, data.evalReviews, data.evalLabels
    wordEmbedding, labels = data.wordEmbedding, data.labelList
    
    train_data = DataGenerator(train_X, train_y)
    eval_data = DataGenerator(eval_X, eval_y)
    pack_data = [train_data, eval_data]
    
    tf.reset_default_graph()
    ## 设置计算图的配置
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth=True
    session_conf.gpu_options.per_process_gpu_memory_fraction=0.9  # 配置GPU占用率
    
    sess = tf.Session(config=session_conf)
    
    ## 创建一个实例
    model = TextCNN(config, wordEmbedding)
    
    logger = Logger(sess, config)
    
    trainer = Trainer(sess, model, pack_data, config, logger, labels)
    
    
    trainer.train_all()

In [13]:
main()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.

当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 100 | Loss: 0.9355392456054688 | Acc: 0.65625 | F1_Score: 0.60714

Evaluation: 

2019-08-01 22:30:35 PM | Loss: 0.5347999930381775 | Precision: 0.88887 | Recall: 0.50664
Saving model...
Model saved
Train —— Step: 200 | Loss: 0.5109789371490479 | Acc: 0.71875 | F1_Score: 0.76316

Evaluation: 

2019-08-01 22:30:39 PM | Loss: 0.3994700014591217 | Precision: 0.7847 | Recall: 0.89907
Saving model...
Model saved
Train —— Step: 300 | Loss: 0.3632057309150696 | Acc: 0.82812 | F1_Score: 0.81356

Evaluation: 

2019-08-01 22:30:43 PM | Loss: 0.36473000049591064 | Precision: 0.87964 | Recall: 0.79756
Saving model...
Model saved


当前正处于第2次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 400 | Loss: 0.3466982841491699 | Acc: 0.85938 | F1_Score: 0.85714

Evaluation: 

2019-08-01 22:30:46 PM | Loss: 0.3438900113105774 | Precision: 0.88986 | Recall: 0.81893
Saving model...
Model saved
Train —— Step: 500 | Loss: 0.459483802318573 | Acc: 0.76562 | F1_Score: 0.73684

Evaluation: 

2019-08-01 22:30:50 PM | Loss: 0.327129989862442 | Precision: 0.88673 | Recall: 0.84108
Saving model...
Model saved
Train —— Step: 600 | Loss: 0.27016669511795044 | Acc: 0.84375 | F1_Score: 0.83871

Evaluation: 

2019-08-01 22:30:53 PM | Loss: 0.31435999274253845 | Precision: 0.87966 | Recall: 0.86597
Saving model...
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Model saved


当前正处于第3次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 700 | Loss: 0.39052173495292664 | Acc: 0.84375 | F1_Score: 0.85294

Evaluation: 

2019-08-01 22:30:57 PM | Loss: 0.31248000264167786 | Precision: 0.84204 | Recall: 0.9132
Saving model...
Model saved
Train —— Step: 800 | Loss: 0.2825438380241394 | Acc: 0.90625 | F1_Score: 0.90909

Evaluation: 

2019-08-01 22:31:00 PM | Loss: 0.30691999197006226 | Precision: 0.88999 | Recall: 0.85598
Saving model...
Model saved
Train —— Step: 900 | Loss: 0.2377379834651947 | Acc: 0.90625 | F1_Score: 0.875

Evaluation: 

2019-08-01 22:31:04 PM | Loss: 0.31589001417160034 | Precision: 0.90855 | Recall: 0.82587
Saving model...
Model saved


当前正处于第4次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1000 | Loss: 0.17894205451011658 | Acc: 0.98438 | F1_Score: 0.98666

Evaluation: 

2019-08-01 22:31:07 PM | Loss: 0.2924500107765198 | Precision: 0.864 | Recall: 0.90074
Saving model...
Model saved
Train —— Step: 1100 | Loss: 0.16951054334640503 | Acc: 0.95312 | F1_Score: 0.95774

Evaluation: 

2019-08-01 22:31:10 PM | Loss: 0.2902800142765045 | Precision: 0.88512 | Recall: 0.8739
Saving model...
Model saved
Train —— Step: 1200 | Loss: 0.1722017526626587 | Acc: 0.9375 | F1_Score: 0.94118

Evaluation: 

2019-08-01 22:31:14 PM | Loss: 0.29203999042510986 | Precision: 0.85152 | Recall: 0.92189
Saving model...
Model saved


当前正处于第5次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1300 | Loss: 0.29393380880355835 | Acc: 0.85938 | F1_Score: 0.85246

Evaluation: 

2019-08-01 22:31:17 PM | Loss: 0.28633999824523926 | Precision: 0.86488 | Recall: 0.9103
Saving model...
Model saved
Train —— Step: 1400 | Loss: 0.172616109251976 | Acc: 0.9375 | F1_Score: 0.94118

Evaluation: 

2019-08-01 22:31:21 PM | Loss: 0.2852500081062317 | Precision: 0.89341 | Recall: 0.87313
Saving model...
Model saved
Train —— Step: 1500 | Loss: 0.1454441249370575 | Acc: 0.95312 | F1_Score: 0.95082

Evaluation: 

2019-08-01 22:31:25 PM | Loss: 0.2804200053215027 | Precision: 0.87405 | Recall: 0.89898
Saving model...
Model saved


当前正处于第6次迭代


HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Train —— Step: 1600 | Loss: 0.10753684490919113 | Acc: 0.96875 | F1_Score: 0.96875

Evaluation: 

2019-08-01 22:31:29 PM | Loss: 0.28185999393463135 | Precision: 0.86388 | Recall: 0.90984
Saving model...
Model saved
Train —— Step: 1700 | Loss: 0.13584646582603455 | Acc: 0.96875 | F1_Score: 0.97059

Evaluation: 

2019-08-01 22:31:33 PM | Loss: 0.2865000069141388 | Precision: 0.89903 | Recall: 0.8614
Saving model...
Model saved
Train —— Step: 1800 | Loss: 0.20678932964801788 | Acc: 0.89062 | F1_Score: 0.88524

Evaluation: 

2019-08-01 22:31:37 PM | Loss: 0.30316999554634094 | Precision: 0.83768 | Recall: 0.93436
Saving model...
Model saved



- 不冻结word-embedding的情况下，验证集最优结果为 —— P: 0.89297, R: 0.88057 

- 冻结word-embedding的情况下，验证集最优结果为  —— P: 0.89903, R: 0.8614

In [18]:
def predict(x, config):
    with open("../data/wordJson/imdbword2idx.json", "r", encoding="utf-8") as f: 
        word2idx = json.load(f)
    with open("../data/wordJson/imdblabel2idx.json", "r", encoding="utf-8") as f: 
        label2idx = json.load(f)
        
    idx2label  = {value:key for key, value in label2idx.items()}
    
    xIds = [word2idx.get(item, word2idx["UNK"]) for item in x.split(" ")]
    if len(xIds) >= config["sequenceLength"]:
        xIds = xIds[:config["sequenceLength"]]
    else:
        xIds = xIds + [word2idx["PAD"]] * (config["sequenceLength"] - len(xIds))
    
    g = tf.Graph()
    
    with g.as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, 
                                     gpu_options=gpu_options)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            checkpoint_file = tf.train.latest_checkpoint("../model/textCNN/checkpoint/")
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            
            # 获取需要未给模型的参数
            inputX = g.get_operation_by_name("inputX").outputs[0]
            dropoutProb = g.get_operation_by_name("dropoutProb").outputs[0]
            
            # 获取输出的结果
            predictions = g.get_tensor_by_name("output/predictions:0")
            pred = sess.run(predictions, feed_dict={inputX: [xIds], dropoutProb: 1.0})[0]
            
            print(pred)

In [19]:
x = "this is bad !"

config = Config()

predict(x, config)

INFO:tensorflow:Restoring parameters from ../model/textCNN/checkpoint/my_model-1800
[0]
