In [1]:
import pandas as pd 
import numpy as np 

import json 

import warnings

warnings.filterwarnings("ignore")

In [2]:
from utils import *  
from collections import Counter
from tqdm.autonotebook import tqdm

In [3]:
import tensorflow as tf 

# 对数据集进行处理

In [4]:
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk import tokenize
import os 
import re 

In [5]:
# 添加需要去除的标点符号集，问号和感叹号除外
puncts = [',', '.', '"', ':', ')', '(', '-', '?', '!', '|', ';', "'", '$', '&', '/', 
          '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
          '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
          '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', 
          '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥',
          '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', 
          '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', 
          '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# 定义一些常见的缩写
## 这里"\g<1>"表示前面匹配模式group(1)的值，group(0)表示全部，group(1)表示第一个括号的匹配值
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'),
                        (r'isn\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), 
                        (r'(\w+)n\'t', '\g<1> not'),(r'(\w+)\'ve', '\g<1> have'), 
                        (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), 
                        (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'),
                        (r'dont', 'do not'), (r'wont', 'will not') ]

def clean_text(text):
    stoplists = stopwords.words("english")
    wnl = WordNetLemmatizer()
    
    # 去除对情感分类没有用的数字
    text = re.sub("[0-9]+", "", text)
    # 对重复出现的标点进行天魂
    text = re.sub(r"(\!)\1+", "multiExclamation", text)
    text = re.sub(r"(\?)\1+", "multiQuestion", text)
    text = re.sub(r"(\.)\1+", "multiStop", text)
    
    # 对缩写进行替换
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    
    # 在标点前面加上空格
    for punct in puncts:
        text = text.replace(punct, f" {punct} ")
    
    #print(text)
    # 对文本进行分词
    text_split = tokenize.word_tokenize(text)
    text = [word for word in text_split if word not in stoplists]
    text = [wnl.lemmatize(word) for word in text]
    
    return " ".join(text)

In [6]:
## 获取用于训练的数据集
def preprocess(path="../data/yelps/review.csv"):
    data = pd.read_csv(path)
    data = data[["stars", "text"]]
    data["emotion"] = data["stars"].apply(lambda x: int(x>3.0))
    data = data[["text", "emotion"]]
    data.rename(columns={"text": "review"}, inplace=True)
    # 取前10w的数据作为训练集
    data = data[:100000]
    
    print("正在处理文本...")
    data["review"] = data["review"].apply(clean_text)
    
    print("文本处理结束!")
    data.to_csv(os.path.join(os.path.dirname(path), "yelps.csv"), index=False)

In [37]:
preprocess()

正在处理文本...
文本处理结束!


In [29]:
data = pd.read_csv("../data/yelps/yelps.csv")

In [30]:
data.head()

Unnamed: 0,review,emotion
0,Total bill horrible service ? Over $ Gs . Thes...,0
1,I * adore * Travis Hard Rock new Kelly Cardena...,1
2,"I say office really together , organized frien...",1
3,"Went lunch . Steak sandwich delicious , Caesar...",1
4,Today second three session I paid . Although f...,0


# 定义文本数据处理类

In [5]:
from nltk import sent_tokenize, word_tokenize

In [6]:
import gensim

In [9]:
class Dataset(object):
    def __init__(self, config):
        self.config = config 
        self._dataSource = config["dataSource"]  # 数据源
        self._stopWordSource = config["stopWordSource"]  # 停止词目录
        self._sentenceLength = config["sentenceLength"]   # 表示语句的长度，每个语句多少个单词
        self._docLength = config["docLength"]    # 表示文章的长度，每篇文章多少个句子
        
        self._embeddingSize = config["embeddingSize"]
        self._batchSize = config["batch_size"]
        self._trainRate = config["train_size"]
        
        self._stopWordDict = {}
        self.trainReviews = []   # 存储训练集
        self.trainLabels = []    # 存储训练集标签
        self.evalReviews = []    # 存储验证集
        self.evalLabels = []     # 存储验证集标签
        
        self.wordEmbedding = None  # 保存embedding的对照表
        self.labelList = []      # 保存有多少不同的标签值
        
    def _readData(self, filePath):
        '''
        从csv文件中读取数据集
        '''
        df = pd.read_csv(filePath)
        if self.config['numClasses'] == 1:
            if "sentiment" in df.columns:
                labels = df["sentiment"].tolist()
            if "emotion" in df.columns:
                labels = df["emotion"].tolist()
        elif self.config['numClasses'] > 1:
            labels = df["rate"].tolist()
        
        ## 获取所有的评论文本
        data_x = []
        reviews = df["review"].tolist()
        ## 对于每一个评论，都按照文章处理
        for text in reviews:
            doc = []
            ## 按照语句进行划分得到不同的语句
            sents = sent_tokenize(text)
            for i, sent in enumerate(sents):
                ## 对每个句子进行分词，并保存
                doc.append(word_tokenize(sent))
            ## 将每篇文章的分词结果保存
            data_x.append(doc)
            
        return data_x, labels
    
    def _readStopWord(self, stopWordPath):
        '''
        读取停用词
        '''
        with open(stopWordPath, "r") as f: 
            stopWords = f.read()
            stopWordList = stopWords.splitlines()
            ## 转换成字典的形式，使用hash查找效率更高
            self._stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))
    
        
    def _getWordEmbedding(self, words, path):
        '''
        按照数据集中的单词取出训练好的词向量 
        '''
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(path, "wordvector.bin"),
                                                                 binary=True)
        vocab = []
        wordEmbedding = []
        
        # 添加"pad"和"unk"
        vocab.append("PAD")
        vocab.append("UNK")
        
        wordEmbedding.append(np.zeros(self._embeddingSize)) # 用全0向量表示"PAD"
        wordEmbedding.append(np.random.randn(self._embeddingSize))  # 用随机向量表示"UNK"
        
        for word in words:
            if word != "PAD" and word != "UNK":
                try:
                    vector = wordVec.wv[word]
                    vocab.append(word)
                    wordEmbedding.append(vector)
                except:
                    pass

        return vocab, np.array(wordEmbedding)
    
    def _genVocabulary(self, reviews, labels, path, prefix=""):
        '''
        生成向量以及词汇-索引的字典
        '''
        save_path = "../data/wordJson/"
        target_word_dir = os.path.join(save_path, prefix+"_word2idx.json")
        target_label_dir = os.path.join(save_path, prefix+"_label2idx.json")
        target_freq_dir = os.path.join(save_path, prefix+"_wordfreq.json")
        
        ## 对于每一篇文章中，每一句话的每一个词
        allWords = [word for doc in reviews for sent in doc for word in sent]
        ## 去除停用词
        subWords = [word for word in allWords if word not in self._stopWordDict]
        ## 统计词频
        wordCount = Counter(subWords)
        ## 按照词频进行排序，去除低频词
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words, path)
        self.wordEmbedding = wordEmbedding            
        word2idx = dict(zip(vocab, range(len(vocab))))

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))

        ## 将词汇表-索引表保存为json数据
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(target_word_dir, "w", encoding="utf8") as f:
            json.dump(word2idx, f)

        with open(target_label_dir, "w", encoding="utf8") as f:
            json.dump(label2idx, f)
        ## 保存词频信息
        with open(target_freq_dir, "w", encoding="utf8") as f: 
            json.dump(wordCount, f)
                
        return word2idx, label2idx
    
    def _labelToIndex(self, labels, label2idx):
        '''
        将标签转换为索引表示
        '''
        try:
            labelIds = [label2idx[label] for label in labels]
        except:
            labelIds = [label2idx[str(label)] for label in labels]
        return labelIds
    
    def _wordToIndex(self, reviews, word2idx):
        '''
        将词转换为索引表示
        '''
        reviewIds = [[[word2idx.get(item, word2idx["UNK"]) for item in sent] for sent in doc] for doc in reviews]
        return reviewIds
    
    def _genTrainEvalData(self, x, y, word2idx, rate):
        '''
        生成训练集和验证集
        '''
        for i, text in enumerate(x):
            doc = []
            ## 让每个句子的单词数相同
            for sent in text:
                if len(sent) >= self._sentenceLength:
                    doc.append(sent[:self._sentenceLength])
                else:
                    doc.append(sent + [word2idx["PAD"]] * (self._sentenceLength - len(sent)))
            ## 让每篇文章的句子数相同
            if len(doc) >= self._docLength:
                doc = doc[:self._docLength]
            else:
                doc.extend([[word2idx["PAD"]] * self._sentenceLength] * (self._docLength - len(doc)))

            if i == 0:
                reviews = np.array(doc).reshape(1, self._docLength, self._sentenceLength)
            else:
                reviews = np.concatenate([reviews, np.array(doc).reshape(1, self._docLength, self._sentenceLength)],
                                        axis=0)
        
        
        trainIndex = int(len(x)*rate)
        y = np.array(y).reshape(-1, 1)
        
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.asarray(y[:trainIndex], dtype="float32")
        
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.asarray(y[trainIndex:], dtype="float32")
        
        return trainReviews, trainLabels, evalReviews, evalLabels
    
    def dataGen(self, path, prefix=""):
        '''
        path: 表示wordvector所在的文件夹
        prefix: 生成单词到索引的文件的前缀
        '''
        # 初始化停用词
        self._readStopWord(self._stopWordSource)
        # 初始化数据集
        reviews, labels = self._readData(self._dataSource)
        # 初始化词汇-索引映射表和词向量矩阵
        word2idx, label2idx = self._genVocabulary(reviews, labels, path, prefix)
        ## 将标签和句子数值化
        labelIds = self._labelToIndex(labels, label2idx)
        reviewsIds = self._wordToIndex(reviews, word2idx)

        # 初始化训练集和测试集
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewsIds,
                                                                                    labelIds,
                                                                                    word2idx,
                                                                                    self._trainRate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels

## 定义基本的配置类

In [188]:
class Config(dict):
    def __init__(self, path=None):
        super().__init__()
        ## 定义训练参数
        self['num_epochs'] = 5 
        self['evaluateEvery'] = 200 
        self['checkpointEvery'] = 200 
        ### 学习率衰减
        self['learningRate'] = 0.05
        self['decay_steps'] = 100    ## 学习率衰减的时间点
        self['decay_rate'] = 0.9     ## 学习率衰减的幅度
        self['grad_clip'] = 5.0  ## 梯度削减
        
        ## 定义模型参数
        self['embeddingSize'] = 200 
        self['hiddenSizes'] = 50  ## GRU隐层神经元数
        self['dropoutProb'] = 0.5 
        self['l2RegLambda'] = 0.001   ## 正则化系数不能过大
        
        ## 定义基础参数
        self['sentenceLength'] = 20   ## 表示每一句话最多的单词数
        self['docLength'] = 8    ## 表示每一篇文档最多的句子数
        self['batch_size'] = 64 
        self['dataSource'] = path  
        self['stopWordSource'] = "../data/english"
        self['numClasses'] = 1 
        self['train_size'] = 0.8  ## 训练集和测试集的比例
        self.threshold = 0.5 
        
        ## 保存模型的参数
        self['checkpoint_dir'] = "../model/HAN/yelps/checkpoint"
        self['summary_dir'] = "../model/HAN/yelps/summary"
        self['max_to_keep'] = 5 

# 定义模型类和训练类

In [119]:
def getSequenceLength(sequences):
    '''
    sequences: RNN的输入，形状是 [batch, max_time, embed_size]
    '''
    # 先取绝对值，保证embedding值全部大于等于0
    abs_sequences = tf.abs(sequences)
    ## 由于padding的向量是全0的，所以最大值也是0
    abs_max_seq = tf.reduce_max(abs_sequences, reduction_indices=2)
    max_seq_sign = tf.sign(abs_max_seq)
    
    # 求和就是真实长度
    real_len = tf.reduce_sum(max_seq_sign, reduction_indices=1)

    return tf.cast(real_len, tf.int32)

## 定义模型类

In [185]:
class HAN(BaseModel):
    def __init__(self, config, wordEmbedding):
        super().__init__(config)
        self.wordEmbedding = wordEmbedding
        self.build_model()
        self.init_saver()
        
        
    def build_model(self):
        # 输入层
        self.inputX = tf.placeholder(tf.int32, [None, self.config['docLength'], self.config['sentenceLength']],
                                    name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None, self.config['numClasses']], name="inputY")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        
        # embedding层
        with tf.name_scope("embedding"):
            self.embeddings = tf.Variable(self.wordEmbedding, dtype=tf.float32, 
                                          name="wordEmbedding", trainable=True)
            ## 结果 [batch, doc_len, sen_len, embed_size]
            self.x0 = tf.nn.embedding_lookup(self.embeddings, self.inputX)
        # 首先对句子级别的单词对应编码和Attention
        ## 输出[batch*doc_len, hidden_size*2]
        self.sen_vec = self.sen2vec(self.x0)
        ## 输出[batch, hidden_size*2]
        self.doc_vec = self.doc2vec(self.sen_vec)
        
        with tf.name_scope("output"):
            logits = tf.layers.dense(self.doc_vec, self.config['numClasses'], activation=None)
            self.predictions = tf.nn.sigmoid(logits)
        
        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.inputY, logits=logits)
            self.loss = tf.reduce_mean(losses)
            if self.config['l2RegLambda'] > 0: 
                l2_loss = tf.add_n([tf.nn.l2_loss(cand_var) 
                                    for cand_var in tf.trainable_variables() 
                                    if "bia" not in cand_var.name and "Embedding" not in cand_var.name])
                self.loss += self.config['l2RegLambda'] * l2_loss

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        
        with tf.control_dependencies(update_ops):   
            learning_rate = tf.train.exponential_decay(self.config['learningRate'], self.global_step_tensor,
                                                      self.config['decay_steps'], 
                                                      self.config['decay_rate'], staircase=True)
          
            # 使用梯度削减防止梯度消失或者梯度爆炸
            optimizer = tf.train.AdamOptimizer(learning_rate)
            
        
            grads_and_vars = optimizer.compute_gradients(self.loss)
            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (tf.clip_by_norm(grad, self.config['grad_clip']), var)
            
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step_tensor)
        
                      
    #################################################################################################
    
    def sen2vec(self, word_embeded):
        '''
        这里输入word_embeded是[batch, doc_len, sen_len, embed_size]
        首先进行句子级别的处理，这时不关注文章的区别，只关注不同的句子，所以将[batch*doc_len]当做长度　
        这样符合GRU对输入的要求，最终输出的结果将每个句子的所有单词attention到一个句向量中
        '''
        with tf.name_scope("sen2vec"):
            # 形状[batch_size*doc_len, sen_len, embed_size]
            word_embeded = tf.reshape(word_embeded, [-1, self.config['sentenceLength'], self.config['embeddingSize']])
            ## 输出 [batch*doc_len, sen_len, hidden_size*2]
            word_encoder = self.BidirectionalGRUEncoder(word_embeded, name="word_encoder")
            ## 输出 [batch*doc_len, hidden_size*2]
            sen_vec = self.AttentionLayer(word_encoder, name="word_attention")
            
            return sen_vec
    
    def doc2vec(self, sen_vec):
        '''
        和sen2vec的操作类似，最后融合成doc向量
        '''
        with tf.name_scope("doc2vec"):
            sen_vec = tf.reshape(sen_vec, [-1, self.config['docLength'], self.config['hiddenSizes']*2])
            # shape: [batch, sen_len, hidden_size*2]
      
            doc_encoder = self.BidirectionalGRUEncoder(sen_vec, name="doc_encoder")
            
            # shape: [batch, hidden_size*2]
            doc_vec = self.AttentionLayer(doc_encoder, name="doc_vec")
            return doc_vec
    
    
    def BidirectionalGRUEncoder(self, inputs, name):
        '''
        双向GRU编码，将一个句子的所有单词或者一个文档的所有句子进行编码得到一个2*hidden_size的输出向量
        inputs: [batch, max_time, embedding_size]
        outputs: [batch, max_time, 2*hidden_size]
        '''

        with tf.name_scope(name):
            fw_gru_cell = tf.nn.rnn_cell.GRUCell(num_units = self.config['hiddenSizes'])
            bw_gru_cell = tf.nn.rnn_cell.GRUCell(num_units = self.config['hiddenSizes'])
            fw_gru_cell = tf.nn.rnn_cell.DropoutWrapper(fw_gru_cell, output_keep_prob=self.config['dropoutProb'])
            bw_gru_cell = tf.nn.rnn_cell.DropoutWrapper(bw_gru_cell, output_keep_prob=self.config['dropoutProb'])
                    
            ## fw_outputs和bw_outputs的形状都是 [batch_size, max_time, hidden_size]
            (fw_outputs, bw_outputs), (fw_outputs_state, bw_outputs_state) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_gru_cell, cell_bw=bw_gru_cell, inputs=inputs,
                sequence_length=getSequenceLength(inputs), dtype=tf.float32, scope=f"BiGRU_{name}")
            ## 拼接之后的长度为 [batch_size, max_time, hidden_size*2]          
            outputs = tf.concat((fw_outputs, bw_outputs), 2)
            
            return outputs
    
    def AttentionLayer(self, inputs, name):
        '''
        inputs是GRU层的输出
        inputs: [batch, max_time, 2*hidden_size]
        '''
        with tf.name_scope(name):
            # context_weight是上下文的重要性向量，用于区分不同单词/句子/文档的重要程度
            ## 也就是query
            querys = tf.Variable(tf.truncated_normal([self.config['hiddenSizes']*2]), name="context_weight")
            
            # 使用单层MLP对GRU的输出进行编码
            ## 对应key
            ### [batch, max_time, 2*hidden_size]
            keys = tf.layers.dense(inputs, self.config['hiddenSizes']*2, activation=tf.nn.tanh)
            raw_weight = tf.reduce_sum(tf.multiply(keys, querys), axis=2, keepdims=True)
            ## shape: [batch, max_time, 1]
            alpha = tf.nn.softmax(raw_weight, dim=1)
            
            ## 得到结果 [batch, 2*hidden_size]
            atten_output = tf.reduce_sum(tf.multiply(inputs, alpha), axis=1)
            return atten_output
    
    def init_saver(self):
        self.saver = tf.train.Saver(max_to_keep=self.config['max_to_keep'])

## 定义训练类

In [74]:
class DataGenerator:
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.length = len(y)
        ## 计算不同类别的比例
        unique = Counter(self.y.ravel())
        self.ratio = [(key, value / self.length) for key, value in unique.items()]
        self.indices = []
        for key, _ in self.ratio:
            index = np.where(y.ravel() == key)
            self.indices.append(index)
        
        
    def next_batch(self, batch_size):
        '''
        生成每一个batch的数据集
        '''
        choose = np.array([])
        for i in range(len(self.indices)):
            idx = np.random.choice(self.indices[i][0], 
                                   max(1, min(len(self.indices[i][0]), int(batch_size*self.ratio[i][1]))))
            choose = np.append(choose, idx)
        choose = np.random.permutation(choose).astype("int64")
        yield self.x[choose], self.y[choose]

    def iter_all(self, batch_size):
        '''
        按照batch迭代所有数据
        '''
        numBatches = self.length // batch_size + 1
        for i in range(numBatches):
            start = i * batch_size
            end = min(start + batch_size, self.length)
            batchX = np.array(self.x[start:end], dtype='int64')
            batchY = np.array(self.y[start:end], dtype="float32")
            yield batchX, batchY


In [147]:
class Trainer(BaseTrain):
    def __init__(self, sess, model, data, config, logger):
        super(Trainer, self).__init__(sess, model, data, config, logger)
        self.train = data[0]
        self.eval = data[1]
    
    def train_epoch(self):
        num_iter_per_epoch = self.train.length // self.config["batch_size"]
        for _ in tqdm(range(num_iter_per_epoch)):
            ## 获取训练结果
            loss, metrics, step = self.train_step()
            train_acc = metrics['accuracy']
            train_f_score = metrics['f_score']
            
            ## 将训练过程的损失写入
            summaries_dict = {"loss": loss,
                             "acc": np.array(train_acc),
                             "f_score": np.array(train_f_score)}
            self.logger.summarize(step, summarizer="train", scope="train_summary",
                                 summaries_dict=summaries_dict)
            if step % self.config['evaluateEvery'] == 0: 
                print("Train ——　Step: {} | Loss: {} | Acc: {} | F1_Score: {}".format(
                    step, loss, train_acc, train_f_score))
                ## 对测试集进行评估
                eval_losses = []
                eval_predictions = []
                eval_true = []
                for batchEval in self.eval.iter_all(self.config["batch_size"]):
                    loss, predictions = self.eval_step(batchEval[0], batchEval[1])
                    eval_losses.append(loss)
                    eval_predictions.extend(predictions)
                    eval_true.extend(batchEval[-1])
                getMetric = Metric( np.array(eval_predictions), np.array(eval_true), self.config)
                metrics = getMetric.get_metrics()
                eval_prec = np.round(metrics['precision'], 5)
                eval_recall = np.round(metrics['recall'], 5)
                loss_mean = np.round(np.mean(eval_losses), 5)
                
                print("Evaluation —— Loss: {} | Precision: {} | Recall: {}".format(
                    loss_mean, eval_prec, eval_recall))
                summaries_dict = {"loss": np.array(loss_mean),
                                 "precision": np.array(eval_prec),
                                 "recall": np.array(eval_recall)}
                self.logger.summarize(step, summarizer="test", scope="test_summary",
                                     summaries_dict=summaries_dict)
            
            if step % self.config['checkpointEvery'] == 0:
                self.model.save(self.sess)
                    
    
    def train_step(self):
        batch_x, batch_y = next(self.train.next_batch(self.config["batch_size"]))
        feed_dict = {self.model.inputX: batch_x,
                    self.model.inputY: batch_y, 
                    self.model.dropout_keep_prob: self.config['dropoutProb']}
        _, loss, predictions, step = self.sess.run([self.model.train_op,
                                                   self.model.loss,
                                                   self.model.predictions, 
                                                   self.model.global_step_tensor],
                                                  feed_dict=feed_dict)
        getMetric = Metric(predictions, batch_y, self.config)
        metrics = getMetric.get_metrics()
        return loss, metrics, step 
    
    def eval_step(self, batch_x, batch_y):
        feed_dict = {self.model.inputX: batch_x,
                    self.model.inputY:batch_y,
                    self.model.dropout_keep_prob: 1.0}
        loss, predictions = self.sess.run([self.model.loss, self.model.predictions],
                                         feed_dict=feed_dict)

        return loss, predictions

# 使用yelps数据集训练

In [189]:
import joblib

In [121]:
path = "../data/yelps/yelps.csv"
config = Config(path)

create_dirs([config['summary_dir'], config['checkpoint_dir']])

data = Dataset(config)
data.dataGen("../data/yelps/", prefix="yelps")

train_X, train_y, eval_X, eval_y = data.trainReviews, data.trainLabels, data.evalReviews, data.evalLabels
wordEmbedding, labels = data.wordEmbedding, data.labelList

In [193]:
train_data = DataGenerator(train_X, train_y)
eval_data = DataGenerator(eval_X, eval_y)
pack_data = [train_data, eval_data]

In [195]:
config['num_epochs'] = 5

create_dirs([config['summary_dir'], config['checkpoint_dir']])

tf.reset_default_graph()
## 设置计算图的配置
session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
session_conf.gpu_options.allow_growth = True
session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9 

sess = tf.Session(config=session_conf)

model = HAN(config, wordEmbedding)

logger = Logger(sess, config)

trainer = Trainer(sess, model, pack_data, config, logger)
trainer.train_all()


当前正处于第1次迭代


HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train ——　Step: 200 | Loss: 0.6856454610824585 | Acc: 0.84127 | F1_Score: 0.87179
Evaluation —— Loss: 0.6800900101661682 | Precision: 0.88695 | Recall: 0.83744
Saving model...
Model saved
Train ——　Step: 400 | Loss: 0.4370688498020172 | Acc: 0.88889 | F1_Score: 0.91954
Evaluation —— Loss: 0.5730000138282776 | Precision: 0.92467 | Recall: 0.77149
Saving model...
Model saved
Train ——　Step: 600 | Loss: 0.5032904744148254 | Acc: 0.8254 | F1_Score: 0.88172
Evaluation —— Loss: 0.44317999482154846 | Precision: 0.88223 | Recall: 0.92129
Saving model...
Model saved
Train ——　Step: 800 | Loss: 0.47491201758384705 | Acc: 0.90476 | F1_Score: 0.93182
Evaluation —— Loss: 0.4517799913883209 | Precision: 0.8657 | Recall: 0.94221
Saving model...
Model saved
Train ——　Step: 1000 | Loss: 0.28165656328201294 | Acc: 0.90476 | F1_Score: 0.92857
Evaluation —— Loss: 0.42076000571250916 | Precision: 0.87313 | Recall: 0.93269
Saving model...
Model saved
Train ——　Step: 1200 | Loss: 0.4833293557167053 | Acc: 0.79365 

HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train ——　Step: 1400 | Loss: 0.31217390298843384 | Acc: 0.85714 | F1_Score: 0.89157
Evaluation —— Loss: 0.4099099934101105 | Precision: 0.92464 | Recall: 0.84159
Saving model...
Model saved
Train ——　Step: 1600 | Loss: 0.37229737639427185 | Acc: 0.85714 | F1_Score: 0.89412
Evaluation —— Loss: 0.36840999126434326 | Precision: 0.90174 | Recall: 0.90399
Saving model...
Model saved
Train ——　Step: 1800 | Loss: 0.3807463049888611 | Acc: 0.85714 | F1_Score: 0.89412
Evaluation —— Loss: 0.38005998730659485 | Precision: 0.86114 | Recall: 0.95007
Saving model...
Model saved
Train ——　Step: 2000 | Loss: 0.3118847608566284 | Acc: 0.88889 | F1_Score: 0.91358
Evaluation —— Loss: 0.3496200144290924 | Precision: 0.89633 | Recall: 0.91373
Saving model...
Model saved
Train ——　Step: 2200 | Loss: 0.2534031867980957 | Acc: 0.88889 | F1_Score: 0.91765
Evaluation —— Loss: 0.3443799912929535 | Precision: 0.90405 | Recall: 0.90678
Saving model...
Model saved
Train ——　Step: 2400 | Loss: 0.2383711338043213 | Acc: 0.

HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train ——　Step: 2600 | Loss: 0.2147609293460846 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.3557400107383728 | Precision: 0.91099 | Recall: 0.88525
Saving model...
Model saved
Train ——　Step: 2800 | Loss: 0.15098153054714203 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.3401699960231781 | Precision: 0.91177 | Recall: 0.89462
Saving model...
Model saved
Train ——　Step: 3000 | Loss: 0.17149394750595093 | Acc: 0.93651 | F1_Score: 0.95
Evaluation —— Loss: 0.34046998620033264 | Precision: 0.88844 | Recall: 0.93126
Saving model...
Model saved
Train ——　Step: 3200 | Loss: 0.17516827583312988 | Acc: 0.95238 | F1_Score: 0.96386
Evaluation —— Loss: 0.35416001081466675 | Precision: 0.87971 | Recall: 0.94191
Saving model...
Model saved
Train ——　Step: 3400 | Loss: 0.23619872331619263 | Acc: 0.93651 | F1_Score: 0.95349
Evaluation —— Loss: 0.34321001172065735 | Precision: 0.89132 | Recall: 0.9237
Saving model...
Model saved
Train ——　Step: 3600 | Loss: 0.16236452758312225 | Acc: 0.

HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train ——　Step: 3800 | Loss: 0.14993001520633698 | Acc: 0.93651 | F1_Score: 0.95349
Evaluation —— Loss: 0.3515099883079529 | Precision: 0.89905 | Recall: 0.91698
Saving model...
Model saved
Train ——　Step: 4000 | Loss: 0.16456782817840576 | Acc: 0.93651 | F1_Score: 0.95349
Evaluation —— Loss: 0.34007999300956726 | Precision: 0.90189 | Recall: 0.9132
Saving model...
Model saved
Train ——　Step: 4200 | Loss: 0.1182461678981781 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.3443700075149536 | Precision: 0.88816 | Recall: 0.92741
Saving model...
Model saved
Train ——　Step: 4400 | Loss: 0.20902268588542938 | Acc: 0.92063 | F1_Score: 0.94118
Evaluation —— Loss: 0.34338000416755676 | Precision: 0.88695 | Recall: 0.93164
Saving model...
Model saved
Train ——　Step: 4600 | Loss: 0.10863339155912399 | Acc: 0.96825 | F1_Score: 0.97619
Evaluation —— Loss: 0.3399200141429901 | Precision: 0.89927 | Recall: 0.91517
Saving model...
Model saved
Train ——　Step: 4800 | Loss: 0.13014714419841766 | Acc: 

HBox(children=(IntProgress(value=0, max=1250), HTML(value='')))

Train ——　Step: 5200 | Loss: 0.31845641136169434 | Acc: 0.87302 | F1_Score: 0.90909
Evaluation —— Loss: 0.34318000078201294 | Precision: 0.88936 | Recall: 0.92907
Saving model...
Model saved
Train ——　Step: 5400 | Loss: 0.21163500845432281 | Acc: 0.92063 | F1_Score: 0.94253
Evaluation —— Loss: 0.34981000423431396 | Precision: 0.89963 | Recall: 0.91404
Saving model...
Model saved
Train ——　Step: 5600 | Loss: 0.17650136351585388 | Acc: 0.93651 | F1_Score: 0.95238
Evaluation —— Loss: 0.35097000002861023 | Precision: 0.90397 | Recall: 0.9052
Saving model...
Model saved
Train ——　Step: 5800 | Loss: 0.1434902399778366 | Acc: 0.96825 | F1_Score: 0.97674
Evaluation —— Loss: 0.3462899923324585 | Precision: 0.89733 | Recall: 0.91638
Saving model...
Model saved
Train ——　Step: 6000 | Loss: 0.27254602313041687 | Acc: 0.88889 | F1_Score: 0.91566
Evaluation —— Loss: 0.3505699932575226 | Precision: 0.89652 | Recall: 0.91562
Saving model...
Model saved
Train ——　Step: 6200 | Loss: 0.17915242910385132 | Acc:

验证集最终结果为　——　Precision: 0.89713, Recall: 0.91774

在这个实验过程中花费了比较长的时间，做一下记录：开始模型的梯度总是会变成全０，导致输出一直有问题，本来以为是模型的问题（也确实有几个敲错的地方），更正了敲错的地方之后仍然存在问题。多次调试之后发现了原因：

- **L2正则化系数过大，一般设置不要超过$10^{-3}$级别，当然也不要太小**；

- **学习率衰减的步数太大，由于本次使用的数据集比较小，所以衰减步数太大导致收敛太慢**。