In [1]:
import tensorflow as tf 
import tensorflow_hub as hub 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from datetime import datetime

In [2]:
import bert
from bert import run_classifier
from bert import tokenization
from bert import optimization

W0905 10:55:22.463685 139948305139456 deprecation_wrapper.py:119] From /home/chen/anaconda3/lib/python3.7/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [17]:
import os 

# 1. 使用IMDB数据集

## 1.1 数据预处理

In [4]:
# Load the labelled IMDB reviews; later cells read the `review` and `sentiment` columns.
# NOTE(review): the file name "labeldTrain.csv" looks misspelled — confirm it matches the file on disk.
data = pd.read_csv("../data/imdb/labeldTrain.csv")

In [5]:
# Split into a training set and a test set: the first 80% of the rows are
# used for training and the remaining 20% for testing.
# A shuffled, stratified `train_test_split` was tried earlier and is disabled,
# so this split is purely positional (row order in the CSV is preserved).
split_index = int(data.shape[0] * 0.8)
train = data.iloc[:split_index]
test = data.iloc[split_index:]

&emsp;需要将数据转换成BERT可以理解的形式，主要分为两步。

第一步，使用BERT提供的构造器创建`InputExample`实例，其参数含义如下：

- `text_a`：表示要分类的文本
- `text_b`：在计算两个语句关系的时候使用，比如翻译，问答等；所以这里只需要设置`text_b=None`
- `label`：样本的标签

In [6]:
# Column of `data` holding the review text (used as `text_a`).
DATA_COLUMN = "review"
# Column of `data` holding the sentiment label (used as `label`).
LABEL_COLUMN = "sentiment"

In [7]:
# `guid` is a globally unique id used for bookkeeping; it plays no role in
# this example, so it is left as None.
def _row_to_input_example(row):
    """Wrap one DataFrame row as a single-sentence ``InputExample``."""
    return bert.run_classifier.InputExample(guid=None,
                                            text_a=row[DATA_COLUMN],
                                            text_b=None,
                                            label=row[LABEL_COLUMN])


train_InputExamples = train.apply(_row_to_input_example, axis=1)
test_InputExamples = test.apply(_row_to_input_example, axis=1)

第二步，对数据进行预处理，使其可以用于BERT的训练。

- 将所有单词小写
- 分词
- 将words分割成**wordpieces**
- 将单词转换为词表中的索引
- 增加`CLS`和`SEP`字符
- 对输入增加`pos-embedding`和`segment-embedding`

In [8]:
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"


def create_tokenizer_from_hub_module():
    """Build a ``FullTokenizer`` from the settings stored in the hub module.

    The module's ``tokenization_info`` signature is evaluated in a throwaway
    graph/session to read the vocabulary file and the lower-casing flag.

    Returns:
        A ``bert.tokenization.FullTokenizer`` configured with those values.
    """
    with tf.Graph().as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]

        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

    return bert.tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)


tokenizer = create_tokenizer_from_hub_module()

W0905 11:00:34.705354 139948305139456 deprecation_wrapper.py:119] From /home/chen/anaconda3/lib/python3.7/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



使用上面定义好的分词器，需要用`run_classifier.convert_examples_to_features`将数据转换成BERT理解的形式

In [9]:
# Sort the distinct labels so that a label's position in `label_list` (the id
# the BERT feature converter assigns to it) does not depend on which label
# happens to appear first in the CSV. `unique()` alone returns values in order
# of first appearance, which can swap the positive and negative class ids.
label_list = sorted(data['sentiment'].unique().tolist())

In [11]:
# Maximum sequence length passed to the feature converter.
MAX_SEQ_LENGTH = 128
# Convert the training and test examples (in that order) into the feature
# objects BERT consumes.
train_features, test_features = (
    bert.run_classifier.convert_examples_to_features(examples, label_list,
                                                     MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, test_InputExamples)
)

W0905 11:05:25.524482 139948305139456 deprecation_wrapper.py:119] From /home/chen/anaconda3/lib/python3.7/site-packages/bert/run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.



# 2. 构造模型

首先，再次加载`BERT hub`模型，这次是用来提取计算图；接着，创建一个新的层，训练BERT来用于情感分类任务。这种使用基本上已经训练好的模型的策略叫做`fine-tuning`

In [41]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
    """Build the classification graph: the hub BERT module plus one softmax layer.

    The module named by ``BERT_MODEL_HUB`` is loaded as trainable (fine-tuning)
    and a single dense layer is applied to its ``pooled_output``.

    Args:
        is_predicting: True when building the graph for prediction; the loss is
            then not built and dropout is not applied.
        input_ids, input_mask, segment_ids: tensors passed to the module's
            ``tokens`` signature.
        labels: integer label ids; only used when ``is_predicting`` is False.
        num_labels: number of output classes.

    Returns:
        ``(predicted_labels, log_probs)`` when ``is_predicting`` is True,
        otherwise ``(loss, predicted_labels, log_probs)``.
    """
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(input_ids=input_ids,
                       input_mask=input_mask,
                       segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # `pooled_output` is the sentence-level representation;
    # `sequence_output` would be used for token-level tasks instead.
    output_layer = bert_outputs['pooled_output']
    # Width of the representation feeding the classifier.
    hidden_size = output_layer.shape[-1].value

    # Our own final classification layer.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.initializers.glorot_normal())
    output_bias = tf.get_variable("output_bias",
                                  [num_labels],
                                  initializer=tf.initializers.glorot_normal())

    with tf.variable_scope("loss"):
        # Dropout is a training-time regulariser. Skipping it when predicting
        # keeps repeated predictions for the same input deterministic.
        # NOTE(review): this function cannot tell TRAIN from EVAL, so dropout
        # is still active during evaluation.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # argmax over the class axis already yields one id per example. No
        # tf.squeeze here: it would collapse a batch of one into a scalar and
        # drop the batch dimension.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        # Prediction: return the predicted ids and the per-class log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # Training / evaluation: cross-entropy against the one-hot labels.
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

In [52]:
# Wrap the model so it can be used for training, evaluation and prediction.
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    """Return a ``model_fn`` closure suitable for ``tf.estimator.Estimator``.

    Args:
        num_labels: number of output classes, forwarded to ``create_model``.
        learning_rate: learning rate forwarded to the BERT optimizer.
        num_train_steps: total number of training steps, forwarded to the optimizer.
        num_warmup_steps: number of warm-up steps, forwarded to the optimizer.

    Returns:
        A function ``model_fn(features, labels, mode, params)`` that returns an
        ``EstimatorSpec`` for the requested mode.
    """
    def model_fn(features, labels, mode, params):
        """Build the graph and the ``EstimatorSpec`` for ``mode``.

        All inputs, including the label ids, are read from ``features``;
        ``labels`` and ``params`` are not used in the body.
        """
        input_ids = features['input_ids']
        input_mask = features['input_mask']
        segment_ids = features['segment_ids']
        label_ids = features['label_ids']

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        if is_predicting:
            (predicted_labels, log_probs) = create_model(is_predicting, input_ids,
                                                         input_mask, segment_ids, label_ids, num_labels)
            predictions = {
                # NOTE: despite the key name these are log-probabilities
                # (the output of log_softmax), not probabilities.
                "probabilities": log_probs,
                "labels": predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # Training or evaluation phase.
        (loss, predicted_labels, log_probs) = create_model(is_predicting, input_ids,
                                                           input_mask, segment_ids, label_ids, num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Build the optimizer only when training, so the evaluation graph
            # carries no optimizer ops or slot variables.
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        def metric_fn(label_ids, predicted_labels):
            """Build the evaluation metric ops from true and predicted label ids."""
            accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
            f1_score = tf.metrics.f1_score(label_ids, predicted_labels)
            # NOTE(review): AUC is computed from hard predicted labels, not scores.
            auc = tf.metrics.auc(label_ids, predicted_labels)
            recall = tf.metrics.recall(label_ids, predicted_labels)
            precision = tf.metrics.precision(label_ids, predicted_labels)

            true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
            true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
            false_pos = tf.metrics.false_positives(label_ids, predicted_labels)
            false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)

            return {
                "eval_accuracy": accuracy,
                "f1_score": f1_score,
                "auc": auc,
                "precision": precision,
                "recall": recall,
                "true_positives": true_pos,
                "true_negatives": true_neg,
                "false_positives": false_pos,
                "false_negatives": false_neg
            }

        # Metrics are only needed (and only built) for evaluation.
        eval_metrics = metric_fn(label_ids, predicted_labels)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)

    return model_fn

In [53]:
# Hyperparameters.
BATCH_SIZE = 32  # passed to the Estimator as params["batch_size"]; also used to derive the step count
LEARNING_RATE = 2e-5  # forwarded to the optimizer through model_fn_builder
NUM_TRAIN_EPOCHS = 3.0  # passes over the training features, used to derive num_train_steps
WARMUP_PROPORTION = 0.1  # fraction of num_train_steps used as warm-up steps
SAVE_CHECKPOINTS_STEPS = 500  # RunConfig save_checkpoints_steps
SAVE_SUMMARY_STEPS = 100  # RunConfig save_summary_steps

In [54]:
# Total optimizer steps = (examples / batch size) * epochs, truncated to an int;
# the warm-up phase is a fixed proportion of that total.
steps_per_epoch = len(train_features) / BATCH_SIZE
num_train_steps = int(steps_per_epoch * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


In [55]:
# Directory handed to RunConfig as `model_dir` for this run.
OUTPUT_DIR = "../model/BERT/imdb/model"

In [56]:
def create_dirs(paths):
    """Create every directory in ``paths``, including missing parents.

    Directories that already exist are left untouched, so the call is safe to
    repeat (e.g. when the notebook cell is re-run).

    Args:
        paths: iterable of directory paths to create.
    """
    for path in paths:
        # The previous check was inverted (it only called makedirs when the
        # path already existed); exist_ok=True creates missing directories and
        # silently accepts existing ones.
        os.makedirs(path, exist_ok=True)

In [57]:
# Request creation of the model output directory.
create_dirs(paths=[OUTPUT_DIR])

In [58]:
# Basic run configuration: where the model is stored and how often summaries
# and checkpoints are written.
run_config = tf.estimator.RunConfig(
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    model_dir=OUTPUT_DIR,
)

In [59]:
# Bind the hyperparameters into a model function ...
model_fn = model_fn_builder(num_labels=len(label_list),
                            learning_rate=LEARNING_RATE,
                            num_train_steps=num_train_steps,
                            num_warmup_steps=num_warmup_steps)

# ... and hand it to the Estimator together with the run configuration.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    params={"batch_size": BATCH_SIZE},
    config=run_config,
)

In [60]:
# drop_remainder=True is the setting for TPU runs; it is False here.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    drop_remainder=False,
    is_training=True,
)

## 2.1 训练阶段

In [None]:
# Fine-tune for `num_train_steps` steps and report the wall-clock duration.
print("Begining Training!")
current_time = datetime.now()
estimator.train(max_steps=num_train_steps, input_fn=train_input_fn)
print("Training took time: ", datetime.now() - current_time)

Begining Training!


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0905 15:34:35.813420 139948305139456 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 48 vs previous value: 48. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
W0905 15:58:49.302065 139948305139456 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 63 vs previous value: 63. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
W0905 16:11:29.363326 139948305139456 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 71 vs previous value: 71. You could increase the global step by passing tf.train.get_

## 2.2 评估阶段

In [None]:
# Input function over the held-out features (evaluation mode).
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    drop_remainder=False,
    is_training=False,
)

# steps=None is passed so no explicit step limit is set; the returned metrics
# dict is the cell's displayed output.
estimator.evaluate(steps=None, input_fn=test_input_fn)

## 2.3 预测阶段

In [None]:
def get_prediction(in_sentences):
    """Classify raw sentences with the fine-tuned estimator.

    Args:
        in_sentences: list of sentences (strings) to classify.

    Returns:
        A list of ``(sentence, log_probs, label_name)`` tuples, one per input
        sentence, where ``log_probs`` is the model's "probabilities" output and
        ``label_name`` is "Negative" or "Positive".
    """
    # Display names indexed by the sentiment value.
    # Assumes sentiment values are 0 (negative) and 1 (positive) — TODO confirm.
    labels = ["Negative", "Positive"]
    # The feature converter needs some label from `label_list`; it is a
    # placeholder and does not influence the prediction.
    placeholder_label = label_list[0]
    input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=placeholder_label)
                      for x in in_sentences]
    input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH,
                                                       is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    # The model predicts a position in `label_list`; map it back to the
    # sentiment value before looking up the display name, so the result is
    # correct whatever the order of `label_list`.
    return [(sentence, prediction['probabilities'], labels[label_list[prediction['labels']]])
            for sentence, prediction in zip(in_sentences, predictions)]

In [None]:
# A few hand-written reviews to sanity-check the fine-tuned model.
pred_sentences = [
    "That movie was absolutely awful",
    "The acting was a bit lacking",
    "This film was creative and surprising",
    "Absolutely fantastic!"
]

# The helper defined above is `get_prediction`; `getPrediction` does not exist.
predictions = get_prediction(pred_sentences)