In [2]:
from __future__ import print_function

import os

import numpy as np
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
# Token budget passed to tokenizer.encode when building training batches;
# AutoTitle.generate also subtracts the decoder's own maxlen from it.
maxlen = 256
# Number of samples per batch handed to data_generator.
batch_size = 16
# Number of epochs passed to model.fit.
epochs = 20

# Pretrained Chinese BERT-base files: config and checkpoint are passed to
# build_transformer_model, the vocabulary file to load_vocab.
config_path = "./model/bert_base/chinese_L-12_H-768_A-12/bert_config.json"
checkpoint_path = './model/bert_base/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './model/bert_base/chinese_L-12_H-768_A-12/vocab.txt'

# Inert string literal holding an unused HuggingFace `transformers` loading
# snippet. It is never executed; as the last expression of the cell its repr
# is echoed as the cell output.
'''
from transformers import (
   BertTokenizerFast,
   AutoModelForMaskedLM,
   AutoModelForCausalLM,
   AutoModelForTokenClassification,
)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = AutoModelForMaskedLM.from_pretrained('ckiplab/albert-tiny-chinese') # or other models above
'''

"\nfrom transformers import (\n   BertTokenizerFast,\n   AutoModelForMaskedLM,\n   AutoModelForCausalLM,\n   AutoModelForTokenClassification,\n)\n\ntokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')\nmodel = AutoModelForMaskedLM.from_pretrained('ckiplab/albert-tiny-chinese') # or other models above\n"

In [4]:
def load_data(filename):
    """Read a tab-separated file of ``title<TAB>content`` rows.

    Each line is stripped of surrounding whitespace and split on the first
    tab only, so any further tabs stay inside the content. Blank lines are
    skipped.

    Args:
        filename: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of ``(title, content)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            fields = line.split('\t', 1)
            if len(fields) != 2:
                raise ValueError(
                    '%s:%d: expected "title<TAB>content", got %r'
                    % (filename, line_no, line)
                )
            D.append((fields[0], fields[1]))
    return D


In [5]:
# Load every (title, content) pair from the single TSV file; the validation
# and test sets are sliced out of this list in a later cell.
train_data = load_data('./dataset/train.tsv')
# Inert string literal: disabled loading of separate validation/test files.
# It is not executed; its repr is echoed as the cell output.
'''
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')
'''

"\nvalid_data = load_data('/root/csl/val.tsv')\ntest_data = load_data('/root/csl/test.tsv')\n"

In [6]:
# Fixed split boundaries: [0, 9000) train, [9000, 9500) valid, [9500, 10000) test.
TRAIN_END, VALID_END, TEST_END = 9000, 9500, 10000

# This cell rebinds `train_data` to its first TRAIN_END rows, so running it a
# second time (or on a file that is too short) would silently yield empty
# validation/test sets. Fail loudly instead of training against nothing.
assert len(train_data) > VALID_END, (
    'train_data has only %d rows; re-run the loading cell before splitting'
    % len(train_data)
)

valid_data = train_data[TRAIN_END:VALID_END]
test_data = train_data[VALID_END:TEST_END]
train_data = train_data[:TRAIN_END]
np.shape(valid_data)

(500, 2)

In [7]:
# Load the vocabulary from dict_path. `keep_tokens` is passed to
# build_transformer_model later on.
# NOTE(review): simplified=True presumably prunes the vocabulary and
# `startswith` pins the listed special tokens to the first ids — confirm
# against the bert4keras load_vocab documentation.
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
# Tokenizer built on the loaded vocabulary, with lower-casing enabled.
tokenizer = Tokenizer(token_dict, do_lower_case=True)

In [8]:
class data_generator(DataGenerator):
    """Batch generator that encodes (title, content) pairs for training.

    Each sample is encoded as ``tokenizer.encode(content, title)`` truncated
    to the global ``maxlen``, i.e. the content is the first text and the
    title the second.
    """
    def __iter__(self, random=False):
        """Yield ``([token_ids, segment_ids], None)`` padded batches.

        A batch is emitted once it holds ``self.batch_size`` samples, or when
        ``self.sample`` flags the final sample.
        """
        token_rows, segment_rows = [], []
        for is_end, (title, content) in self.sample(random):
            ids, segments = tokenizer.encode(content, title, maxlen=maxlen)
            token_rows.append(ids)
            segment_rows.append(segments)

            batch_full = len(token_rows) == self.batch_size
            if not (batch_full or is_end):
                continue

            padded_tokens = sequence_padding(token_rows)
            padded_segments = sequence_padding(segment_rows)
            yield [padded_tokens, padded_segments], None
            # Start collecting the next batch from scratch.
            token_rows, segment_rows = [], []

In [9]:
class CrossEntropy(Loss):
    """Masked next-token cross-entropy loss layer.

    It is applied to ``model.inputs + model.outputs``, so ``inputs`` holds
    the token ids, the segment ids and the model predictions, in that order.
    """
    def compute_loss(self, inputs, mask=None):
        """Return the mask-weighted mean cross-entropy of shifted predictions.

        Targets and mask drop their first position while predictions drop
        their last, so the prediction at step t is scored against the token
        at step t + 1. The second input (segment ids) is used directly as the
        per-position weight.
        # NOTE(review): presumably segment id 1 marks the title tokens, so
        # only those contribute to the loss — confirm against the tokenizer.
        """
        token_ids, segment_ids, predictions = inputs
        targets = token_ids[:, 1:]
        weights = segment_ids[:, 1:]
        shifted_predictions = predictions[:, :-1]
        per_token = K.sparse_categorical_crossentropy(
            targets, shifted_predictions
        )
        return K.sum(per_token * weights) / K.sum(weights)

In [10]:
# Build the bert4keras transformer in 'unilm' application mode from the
# pretrained config/checkpoint, restricted to the tokens kept by load_vocab.
model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,
)

# Attach the loss layer to [token_ids, segment_ids, predictions].
# NOTE(review): the argument 2 is presumably the index of the input that the
# Loss layer passes through as its output (the predictions) — confirm.
output = CrossEntropy(2)(model.inputs + model.outputs)

# Rebind `model` to the trainable wrapper; AutoTitle.predict reads this global.
# No loss is given to compile(), as the loss comes from the CrossEntropy layer.
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             10432512    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]   

In [11]:
class AutoTitle(AutoRegressiveDecoder):
    """Autoregressive title decoder driven by the global ``model``."""

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        """Score the next token given the source and the tokens decoded so far.

        The decoded ids are appended to the source token ids, and a block of
        ones of the same shape is appended to the source segment ids.
        """
        source_tokens, source_segments = inputs
        decoded_segments = np.ones_like(output_ids)
        full_tokens = np.concatenate([source_tokens, output_ids], 1)
        full_segments = np.concatenate([source_segments, decoded_segments], 1)
        return self.last_token(model).predict([full_tokens, full_segments])

    def generate(self, text, topk=1):
        """Return the decoded title string for ``text`` using beam search.

        The source is truncated to ``maxlen - self.maxlen`` tokens, leaving
        room for the generated title within the global length budget.
        """
        source_budget = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=source_budget)
        best_ids = self.beam_search([token_ids, segment_ids], topk=topk)
        return tokenizer.decode(best_ids)


# Shared decoder instance. Its maxlen=32 is subtracted from the global
# `maxlen` in AutoTitle.generate to bound the encoded source length.
# Decoding uses the tokenizer's end-token id as end_id and no start_id.
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)


In [13]:
class Evaluator(keras.callbacks.Callback):
    """Callback that scores the model on ``valid_data`` after every epoch.

    Keeps track of the best BLEU seen so far and saves the global ``model``
    weights whenever that score improves.
    """
    def __init__(self):
        # Let the base Callback set up its own attributes before adding ours.
        super(Evaluator, self).__init__()
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``valid_data`` and checkpoint on a new best BLEU."""
        metrics = self.evaluate(valid_data)
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            model.save_weights('./best_model/best_model.weights')
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        """Compute mean ROUGE-1/2/L F-scores and BLEU over ``data``.

        Args:
            data: Iterable of ``(title, content)`` pairs.
            topk: Beam width forwarded to ``autotitle.generate``.

        Returns:
            Dict with keys ``'rouge-1'``, ``'rouge-2'``, ``'rouge-l'`` and
            ``'bleu'``. Samples whose prediction is empty add nothing to the
            sums but still count in the denominator. All metrics are 0.0 when
            ``data`` is empty.
        """
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            # Joining a string with spaces turns every character into a token.
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(
                    references=[title.split(' ')],
                    hypothesis=pred_title.split(' '),
                    smoothing_function=self.smooth
                )
        if total == 0:
            # Nothing to average over; avoid a ZeroDivisionError.
            return {
                'rouge-1': 0.,
                'rouge-2': 0.,
                'rouge-l': 0.,
                'bleu': 0.,
            }
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }


if __name__ == '__main__':

    # Train with per-epoch evaluation; the Evaluator callback saves the best
    # weights to './best_model/best_model.weights'.
    evaluator = Evaluator()
    train_generator = data_generator(train_data, batch_size)

    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )

else:

    # When not run as the main module, restore the checkpoint from the same
    # path the Evaluator callback writes to.
    model.load_weights('./best_model/best_model.weights')

Epoch 1/20


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




100%|██████████| 500/500 [23:33<00:00,  2.83s/it]


valid_data: {'rouge-1': 0.6158028539136735, 'rouge-2': 0.4951348180758301, 'rouge-l': 0.5907854962074148, 'bleu': 0.3741351440615922, 'best_bleu': 0.3741351440615922}
Epoch 2/20

100%|██████████| 500/500 [22:17<00:00,  2.67s/it]


valid_data: {'rouge-1': 0.6303881819544842, 'rouge-2': 0.5159175007689925, 'rouge-l': 0.6018499868342577, 'bleu': 0.4054622997095951, 'best_bleu': 0.4054622997095951}
Epoch 3/20

100%|██████████| 500/500 [22:58<00:00,  2.76s/it]


valid_data: {'rouge-1': 0.6398425442957897, 'rouge-2': 0.5215151045535682, 'rouge-l': 0.6101409477645894, 'bleu': 0.41144678693336645, 'best_bleu': 0.41144678693336645}
Epoch 4/20

100%|██████████| 500/500 [21:57<00:00,  2.63s/it]


valid_data: {'rouge-1': 0.6543956022717533, 'rouge-2': 0.5379111348429688, 'rouge-l': 0.6257551342575403, 'bleu': 0.42685993125671584, 'best_bleu': 0.42685993125671584}
Epoch 5/20

100%|██████████| 500/500 [22:59<00:00,  2.76s/it]

valid_data: {'rouge-1': 0.6480196514713105, 'rouge-2': 0.5304925572539216, 'rouge-l': 0.618494106390483, 'bleu': 0.42074405775929713, 'best_bleu': 0.42685993125671584}





Epoch 6/20

100%|██████████| 500/500 [22:16<00:00,  2.67s/it]


valid_data: {'rouge-1': 0.6608329383356449, 'rouge-2': 0.5438695462165654, 'rouge-l': 0.630275185410816, 'bleu': 0.4358682638637675, 'best_bleu': 0.4358682638637675}
Epoch 7/20

100%|██████████| 500/500 [21:38<00:00,  2.60s/it]


valid_data: {'rouge-1': 0.659563577450352, 'rouge-2': 0.5429167810856941, 'rouge-l': 0.6300751416493777, 'bleu': 0.43956474512983784, 'best_bleu': 0.43956474512983784}
Epoch 8/20

100%|██████████| 500/500 [22:40<00:00,  2.72s/it]

valid_data: {'rouge-1': 0.6543015097287772, 'rouge-2': 0.5371970492823226, 'rouge-l': 0.6229113236328293, 'bleu': 0.4337961845142361, 'best_bleu': 0.43956474512983784}





Epoch 9/20

100%|██████████| 500/500 [22:34<00:00,  2.71s/it]

valid_data: {'rouge-1': 0.658907389833466, 'rouge-2': 0.5389963325073988, 'rouge-l': 0.6260189168115922, 'bleu': 0.4346950010241607, 'best_bleu': 0.43956474512983784}





Epoch 10/20

100%|██████████| 500/500 [23:04<00:00,  2.77s/it]

valid_data: {'rouge-1': 0.6577414617170784, 'rouge-2': 0.5364974917938595, 'rouge-l': 0.6249216077659924, 'bleu': 0.43225270446180164, 'best_bleu': 0.43956474512983784}





Epoch 11/20

100%|██████████| 500/500 [21:29<00:00,  2.58s/it]

valid_data: {'rouge-1': 0.6532514769268096, 'rouge-2': 0.5324391080840587, 'rouge-l': 0.6217920989635052, 'bleu': 0.4253526344863265, 'best_bleu': 0.43956474512983784}





Epoch 12/20

100%|██████████| 500/500 [23:57<00:00,  2.88s/it]

valid_data: {'rouge-1': 0.662291536150974, 'rouge-2': 0.5410714141210865, 'rouge-l': 0.6282255752851701, 'bleu': 0.43527866256301384, 'best_bleu': 0.43956474512983784}





Epoch 13/20

100%|██████████| 500/500 [22:41<00:00,  2.72s/it]

valid_data: {'rouge-1': 0.6560350380753381, 'rouge-2': 0.5351827340365641, 'rouge-l': 0.622198318078268, 'bleu': 0.4293557398438032, 'best_bleu': 0.43956474512983784}





Epoch 14/20

100%|██████████| 500/500 [24:25<00:00,  2.93s/it]


valid_data: {'rouge-1': 0.663918848924954, 'rouge-2': 0.5451749799858874, 'rouge-l': 0.6313485775665693, 'bleu': 0.44398242698145746, 'best_bleu': 0.44398242698145746}
Epoch 15/20

100%|██████████| 500/500 [23:32<00:00,  2.82s/it]

valid_data: {'rouge-1': 0.6580831845254097, 'rouge-2': 0.5370870195602019, 'rouge-l': 0.6246869335287693, 'bleu': 0.43476821360323853, 'best_bleu': 0.44398242698145746}





Epoch 16/20

100%|██████████| 500/500 [23:07<00:00,  2.78s/it]

valid_data: {'rouge-1': 0.6665853120042355, 'rouge-2': 0.546766486183051, 'rouge-l': 0.6302490393035053, 'bleu': 0.4418812179562024, 'best_bleu': 0.44398242698145746}





Epoch 17/20

100%|██████████| 500/500 [24:02<00:00,  2.89s/it]

valid_data: {'rouge-1': 0.6608295575301463, 'rouge-2': 0.53913765258246, 'rouge-l': 0.6236618592851628, 'bleu': 0.4364103362856684, 'best_bleu': 0.44398242698145746}





Epoch 18/20

100%|██████████| 500/500 [24:08<00:00,  2.90s/it]

valid_data: {'rouge-1': 0.6590463899367772, 'rouge-2': 0.540132856037617, 'rouge-l': 0.6244007096911808, 'bleu': 0.4368373138606962, 'best_bleu': 0.44398242698145746}





Epoch 19/20

100%|██████████| 500/500 [23:42<00:00,  2.84s/it]

valid_data: {'rouge-1': 0.6494262525823564, 'rouge-2': 0.5280974838474484, 'rouge-l': 0.6154502923262284, 'bleu': 0.4241011663078994, 'best_bleu': 0.44398242698145746}





Epoch 20/20

100%|██████████| 500/500 [24:07<00:00,  2.89s/it]

valid_data: {'rouge-1': 0.6568439072329715, 'rouge-2': 0.5342690418332792, 'rouge-l': 0.6195137798868043, 'bleu': 0.430800833757064, 'best_bleu': 0.44398242698145746}





In [14]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Environment sanity check: print how many GPUs TensorFlow can see, then
# display the full list of local devices as the cell output.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
device_lib.list_local_devices()

Num GPUs Available:  1


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3463579379728475991, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 10175972952992078534
 physical_device_desc: "device: XLA_CPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 6934559456
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16742833666877936717
 physical_device_desc: "device: 0, name: GeForce RTX 2070 SUPER, pci bus id: 0000:9e:00.0, compute capability: 7.5", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 17294182559736239460
 physical_device_desc: "device: XLA_GPU device"]

In [15]:
# Sets the RECOMPUTE environment variable for this process.
# NOTE(review): bert4keras presumably reads RECOMPUTE when its backend module
# is first imported, so setting it after the import cell may have no effect —
# confirm, and move this before the bert4keras imports if it is needed.
os.environ["RECOMPUTE"]='1'

In [26]:
# Print up to five held-out samples: source text, reference title and the
# generated title. Slicing tolerates a test set with fewer than five rows.
for title, content in test_data[:5]:
    print('原文:'+content)
    print('人工摘要:'+title)
    print('機器摘要:'+autotitle.generate(content))

原文:為解決車載自組織網路負載較重時控制通道擁塞和通道利用率低的問題,提出一種分時多工機制的非同步車載自組織網多通道MAC(Media Access Control)協議——ATMP(Asynchronous TDMA-based multi-channel MAC Protocol).ATMP協議採用分時多工的非同步接入機制實現節點分時段接入控制通道,減少併發接入控制通道的節點數目,降低碰撞概率;進一步,為了解決多通道協調資訊丟失問題,ATMP協議使用節點協作機制來獲取節點遺漏的通道協調資訊,有效降低因協調資訊缺失造成的資料通道服務資訊碰撞概率.模擬結果表明,ATMP協議在碰撞概率、安全訊息時延及控制通道吞吐量指標上優於IEEE1609.4標準、AMCP協議和AMCMAC協議.
人工摘要:車載自組織網路中基於分時多工的非同步多通道MAC協議
機器摘要:分時多工機制的非同步車載自組織網多通道mac協議協議
原文:分析現有一些Vague集相似度量方法,並指出其不足。考慮在實際應用中,未知度對相似度量的影響,從動態的角度出發,挖掘未知度中包含的贊成與反對資訊,提出了一種基於未知度的Vague集相似度量新方法,並將該相似度量方法應用於模糊資料檢測中,通過實際應用說明該方法更加有效。
人工摘要:Vague集相似度量及其在模糊資料檢測中的應用
機器摘要:基於未知度的vague集相似度量方法
原文:目的:與CAG對照分析來評價64層螺旋CT冠狀動脈成像在房顫患者中診斷有血流動力學意義的冠狀動脈狹窄的準確性。方法:58例房顫患者行64層螺旋CT冠狀動脈成像,掃描前均未服用倍它樂克。血管影象質量分為好、中等和差。以CAG作為參考標準,分別基於血管節段和患者水平來分析MDCTCA診斷有血流動力學意義的冠狀動脈狹窄的敏感性、特異性、陽性預測價值和陰性預測價值。診斷價值的評價首先僅限於影象質量達到診斷要求的血管節段和患者,進一步的分析中將影象質量不能診斷的血管節段和患者均作為陽性來處理。結果:58例患者有645段(96.55%)影象質量為中等以上,診斷有血流動力學意義的敏感性、特異性、陽性預測價值和陰性預測價值分別是86.21%(25/29)、99.35%(612/616)、86.21%(25/29)和99.35%(612/616)。將23段影象質量沒有達到診斷要求的血管均作為陽性後,C

In [14]:
# Ad-hoc demo: generate a title for a hand-pasted passage that is not part of
# the dataset, and print the passage followed by the generated title.
s="在就一心有一個報導提出，目前針對早期新冠肺炎的唯一治療方法，那很多一聲卻不知道，所以想請問您不是這篇報導所說的，唯一治療方法是哪一種治療方法呢？今天的報導，這個也搞的有點神色醫師喔？現在看那個標題寫的好像有一個靈丹妙藥，大家都給忽視掉，那其實並不是這樣的。那篇報導裡面提到一九九四單克隆抗體療法，那實際上，美帝陸續有報導，那麼他這個改版都只是要強調就是很多醫生，其實並沒有很充分的陰影，有這樣一個工具就是當克隆抗體的這個治療的方案來，真的一些病人特別是對於早期患有無精症，阿或者是中度症狀，的病人，還有一些情況可以用來去向來是怕被來這就是預防性的治療，所以這份報告，我覺得就是提醒而已。省，其實你是有一個額外的工具是可以用的，並不是說，病人來了。"
print("原文:"+s)
print("摘要:"+autotitle.generate(s))

原文:在就一心有一個報導提出，目前針對早期新冠肺炎的唯一治療方法，那很多一聲卻不知道，所以想請問您不是這篇報導所說的，唯一治療方法是哪一種治療方法呢？今天的報導，這個也搞的有點神色醫師喔？現在看那個標題寫的好像有一個靈丹妙藥，大家都給忽視掉，那其實並不是這樣的。那篇報導裡面提到一九九四單克隆抗體療法，那實際上，美帝陸續有報導，那麼他這個改版都只是要強調就是很多醫生，其實並沒有很充分的陰影，有這樣一個工具就是當克隆抗體的這個治療的方案來，真的一些病人特別是對於早期患有無精症，阿或者是中度症狀，的病人，還有一些情況可以用來去向來是怕被來這就是預防性的治療，所以這份報告，我覺得就是提醒而已。省，其實你是有一個額外的工具是可以用的，並不是說，病人來了。
摘要:面向早期新冠肺炎的克隆抗體治療方法研究


In [13]:
# Restore the weights from the path the Evaluator callback saves to whenever
# validation BLEU improves.
model.load_weights('./best_model/best_model.weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2e925ec4b00>