In [2]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

dataset.tsv


In [3]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. All changes under this directory will be kept even after reset. Please clean unnecessary files in time to speed up environment loading.
!ls /home/aistudio/work

In [4]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, you need to use the persistence path as the following:
!mkdir /home/aistudio/external-libraries
!pip install beautifulsoup4 -t /home/aistudio/external-libraries

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Collecting beautifulsoup4
[?25l  Downloading https://mirror.baidu.com/pypi/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 12.6MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://mirror.baidu.com/pypi/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.0.1


In [5]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可:
# Also add the following code, so that every time the environment (kernel) starts, just run the following code:
import sys
sys.path.append('/home/aistudio/external-libraries')

In [1]:
!pip install paddlehub==1.8.2
import paddlehub as hub

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Collecting paddlehub==1.8.2
[?25l  Downloading https://mirror.baidu.com/pypi/packages/b8/8d/46b67feae675d0ac106234b3c5806ba6198719fe850d61381c3311cdea6c/paddlehub-1.8.2-py3-none-any.whl (336kB)
[K     |████████████████████████████████| 337kB 14.0MB/s eta 0:00:01
Installing collected packages: paddlehub
  Found existing installation: paddlehub 2.0.4
    Uninstalling paddlehub-2.0.4:
      Successfully uninstalled paddlehub-2.0.4
Successfully installed paddlehub-1.8.2


In [2]:
import numpy as np
import pandas as pd
import json
import time
import re

In [3]:
model=hub.Module(name="bert_chinese_L-12_H-768_A-12")

[2021-03-12 10:17:42,212] [    INFO] - Installing bert_chinese_L-12_H-768_A-12 module


Downloading bert_chinese_L-12_H-768_A-12
Uncompress /home/aistudio/.paddlehub/tmp/tmp97cr5aeb/bert_chinese_L-12_H-768_A-12


[2021-03-12 10:18:05,722] [    INFO] - Successfully installed bert_chinese_L-12_H-768_A-12-1.1.0


In [4]:
# data in result.json is the labeled ming data
with open('/home/aistudio/result.json') as jf:
    ming=json.load(jf)


#unpack .json file, wash the data and transfer them into ndarray
def dataGen(ming):
    person_ids=np.array(list(ming.keys()))
    person_ids.sort()

    #idNum=len(person_ids)

    x_data=[]
    y_data=[]

    indexer=0
    for person_id in person_ids:
        
        char_tag=ming[person_id]['char_tag']
        x_data.append([])
        y_data.append([])
        omit_len=len(person_id)
        for i in range(omit_len+1,len(char_tag)):
            x_data[indexer].append(char_tag[i][0])
            y_data[indexer].append(char_tag[i][1])
         
        indexer=indexer+1

    for i in range(0,len(y_data)):
      for j in range(0,len(y_data[i])):
        old_text=y_data[i][j]
        #convert labels like 'B_date_reign' into 'B-date-reign', which is readable by kashgari
        new_text=old_text.replace("_","-")
        y_data[i][j]=new_text


    return x_data,y_data,person_ids


#construct train,validate and test set
#train_set_rate indicates the proportion of trainning data
#validate_set_rate indicates the proportion of validation data
def splitTrain(x_data,y_data,person_ids,train_set_rate,validate_set_rate):
    x_data=np.array(x_data)
    y_data=np.array(y_data)
    
    temp=np.array([x_data,y_data])
    temp=temp.T
    
    ming_data=pd.DataFrame(temp,index=person_ids,columns=['text_a','label'])

    #test on converting data 2 tsv
    for i in range(0,len(ming_data['text_a'])):
        #print(i)
        ming_data['text_a'][i]='\002'.join(ming_data['text_a'][i])
        ming_data['label'][i]='\002'.join(ming_data['label'][i])
        ##ming_data['text_a'][i]+='\002'
        ##ming_data['label'][i]+='\002'
        #ming_data['text_a'][i]=str(ming_data['text_a'][i])
        #ming_data['label'][i]=str(ming_data['label'][i])
        
        

    np.random.seed(int(time.time()))
    ming_data=ming_data.sample(frac=1.0)
    
    idNum=len(person_ids)
    train_size=int(np.floor(idNum*train_set_rate))
    validate_size=int(np.floor(idNum*validate_set_rate))

    train_set=ming_data[0:train_size]

    validate_set=ming_data[train_size:train_size+validate_size]
    test_set=ming_data[train_size+validate_size:idNum]
    return train_set,validate_set,test_set,ming_data


#Build the first model: 50% of ming as train and 20% of ming as test
x_data,y_data,person_ids=dataGen(ming)
t,v,testing,ming_data=splitTrain(x_data,y_data,person_ids,0.75,0.25)

#generate x,y of train,validate and test
x_train=np.array(t['text_a'])
y_train=np.array(t['label'])
x_validate=np.array(v['text_a'])
y_validate=np.array(v['label'])

x_test=np.array(testing['text_a'])
y_test=np.array(testing['label'])

t.to_csv('train.csv')
v.to_csv('validate.csv')

train_len=[]
for i in range(0,len(x_train)):
  train_len.append(len(x_train[i]))

max_len=max(train_len)


tag_list = tag_list = ["O",
        "B_date_reign", "I_date_reign",
        "B_date_year", "I_date_year",
        "B_office_voa", "I_office_voa",
        "B_office_title", "I_office_title",
        "B_place_placename", "I_place_placename"]

In [5]:
# Part of Task1: save the data into tsv
t.to_csv('/home/aistudio/train.tsv',sep='\t',columns=['text_a','label'],encoding='utf_8_sig',index=None)
v.to_csv('/home/aistudio/validate.tsv',sep='\t',columns=['text_a','label'],encoding='utf_8_sig',index=None)
testing.to_csv('/home/aistudio/testing.tsv',sep='\t',columns=['text_a','label'],encoding='utf_8_sig',index=None)
ming_data.to_csv('/home/aistudio/data/dataset.tsv',sep='\t',columns=['text_a','label'],encoding='utf_8_sig',index=None)

predict_data=pd.read_table('/home/aistudio/test_data.txt')
text=predict_data['content_without_name']
text.to_csv('/home/aistudio/predict.tsv',encoding='utf_8_sig')

In [6]:
bio=predict_data['content_without_name']
bio_ids=predict_data['id']


In [8]:
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

# Construct the dataset for task1
class DemoDataset(BaseNLPDataset):
    """DemoDataset"""
    def __init__(self):
        # 数据集存放位置
        self.dataset_dir = "/home/aistudio/"
        super(DemoDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train.tsv",
            dev_file="validate.tsv",
            test_file="testing.tsv",
            # 如果还有预测数据（不需要文本类别label），可以放在predict.tsv
            predict_file="predict.tsv",
            train_file_with_header=True,
            dev_file_with_header=True,
            test_file_with_header=True,
            predict_file_with_header=True,
            # 数据集类别集合
            label_list=["O",
        "B-date-reign", "I-date-reign",
        "B-date-year", "I-date-year",
        "B-office-voa", "I-office-voa",
        "B-office-title", "I-office-title",
        "B-place-placename", "I-place-placename"])
task1_dataset = DemoDataset()

In [9]:
#生成reader，对数据进行预处理
reader = hub.reader.SequenceLabelReader(
        dataset=task1_dataset,
        vocab_path=model.get_vocab_path(),
        max_seq_len=512)

#选择fine-tune的优化策略
strategy=hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    learning_rate=1e-4,
    warmup_proportion=0.1
)

#定义运行配置
config=hub.RunConfig(
    use_cuda=True,
    num_epoch=80,
    checkpoint_dir="chinese_wwm_base_seq_label_demo",
    batch_size=16,
    eval_interval=50,
    strategy=strategy
)

[2021-03-12 10:21:12,291] [    INFO] - Dataset label map = {'O': 0, 'B-date-reign': 1, 'I-date-reign': 2, 'B-date-year': 3, 'I-date-year': 4, 'B-office-voa': 5, 'I-office-voa': 6, 'B-office-title': 7, 'I-office-title': 8, 'B-place-placename': 9, 'I-place-placename': 10}
[2021-03-12 10:21:12,340] [    INFO] - Checkpoint dir: chinese_wwm_base_seq_label_demo


In [10]:
#组建fine tune task(for task 1)
inputs, outputs, program = model.context(
    trainable=True, max_seq_len=512)

sequence_output=outputs["sequence_output"]

feed_list=[
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name
]

#请确保paddlehub 版本为1.8.2，否则会报错（parameter has no attribute...)
seq_label_task1 = hub.SequenceLabelTask(
    data_reader=reader,
    feature=sequence_output,
    feed_list=feed_list,
    max_seq_len=512,
    num_classes=task1_dataset.num_labels,
    config=config,
    add_crf=True)




In [11]:
#执行finetune任务
#Execute finetune
task1_rate=seq_label_task1.finetune_and_eval()

[2021-03-12 10:21:27,636] [    INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization, 
[2021-03-12 10:21:27,724] [    INFO] - Try loading checkpoint from chinese_wwm_base_seq_label_demo/ckpt.meta
[2021-03-12 10:21:29,879] [    INFO] - PaddleHub model checkpoint loaded. current_epoch=81, global_step=33280, best_score=0.99820
[2021-03-12 10:21:29,881] [    INFO] - PaddleHub finetune start
[2021-03-12 10:21:29,883] [    INFO] - PaddleHub finetune finished.


In [12]:
#执行预测任务
#Execute predict task
#import test_data to predict
predict=[predict_data['content_without_name']]
predict=[]
for i in range(0,len(predict_data['content_without_name'])):
    predict.append([predict_data['content_without_name'][i]])

print('done')

#pred=seq_label_task_1.predict(data=predict_data['content_without_name'])

done


In [13]:
#predict
pred=seq_label_task1.predict(data=predict)

[2021-03-12 10:21:38,042] [    INFO] - PaddleHub predict start
[2021-03-12 10:21:38,044] [    INFO] - Load the best model from chinese_wwm_base_seq_label_demo/best_model
[2021-03-12 10:23:02,476] [    INFO] - PaddleHub predict finished.


In [14]:
# The output of paddlehub is written in a strange order
# The following codes convert the output labels into readable tags
results=[p.run_results for p in pred]

inv_label_map={0:'O',1:'B-date-reign',2:'I-date-reign',3:'B-date-year',4:'I-date-year',
5:'B-office-voa',6:'I-office-voa',7:'B-office-title',8:'I-office-title',9:'B-place-placename',
10:'I-place-placename'}

tags=[]

for num_batch, batch_results in enumerate(results):
    infers = batch_results[0].reshape([-1]).astype(np.int32).tolist()
    
    #acquire the length of each text in batch #num_batch
    np_lens = batch_results[1]
    
    
    vernier=0

    for index, np_len in enumerate(np_lens):
        
        #labels = infers[index * 400:(index + 1) * 400]
        labels=infers[vernier:vernier+np_len]
        vernier=vernier+np_len

        label_str = []
        count = 0
        for label_val in labels:
            label_str.append(inv_label_map[label_val])
            count += 1
            if count == np_len-1:
                break
        tags.append(label_str)


for i in range(0,len(tags)):
    #print(i)
    tags[i]=tags[i][1:len(tags[i])]

In [16]:
text=[]

#Write predicted tags into .txt files

for i in range(0,len(predict)):
    text.append(predict[i][0])

data={'text':text,'tags_predicted':tags}

frame=pd.DataFrame(data)

for i in range(0,len(text)):
    print(i)
    ner_result={'char':list(text[i]),'tag':list(tags[i])}
    frame=pd.DataFrame(ner_result,columns=['char','tag'])
    frame.to_csv('/home/aistudio/jin_ner_0203/'+str(bio_ids[i])+'.txt',sep=" ")

0


FileNotFoundError: [Errno 2] No such file or directory: '/home/aistudio/jin_ner_0312/1.txt'

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 