<a href="https://colab.research.google.com/github/a07458666/TBrainNLP/blob/master/2_BERT_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies at exact, pinned versions so a fresh
# runtime gets the same stack on every run.
# NOTE(review): presumably Kashgari 1.1.5 requires the TF 1.15 / Keras 2.3.1
# line pinned below — confirm against the Kashgari release notes.
!pip install kashgari==1.1.5
!pip install tensorflow-gpu==1.15.0
!pip install keras-applications==1.0.8
!pip install keras==2.3.1

In [None]:
import os
import re
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Attach Google Drive to the Colab VM, then make the project data folder the
# working directory so the relative file names used below resolve inside it.
mount_point = '/content/drive'
data_dir = r"/content/drive/My Drive/tbrainData/"

drive.mount(mount_point)
os.chdir(data_dir)

## Load Data

In [None]:
# Read the three saved arrays from the working directory.
# allow_pickle=True lets NumPy unpickle object arrays, which can execute
# arbitrary code — only load files that come from a trusted source.
all_content, all_BIO, all_article_index = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('all_content.npy', 'all_BIO.npy', 'all_article_index.npy')
)

In [None]:
# One row per sample: its content, its BIO tag sequence and the index of the
# article it belongs to.
dataset = pd.DataFrame({'content': all_content, 'BIO': all_BIO, 'all_article_index': all_article_index})

# NoName is 1 when every tag in the row is "O" and 0 otherwise.
# A plain generator is used instead of `np.array(x) == "O"`: on older NumPy
# that comparison collapses to a scalar bool for an empty tag list, which makes
# `all(...)` raise TypeError. An empty tag list now yields 1.
dataset["NoName"] = dataset["BIO"].apply(lambda tags: int(all(tag == "O" for tag in tags)))
dataset.head()

## Split train / validation / test dataset

In [None]:
def train_validate_test_split(input_df, train_percent=.6, validate_percent=.2, seed=None):
    """Split the rows of ``input_df`` into train / validation / test sets by article.

    The unique values of the ``all_article_index`` column are shuffled and cut
    into three groups, so all rows that share an article index always land in
    the same split.

    Args:
        input_df: DataFrame with ``content``, ``BIO`` and ``all_article_index``
            columns.
        train_percent: Fraction of the unique article indices used for training.
        validate_percent: Fraction of the unique article indices used for
            validation. Whatever remains after train and validation goes to test.
        seed: Value passed to ``np.random.seed`` before shuffling. Note that
            this re-seeds NumPy's global random state as a side effect.

    Returns:
        A ``(train_data, valid_data, test_data)`` tuple. Each element is a list
        ``[contents, bio_tags, article_indices]`` of three equally long lists,
        with rows in the same order as they appear in ``input_df``.
    """
    article_ids = np.unique(input_df['all_article_index']).tolist()
    article_count = len(article_ids)

    # Kept as a global re-seed so existing callers get the same shuffle as before.
    np.random.seed(seed)
    order = np.random.permutation(range(article_count)).tolist()

    train_end = int(train_percent * article_count)
    validate_end = int(validate_percent * article_count) + train_end
    shuffled_ids = [article_ids[i] for i in order]

    def _rows_for(selected_ids):
        # Select from the frame that was passed in (not a notebook global) and
        # compute the membership mask only once per split.
        rows = input_df[input_df['all_article_index'].isin(selected_ids)]
        return [rows['content'].tolist(), rows['BIO'].tolist(), rows['all_article_index'].tolist()]

    train_data = _rows_for(shuffled_ids[:train_end])
    valid_data = _rows_for(shuffled_ids[train_end:validate_end])
    test_data = _rows_for(shuffled_ids[validate_end:])

    return train_data, valid_data, test_data

In [None]:
# Article-level split: 50% train, 30% validation, remainder test, with a fixed
# seed so the split is the same on every run.
train_data, valid_data, test_data = train_validate_test_split(
    input_df=dataset,
    train_percent=.5,
    validate_percent=.3,
    seed=777,
)

# Each split is a three-item list: contents, BIO tags, article indices.
train_x, train_y, train_idx = train_data
valid_x, valid_y, valid_idx = valid_data
test_x, test_y, test_idx = test_data

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}")


## Download BERT model

In [None]:
# Fetch the Chinese BERT checkpoint archive; `-nc` (no-clobber) skips the
# download when the zip is already present in the working directory.
!wget -nc "https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip"
# Unpack the archive; `-o` overwrites existing files without prompting.
!unzip -o "chinese_L-12_H-768_A-12.zip"

## Model

In [None]:
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiGRU_Model
from kashgari.corpus import ChineseDailyNerCorpus

# Wrap the pre-trained Chinese BERT checkpoint directory (unzipped in the
# "Download BERT model" section) as a Kashgari embedding for sequence labeling.
# NOTE(review): presumably inputs longer than sequence_length=128 tokens are
# truncated — confirm against the Kashgari documentation.
bert_embedding = BERTEmbedding('chinese_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)
model = BiGRU_Model(bert_embedding)

# Train on the article-level split prepared in the split section.
# Do not reload ChineseDailyNerCorpus into train_x / valid_x / test_x here:
# re-binding those names discards the project data and trains the model on the
# demo corpus instead.
model.fit(train_x, train_y, valid_x, valid_y, epochs=5)