In [1]:
import progressbar

from preprocessing import *
from crfsuite_model import *
from submit import *

In [2]:
# Tokenisation granularity used throughout the notebook: it is spliced into the
# './data/<level>_level_*_set/' directory names and passed to the feature/label
# helpers. The result-generation cell distinguishes 'word' and 'char'.
level = 'char'

### Step 1. 预处理
预处理只是简单地去除跨行命名实体的多段起止位置，只保留首尾两个数字，并存成csv格式。

例如糖尿\n病 ann位置为`1 2;3 4`，在csv中改为`1 4`

In [3]:
# Step 1: announce the stage, then run the project's annotation clean-up helper.
# NOTE(review): clean_ann comes from one of the star-imported project modules;
# per the notebook text it collapses multi-part entity offsets into a single
# start/end pair and stores the result as csv -- confirm against its source.
print('1. Preprocessing ...')
clean_ann()

1. Preprocessing ...


### Step 2. 生成单字的训练集和测试集
这里由于是最后代码审核，就直接使用全量数据集训练了

In [4]:
# Step 2: announce the stage, then build the character-level train and test
# sets. Both helpers are called without arguments, so their input/output
# locations are decided inside the project modules.
# NOTE(review): presumably they write into './data/char_level_train_set/' and
# './data/char_level_test_set/', which later cells read -- confirm.
print('2. Generating train/test set from raw text ...')
generate_char_level_train_set()
generate_char_level_test_set()

2. Generating train/test set from raw text ...


### Step 3. 载入训练集测试集

In [5]:
print('3. Loading train data ...')
train_sents = load_data('./data/' + level + '_level_train_set/', level)
# Turn every sentence into a feature sequence (this pass is wrapped in a
# progress bar) and, separately, into its label sequence.
x_train = list(map(lambda sent: sent2features(sent, level),
                   progressbar.progressbar(train_sents)))
y_train = list(map(lambda sent: sent2labels(sent, level), train_sents))
# CRF training is memory-hungry, so explicitly drop the raw sentences once the
# features and labels have been derived from them.
del train_sents

3. Loading train data ...


100% (363 of 363) |######################| Elapsed Time: 0:01:02 Time:  0:01:02


### Step 4. 训练CRF模型
使用pycrfsuite训练CRF模型，时间需求约一天，模型约300M，要求16G内存（Windows系统内存需求可能更高点）。

主要可调参数为C1、C2以及迭代次数，由于训练时间较长，这里就不执行了。

可以在model.train()中调整模型存放目录。我们训练好了一个crf模型，请在readme中查看。

In [None]:
print('4. Training CRF model ...')

# Hyper-parameters handed to the trainer. An additional 'feature.minfreq': 3
# entry was tried at some point and is intentionally left disabled.
crf_params = {
    'c1': 1e-3,                            # L1 penalty coefficient
    'c2': 1,                               # L2 penalty coefficient
    'max_iterations': 500,                 # cap on iterations (stop earlier)
    'feature.possible_transitions': True,  # also model possible-but-unobserved transitions
}
# Output file for the trained model; the name encodes the tokenisation level.
model_path = './model/tianchi_ner_' + level + '_level.crfsuite'

model = pycrfsuite.Trainer(verbose=True)
# Feed the trainer one (feature sequence, label sequence) pair per sentence.
for xseq, yseq in zip(x_train, y_train):
    model.append(xseq, yseq)
model.set_params(crf_params)
model.train(model_path)

# The training data is no longer needed after the model has been written.
del x_train, y_train

### Step 5. 生成测试结果
这里使用我们训练好的模型展示结果，可以在tagger.open()重新指定模型目录

In [6]:
# Step 5: tag every test csv with the pre-trained CRF model and write one
# '.ann' file per input into './data/submit/'.
print('5. Generating test result ...')
tagger = pycrfsuite.Tagger()
tagger.open('./model/tianchi_ner_char_level_1000.crfsuite')

directory = './data/' + level + '_level_test_set/'
filenames = os.listdir(directory)
for filename in filenames:
    if not filename.endswith('.csv'):
        continue

    # Rebuild the token sequence of this document from its csv rows.
    sent = []
    df = pd.read_csv(directory + filename, skip_blank_lines=False)
    for index, row in df.iterrows():
        if level == 'word':
            sent.append((row['seq'], row['pos']))
        elif level == 'char':
            sent.append((row['char']))
    x_test = sent2features(sent, level)
    y_test = tagger.tag(x_test)

    # The context manager guarantees the output file is closed even if
    # write_format (or the indexing below) raises part-way through.
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm that this matches what the submission format expects.
    with codecs.open('./data/submit/' + filename[:-4] + '.ann', 'w') as f:
        entity_id = 0                # running annotation id within this file
        start_end_pairs = [[0, 0]]   # [start, end] offset pairs of the current entity
        flag = False                 # True while an entity is open
        entity = ''                  # text of the current entity, built token by token
        for i in range(len(y_test)):
            tag = y_test[i]
            token = str(sent[i][0])
            if tag != 'O':
                prefix = tag[:1]
                if prefix == 'B' and not flag:
                    # A new entity starts here: begin a fresh span at the
                    # current offset and forget spans of earlier entities.
                    entity = token
                    flag = True
                    start_end_pairs[-1][0] = start_end_pairs[-1][1]
                    del start_end_pairs[:-1]
                elif prefix == 'B' and flag:
                    # The previous entity ended on the preceding token and a
                    # new one starts immediately: flush the old one first.
                    entity_id += 1
                    write_format(f, entity_id, y_test[i - 1][2:], start_end_pairs, entity)
                    start_end_pairs[-1][0] = start_end_pairs[-1][1]
                    del start_end_pairs[:-1]
                    entity = token
                elif prefix == 'I':
                    # The current token continues the same entity.
                    if token == '\n':
                        # A line break inside an entity becomes a space in
                        # the text and opens a new offset fragment after it.
                        entity += ' '
                        start_end_pairs.append([start_end_pairs[-1][1] + 1, start_end_pairs[-1][1]])
                    else:
                        entity += token
            elif flag:
                # First 'O' after an entity: the entity ended on the previous token.
                flag = False
                entity_id += 1
                write_format(f, entity_id, y_test[i - 1][2:], start_end_pairs, entity)
            # Advance the running end offset past the current token.
            start_end_pairs[-1][1] += len(token)

        if flag:
            # The document ended while an entity was still open. Without this
            # flush an entity reaching the last token would be silently lost.
            entity_id += 1
            write_format(f, entity_id, y_test[-1][2:], start_end_pairs, entity)

5. Generating test result ...
