# VERSION2 
## 结巴分词+词性标注+CRF++模型训练进行实体识别

### 参考网址
https://taku910.github.io/crfpp/#install (官网)<br>
https://zhuanlan.zhihu.com/p/27955621<br>
http://www.hankcs.com/nlp/the-crf-model-format-description.html<br>
http://f.dataguru.cn/thread-866700-1-1.html<br>

## 1. 加载模块和数据

In [108]:
import os
import re
from pprint import pprint
import numpy as np
import pandas as pd
import time
import jieba
import jieba.analyse
import jieba.posseg as pesg

In [109]:
# Initialise jieba explicitly instead of waiting for its lazy load on first use.
jieba.initialize()
# Register the custom vocabulary stored in the local file user_dict1.txt.
jieba.load_userdict('user_dict1.txt')

In [110]:
# Read the training file as a single column by splitting on newlines only.
# NOTE(review): newer pandas releases appear to reject sep='\n' with a
# ValueError -- confirm against the pandas version in use.
# NOTE(review): with read_csv's default header handling the first line of the
# file is consumed as the header -- verify the file really starts with one.
data = pd.read_csv("bz_record_user_q_train.csv",sep='\n')
# Give the single column a stable name so it can be used as data.msg_content.
data.columns = ['msg_content']

In [111]:
# Coerce every message to str so the str methods used by ff (strip/replace)
# work on every row.
data.msg_content = data.msg_content.astype(str)

In [115]:
# data.msg_from = data.msg_from.astype(str)
# data2 = data[data.msg_from.apply(len)==10].msg_content
# data2.head()

## 2. 数据处理成模型需要的格式

In [114]:
def merge_noun(a):
    """Collapse each run of consecutive noun tokens into a single token.

    Args:
        a: Sequence of objects exposing ``word`` and ``flag`` attributes
            (for example the pairs returned by ``jieba.posseg.lcut``).

    Returns:
        A list of ``(word, flag)`` tuples in input order. A run of two or
        more adjacent tokens whose flags start with ``'n'`` is emitted as one
        tuple holding the concatenated word and the generic flag ``'n'``.
        Every other token (including an isolated noun) is emitted unchanged
        with its own flag.
    """
    merged = []
    total = len(a)
    start = 0
    while start < total:
        stop = start + 1
        # startswith() instead of flag[0]: an empty flag must not raise
        # IndexError, it is simply treated as "not a noun".
        if a[start].flag.startswith('n'):
            while stop < total and a[stop].flag.startswith('n'):
                stop += 1
        if stop - start > 1:
            # Several nouns in a row: glue them together under the plain 'n' flag.
            merged.append((''.join(a[k].word for k in range(start, stop)), 'n'))
        else:
            merged.append((a[start].word, a[start].flag))
        start = stop
    return merged

In [115]:
def tokenize(b):
    """Explode (word, flag) pairs into per-character training rows.

    Args:
        b: Iterable of sequences whose first item is the word and whose
            second item is its POS flag.

    Returns:
        A list of ``(char, position + flag, label)`` tuples. The position is
        ``S`` for a one-character word, otherwise ``B``/``M``/``E``. The label
        is ``S-N``/``B-N``/``M-N``/``E-N`` when the flag starts with ``'n'``
        and ``'O'`` otherwise. Empty words contribute no rows.
    """
    rows = []
    for item in b:
        text, pos = item[0], item[1]
        is_noun = pos.startswith('n')
        length = len(text)

        if length == 1:
            rows.append((text, 'S' + pos, 'S-N' if is_noun else 'O'))
            continue

        pos_tags = ['B' + pos] + ['M' + pos] * (length - 2) + ['E' + pos]
        if is_noun:
            labels = ['B-N'] + ['M-N'] * (length - 2) + ['E-N']
        else:
            labels = ['O'] * length
        # zip stops at the shortest input, so an empty word yields nothing.
        rows.extend(zip(text, pos_tags, labels))
    return rows

In [116]:
def ff(s):
    """Turn one raw message into labelled per-character training rows.

    The text is stripped, spaces are replaced by the full-width comma and
    newlines are removed (presumably because the training file is later
    written space-separated -- confirm). It is then POS-tagged with jieba,
    adjacent nouns are merged and the result is expanded by ``tokenize``.

    Args:
        s: The message text.

    Returns:
        The list of ``(char, position + flag, label)`` tuples from ``tokenize``.
    """
    cleaned = s.strip()
    cleaned = cleaned.replace(' ','，')
    cleaned = cleaned.replace('\n','')
    tagged_words = pesg.lcut(cleaned)
    return tokenize(merge_noun(tagged_words))

In [260]:
# Quick check of the whole preprocessing pipeline on a single sentence.
ff('问下易用宝好不好用？')

[('问', 'Bz', 'O'),
 ('下', 'Ez', 'O'),
 ('易', 'Bv', 'O'),
 ('用', 'Ev', 'O'),
 ('宝', 'Snr', 'S-N'),
 ('好', 'Bl', 'O'),
 ('不', 'Ml', 'O'),
 ('好', 'El', 'O'),
 ('用', 'Sp', 'O'),
 ('？', 'Sx', 'O')]

In [13]:
def ner_postag(df_series,func):
    """Tag every message and flatten the result into one list of token rows.

    Args:
        df_series: Object with an ``apply`` method (a pandas Series of
            message strings in this notebook).
        func: Callable mapping one message to a list of token tuples whose
            first item is the character.

    Returns:
        A flat list of token tuples. Each non-empty sentence is terminated
        with ``('。','Sx','O')`` unless it already ends in sentence
        punctuation, and is followed by an empty tuple ``()`` acting as the
        sentence separator. Messages that produce no tokens are skipped.

    Side effects:
        Prints the elapsed time of the apply step and of the flatten step.
    """
    sentence_enders = ('。','？','！','.','?','!')

    t1 = time.time()
    tagged = df_series.apply(func)
    print('df apply time : ' + str(time.time() - t1))

    t2 = time.time()
    rows = []
    for sentence in tagged:
        if len(sentence) == 0:
            # A message with no tokens used to crash on sentence[-1];
            # there is nothing to emit for it, so skip it.
            continue
        rows.extend(sentence)
        if sentence[-1][0] not in sentence_enders:
            rows.append(('。','Sx','O'))
        rows.append(())
    print('l append time : ' + str(time.time() - t2))
    return rows

In [11]:
# Tag the whole corpus; the recorded run spent roughly 8400 s in the apply step.
data_ner = ner_postag(data.msg_content,ff)

df apply time : 8383.529587745667
l append time : 36.72373938560486


In [14]:
# One DataFrame row per token tuple; the empty tuples that ner_postag inserts
# between sentences presumably become all-missing rows -- verify.
b = pd.DataFrame(data_ner)
# Columns hold: character, position+POS tag, entity label.
b.columns = ['a','b','c']

In [238]:
# Write the rows space-separated, without header or index.
# NOTE(review): the crf_learn commands in this notebook read train_all.data,
# not train_all.csv -- confirm the file is renamed/converted before training.
b.to_csv('train_all.csv',encoding='utf-8',header=None,index=None,sep=' ')

## 3. 跑模型（这个过程在linux命令行中完成）

```shell
..\..\crf_learn  -f 3 -c 4.0 template train.data model
```

可选参数：
```

-f, --freq=INT              使用属性的出现次数不少于INT (默认为1)

-m, --maxiter=INT           设置INT为LBFGS的最大迭代次数 (默认10k)

-c, --cost=FLOAT            设置FLOAT为代价参数，过大会过度拟合 (默认1.0)

-e, --eta=FLOAT             设置终止标准FLOAT (默认0.0001)

-C, --convert               将文本模式转为二进制模式

-t, --textmodel             为调试建立文本模型文件

-a, --algorithm=(CRF|MIRA)

选择训练算法，默认为CRF-L2

-p, --thread=INT            线程数 (默认1)，利用多个CPU减少训练时间

-H, --shrinking-size=INT

设置INT为最适宜的迭代变量次数 (默认20)

-v, --version               显示版本号并退出

-h, --help                  显示帮助并退出
```

输出：
```
iter：迭代次数。当迭代次数达到maxiter时，迭代终止

terr：标记错误率

serr：句子错误率

obj：当前目标函数的值。当这个值收敛到一个确定值的时候，训练完成

diff：与上一次目标函数值之间的相对差。当此值低于eta时，训练完成
```

In [None]:
crf_learn -m 256 -e 0.0008 -c 3.0 template train_all.data model_m256_e0008_c3_0424

In [12]:
# crf_learn -m 256 -e 0.0008 -c 3.0 template train_all.data model_m256_e0008_c3_0423

## 4. 模型加载与测试（命令行加载测试或调用python接口方式）

### 4.1 命令行加载

 crf_test -m model_file test_files >> result_file

### 4.2 python接口方式
首先
```
linux环境安装CRF++及python接口
记录一下步骤：
切换root权限
./configure
make
make install
cd python
python setup.py build
python setup.py install
ln -s /usr/local/lib/libcrfpp.so* /usr/lib64/
```

In [122]:
import CRFPP

In [None]:
## 这里可以写一个try:
# except RuntimeError as e:
#    print("RuntimeError: ", e)

In [263]:
# Load the trained model. The option string is passed to CRF++ as-is; per the
# CRF++ docs -v sets the verbosity level and -n the n-best size (confirm).
tagger = CRFPP.Tagger("-m model_10000_m256_e0003_c2 -v 3 -n2")
# Reset the tagger before any tokens are added.
tagger.clear()

True

In [264]:
def tokenize2(b):
    """Explode (word, flag) pairs into "char tag" lines for the tagger.

    Args:
        b: Iterable of sequences whose first item is the word and whose
            second item is its POS flag.

    Returns:
        A list of strings ``"<char> <position><flag>"``. The position is
        ``S`` for a one-character word, otherwise ``B``/``M``/``E``. Empty
        words contribute no lines.
    """
    lines = []
    for item in b:
        text, pos = item[0], item[1]
        length = len(text)

        if length == 1:
            lines.append(text + ' ' + 'S' + pos)
            continue

        pos_tags = ['B' + pos] + ['M' + pos] * (length - 2) + ['E' + pos]
        # zip stops at the shortest input, so an empty word yields nothing.
        lines.extend(ch + ' ' + tag for ch, tag in zip(text, pos_tags))
    return lines
def input_user_q(s):
    """Convert one user question into the "char tag" lines fed to the tagger.

    The text is stripped, spaces are replaced by the full-width comma and
    newlines are removed. It is then POS-tagged with jieba, adjacent nouns
    are merged and the result is expanded by ``tokenize2``.

    Args:
        s: The question text.

    Returns:
        The list of ``"<char> <position><flag>"`` strings from ``tokenize2``.
    """
    cleaned = s.strip()
    cleaned = cleaned.replace(' ','，')
    cleaned = cleaned.replace('\n','')
    tagged_words = pesg.lcut(cleaned)
    return tokenize2(merge_noun(tagged_words))

In [265]:
# Sentence to analyse (left empty in the recorded run).
s = ''
l = input_user_q(s)
# Feed every "char tag" line to the tagger. The comprehension is used for its
# side effect; its value is what the notebook echoes as the cell output.
[tagger.add(x) for x in l]

[]

In [266]:
# Decode, then show column 0 of every token followed by the predicted labels.
tagger.parse()
print([tagger.x(i,0) for i in range(tagger.size())])
print([tagger.y2(i) for i in range(tagger.size())])

[]
[]


In [267]:
def ner_f(st,tagger,include_single=False):
    """Collect the entity strings from a parsed tagger.

    Characters labelled ``B-N``/``M-N`` are accumulated and the entity is
    emitted when ``E-N`` is reached. A new ``B-N`` or any non-entity label
    discards an unfinished fragment, so a malformed label sequence can no
    longer glue pieces of different entities together.

    Args:
        st: The original sentence; it is only printed.
        tagger: Parsed tagger exposing ``size()``, ``x(i, 0)`` and ``y2(i)``.
        include_single: When true, characters labelled ``S-N`` are reported
            as one-character entities. Defaults to False, which keeps the
            historical behaviour of ignoring them.

    Returns:
        The list of entity strings in order of appearance.

    Side effects:
        Prints the sentence, a tab line and the entity header.
    """
    entities = []
    fragment = ''
    for i in range(tagger.size()):
        label = tagger.y2(i)
        if label == 'B-N':
            # A new entity starts here: drop whatever was left unfinished.
            fragment = tagger.x(i,0)
        elif label == 'M-N':
            fragment += tagger.x(i,0)
        elif label == 'E-N':
            entities.append(fragment + tagger.x(i,0))
            fragment = ''
        else:
            if include_single and label == 'S-N':
                entities.append(tagger.x(i,0))
            # Any label outside B/M/E ends an unfinished fragment.
            fragment = ''
    print(st)
    print('\t')
    print('命名实体有：')
    return entities
# Report the entities found in the sentence that was just parsed.
ner_f(s,tagger)


	
命名实体有：


[]

In [None]:
# Diagnostic dump of the tagger: dimensions, tag set, per-token marginals and
# the n-best results.
print("column size: " , tagger.xsize())
print("token size: " , tagger.size())
print("tag size: " , tagger.ysize())

print("tagset information:")
ysize = tagger.ysize()
# range(ysize), not range(ysize - 1): the latter silently skipped the last tag.
for i in range(ysize):
    print("tag " , i , " " , tagger.yname(i))

# parse and change internal state to 'parsed'
tagger.parse()

print("conditional prob=" , tagger.prob(), " log(Z)=" , tagger.Z())
print('\t')

size = tagger.size()
xsize = tagger.xsize()
for i in range(size):
    # One output line per token. end=" " replaces the Python 2 trailing-comma
    # idiom, which is a no-op in Python 3 and broke every row across lines.
    for j in range(xsize):
        print(tagger.x(i, j), "\t", end=" ")
    print(i, "\t", end=" ")
    print(tagger.y2(i), "\t", end=" ")
    print("Details", end=" ")
    # Cover every tag, including the last one.
    for j in range(ysize):
        print("\t", tagger.yname(j), "/prob=", tagger.prob(i, j), "/alpha=", tagger.alpha(i, j), "/beta=", tagger.beta(i, j), end=" ")
    print()

print("nbest outputs:")
for n in range(0, 9):
    # next() advances to the following n-best result; skip when none is left.
    if not tagger.next():
        continue
    print("nbest n=", n, "\tconditional prob=", tagger.prob())
    # you can access any information using tagger.y()...
print("Done!")