In [1]:
""" 命名实体识别（Named Entity Recognition，简称NER）是信息提取、问答系统、句法分析、机器翻译等应用领域的重要基础工具，在自然语言处理技术走向实用化的过程中占有重要地位。
一般来说，命名实体识别的任务就是识别出待处理文本中三大类（实体类、时间类和数字类）、七小类（人名、机构名、地名、时间、日期、货币和百分比）命名实体。 """


' 命名实体识别（Named Entity Recognition，简称NER）是信息提取、问答系统、句法分析、机器翻译等应用领域的重要基础工具，在自然语言处理技术走向实用化的过程中占有重要地位。\n一般来说，命名实体识别的任务就是识别出待处理文本中三大类（实体类、时间类和数字类）、七小类（人名、机构名、地名、时间、日期、货币和百分比）命名实体。 '

In [8]:
# 用nltk实现NER
import re
import pandas as pd 
import nltk

# 将文章拆分为字符
def parse_document(document):
    document=re.sub('\n',' ',document)      #用空格代替换行
    if isinstance(document,str):
        # 判断document是否为str类型
        document=document
    else:
        raise ValueError('Document is not string!')
    # strip() 方法用于移除字符串头尾指定的字符（默认为空格）或字符序列
    document =document.strip()
    sentences=nltk.sent_tokenize(document)
    sentences=[sentence.strip() for sentence in sentences]
    return sentences

# sample document
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""

# tokenize sentences
sentences = parse_document(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
# tag sentences and use nltk's Named Entity Chunker
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]



In [10]:

# extract all named entities
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:
   for tagged_tree in ne_tagged_sentence:
       # extract only chunks having NE labels
       if hasattr(tagged_tree, 'label'):
           entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) #get NE name
           entity_type = tagged_tree.label() # get NE category
           named_entities.append((entity_name, entity_type))
           # get unique named entities
           named_entities = list(set(named_entities))

# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
# display results
print(entity_frame)

Entity Name   Entity Type
0       Switzerland           GPE
1             North           GPE
2            France           GPE
3             Spain           GPE
4            Sweden           GPE
5           Denmark           GPE
6           Belgium           GPE
7           Germany           GPE
8   Central America  ORGANIZATION
9              Asia           GPE
10      Netherlands           GPE
11          Oceania           GPE
12           Zürich           GPE
13             FIFA  ORGANIZATION
14           Africa        PERSON
15        Caribbean      LOCATION
16    South America           GPE
17           Europe           GPE


In [1]:
""" 用Stanford NLP工具实现NER """
import re
from nltk.tag import StanfordNERTagger
import os
import pandas as pd
import nltk

# 将文章拆分为字符
def parse_document(document):
    document=re.sub('\n',' ',document)      #用空格代替换行
    if isinstance(document,str):
        # 判断document是否为str类型
        document=document
    else:
        raise ValueError('Document is not string!')
    # strip() 方法用于移除字符串头尾指定的字符（默认为空格）或字符序列
    document =document.strip()
    sentences=nltk.sent_tokenize(document)
    sentences=[sentence.strip() for sentence in sentences]
    return sentences

# sample document
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""


In [6]:
sentences = parse_document(text)
tokenized_sentences=[nltk.word_tokenize(sentence) for sentence in sentences]


# load stanford NER
sn = StanfordNERTagger('D://Files/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser.gz',
                       path_to_jar='D://Files/stanford-ner-4.0.0/stanford-ner.jar')

# tag sentences
ne_annotated_sentences = [sn.tag(sent) for sent in tokenized_sentences]
# print(ne_annotated_sentences)
# extract named entities
named_entities=[]
for sentence in ne_annotated_sentences:
    temp_entity_name=''
    temp_named_entity=None
    for term,tag in sentence:
        # get terms with NE tags
        if tag!='O':
            temp_entity_name=' '.join([temp_entity_name,term]).strip()  #get NE name
            temp_named_entity=(temp_entity_name,tag)    # get NE and its category
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name=''
                temp_named_entity=None

# get unique named entities
named_entities = list(set(named_entities))
# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
# display results
print(entity_frame)

Entity Name   Entity Type
0                    Sweden      LOCATION
1                    Zürich      LOCATION
2                      Asia      LOCATION
3             South America      LOCATION
4                    France      LOCATION
5                   Germany      LOCATION
6                   Belgium      LOCATION
7                      1904          DATE
8                   Oceania      LOCATION
9           the Netherlands      LOCATION
10                     FIFA  ORGANIZATION
11                   Africa      LOCATION
12  North & Central America  ORGANIZATION
13                Caribbean      LOCATION
14              Switzerland      LOCATION
15                    Spain      LOCATION
16                  Denmark      LOCATION
17                   Europe      LOCATION


In [None]:
""" 可以看到，在Stanford NER的帮助下，NER的实现效果较好，将Africa识别为LOCATION，将1904识别为时间（这在NLTK中没有识别出来），但还是对North & Central America识别有误，将其识别为ORGANIZATION。
  值得注意的是，并不是说Stanford NER一定会比NLTK NER的效果好，两者针对的对象，预料，算法可能有差异，因此，需要根据自己的需求决定使用什么工具。

作者：山阴少年
链接：https://www.jianshu.com/p/16e1f6a7aaef
来源：简书
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。 """