## 英文 NER
### 參考資料
- https://segmentfault.com/a/1190000017015436
- https://medium.com/%E6%88%91%E5%B0%B1%E5%95%8F%E4%B8%80%E5%8F%A5-%E6%80%8E%E9%BA%BC%E5%AF%AB/named-entity-recognition-%E5%91%BD%E5%90%8D%E5%AF%A6%E9%AB%94%E8%AD%98%E5%88%A5-309e97823a43

In [34]:
# 正則表達處理套件
import re

import pandas as pd
import numpy as np
# Natural Language Tool Kit
import nltk


def parse_document(document):
    """Split a text document into a list of stripped sentences.

    Newlines are replaced with spaces so that hard line wraps do not
    affect sentence boundary detection, then the text is split with
    ``nltk.sent_tokenize``.

    Args:
        document: The raw text to split. Must be a ``str``.

    Returns:
        A list of sentences, each with surrounding whitespace removed.

    Raises:
        ValueError: If ``document`` is not a string.
    """
    # Validate before touching the value: running the substitution first
    # would raise TypeError on non-strings and hide this error.
    if not isinstance(document, str):
        raise ValueError('Document is not string!')
    # Collapse line breaks, then strip leading/trailing whitespace.
    document = document.replace('\n', ' ').strip()
    # Sentence segmentation.
    sentences = nltk.sent_tokenize(document)
    return [sentence.strip() for sentence in sentences]

# sample document
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""
# Split the document into sentences, then each sentence into word tokens.
sentences = parse_document(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
# POS-tag every sentence and run NLTK's named-entity chunker on the tags.
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]

# Collect (entity name, entity type) pairs from every chunked sentence.
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # Only named-entity chunks expose ``label``; the other children are
        # skipped (presumably plain (word, tag) tuples - see nltk.ne_chunk).
        if hasattr(tagged_tree, 'label'):
            # Join the words of the chunk to form the entity name.
            entity_name = ' '.join(c[0] for c in tagged_tree.leaves())
            # The chunk label is the entity category.
            entity_type = tagged_tree.label()
            named_entities.append((entity_name, entity_type))

# Deduplicate once, after collection, instead of rebuilding the list on every
# match. dict.fromkeys keeps first-seen order (dicts are insertion-ordered on
# Python 3.7+), so the result is stable across runs, unlike list(set(...)).
named_entities = list(dict.fromkeys(named_entities))

# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
# display results
print(entity_frame)

        Entity Name   Entity Type
0            Africa        PERSON
1   Central America  ORGANIZATION
2              Asia           GPE
3            Zürich           GPE
4             North           GPE
5     South America           GPE
6              FIFA  ORGANIZATION
7             Spain           GPE
8           Denmark           GPE
9       Switzerland           GPE
10           Sweden           GPE
11           France           GPE
12           Europe           GPE
13          Oceania           GPE
14      Netherlands           GPE
15          Belgium           GPE
16        Caribbean      LOCATION
17          Germany           GPE
