<a href="https://colab.research.google.com/github/bjh5098/Social-Network-Analysis-and-Text-Mining/blob/master/Textmining_5_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# spaCy를 활용한 NER

## NER 기초

In [0]:
import spacy

In [0]:
# Example sentence that the following cells run through the spaCy pipeline.
sentence = 'Rome is the capital of Italy'

In [0]:
# Load the small English pipeline by its package name; the 'en' shortcut link
# only exists in spaCy v2 and is rejected by spaCy v3.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Run the pipeline on the sentence; the recognized entities are read from
# doc.ents in the next cell.
doc = nlp(sentence)

In [0]:
# List every recognized entity with its character span and label.
for entity in doc.ents:
    print(
        entity.text,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )

In [0]:
from spacy import displacy
# Render the document's entities with displaCy's 'ent' style in notebook mode.
displacy.render(doc, style='ent', jupyter=True)

## NER 태깅

In [0]:
import spacy
# Load the small English pipeline by its package name; the 'en' shortcut link
# only exists in spaCy v2 and is rejected by spaCy v3.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Process four example sentences; sent_N holds the pipeline output for the
# N-th sentence, in the order listed.
sent_0, sent_1, sent_2, sent_3 = [
    nlp(raw_text)
    for raw_text in (
        'Donald Trump visited at the government headquarters in France today.',
        'Emmanuel Jean-Michel Frédéric Macron is a French politician serving as President of France and ex officio Co-Prince of Andorra since 14 May 2017.',
        'He studied philosophy at Paris Nanterre University, completed a Master’s of Public Affairs at Sciences Po, and graduated from the École nationale d\'administration (ÉNA) in 2004. ',
        'He worked at the Inspectorate General of Finances, and later became an investment banker at Rothschild & Cie Banque.',
    )
]

### ***sent_0***

In [0]:
# Show each token of sent_0 next to the entity type assigned to it.
for tok in sent_0:
    print(tok.text, tok.ent_type_)

In [0]:
# Show each entity span found in sent_0 together with its label.
for span in sent_0.ents:
    print(span.text, span.label_)

### ***sent_1***

In [0]:
# Show each token of sent_1 next to the entity type assigned to it.
for tok in sent_1:
    print(tok.text, tok.ent_type_)

In [0]:
# Show each entity span found in sent_1 together with its label.
for span in sent_1.ents:
    print(span.text, span.label_)

### ***sent_2***

In [0]:
# Show each token of sent_2 next to the entity type assigned to it.
for tok in sent_2:
    print(tok.text, tok.ent_type_)

In [0]:
# Show each entity span found in sent_2 together with its label.
for span in sent_2.ents:
    print(span.text, span.label_)

### ***sent_3***

In [0]:
# Show each token of sent_3 next to the entity type assigned to it.
for tok in sent_3:
    print(tok.text, tok.ent_type_)

In [0]:
# Show each entity span found in sent_3 together with its label.
for span in sent_3.ents:
    print(span.text, span.label_)

## 개체명 인식기 학습

In [0]:
import random
from pathlib import Path
import spacy

In [0]:
# Training examples for the entity recognizer. Each item is a
# (text, annotations) pair; annotations['entities'] lists
# (start_offset, end_offset, label) character spans within the text.
TRAIN_DATA = [
    (
        'Who is Shaka Khan?',
        {'entities': [(7, 17, 'PERSON')]},
    ),
    (
        'I like London and Berlin.',
        {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]},
    ),
]

In [0]:
def main(model=None, output_dir='/content', n_iter=100):
    """Train the entity recognizer on TRAIN_DATA and optionally save the pipeline.

    Uses the spaCy v2 training API (``create_pipe``, ``begin_training``,
    ``update(texts, annotations, ...)``).

    Args:
        model: Name or path passed to ``spacy.load``. When None, a blank
            English pipeline is created instead.
        output_dir: Directory the trained pipeline is saved to. Pass None to
            skip saving.
        n_iter: Number of passes over the training examples.
    """
    if model is not None:
        nlp = spacy.load(model)  # continue from an existing pipeline
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # start from an empty English pipeline
        print("Created blank 'en' model")

    # Reuse the pipeline's entity recognizer if it has one; otherwise create
    # the built-in 'ner' component and append it.
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label used in the training annotations. Examples without
    # an 'entities' key contribute no labels instead of raising.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order.
    train_data = list(TRAIN_DATA)

    # Disable every other component so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        if ner_is_new:
            # A freshly created recognizer needs its weights initialized.
            optimizer = nlp.begin_training()
        else:
            # begin_training() would re-initialize an already trained
            # recognizer and throw away its weights. With sgd=None, spaCy v2's
            # nlp.update falls back to its own default optimizer.
            optimizer = None
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts (one example per update)
                    [annotations],  # matching batch of annotations
                    drop=0.5,  # dropout rate
                    sgd=optimizer,
                    losses=losses,
                )
            print(losses)

    # Sanity check: run the trained pipeline over the training texts.
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Save the trained pipeline.
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parent directories and tolerate an existing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [0]:
# Train with the defaults: blank English model, 100 iterations, saved to /content.
main()

In [0]:
# Load the saved entity-recognition pipeline back from disk and run it over
# the training texts.
saved_model_dir = '/content'
print("Loading from", saved_model_dir)
nlp2 = spacy.load(saved_model_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    found_entities = [(span.text, span.label_) for span in doc.ents]
    print('Entities', found_entities)
    token_tags = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print('Tokens', token_tags)

## 개체명 인식기 예제 및 시각화

### Elon Musk

In [0]:
# News paragraph about Elon Musk used as input for the entity-recognition example.
Elon = "Elon Musk apparently wasn’t aware that his company SpaceX had a Facebook page. The SpaceX and Tesla CEO has responded to a comment on Twitter calling for him to take down the SpaceX, Tesla and Elon Musk official pages in support of the #deletefacebook movement by first acknowledging he didn’t know one existed, and then following up with promises that he would indeed take them down. He’s done just that, as the SpaceX Facebook page is now gone, after having been live earlier today (as you can see from the screenshot included taken at around 12:10 PM ET)."

In [0]:
# Echo the raw text as the cell output.
Elon

In [0]:
import spacy
# Load the small English pipeline by its package name; the 'en' shortcut link
# only exists in spaCy v2 and is rejected by spaCy v3.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Run the pipeline over the news paragraph.
doc = nlp(Elon)

In [0]:
# Report each entity as: text, start offset, end offset, label.
for span in doc.ents:
    begin, end = span.start_char, span.end_char
    print(span.text, begin, end, span.label_)

In [0]:
from spacy import displacy
# Render the document's entities with displaCy's 'ent' style in notebook mode.
displacy.render(doc, style='ent', jupyter=True)

### Wiki - Band

In [0]:
# Read the whole text file as UTF-8; the context manager closes the file
# handle as soon as the content has been read.
with open("wiki_band.txt", encoding='utf-8') as wiki_file:
    wiki = wiki_file.read()

In [0]:
# Echo the loaded text as the cell output.
wiki

In [0]:
# Run the pipeline over the text read from wiki_band.txt.
doc = nlp(wiki)

In [0]:
# Walk the recognized entities and print where each one sits in the text.
for found in doc.ents:
    print(found.text, found.start_char, found.end_char, found.label_)

In [0]:
from spacy import displacy
# Render the document's entities with displaCy's 'ent' style in notebook mode.
displacy.render(doc, style='ent', jupyter=True)

# NLTK를 활용한 NER

In [0]:
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

In [0]:
# Fetch the NLTK data packages that the tokenizer, POS tagger and named-entity
# chunker used in this section rely on.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [0]:
# Tokenize, POS-tag and NE-chunk one sentence, then flatten the resulting
# tree into CoNLL-style tags and print them.
sentence = "Clement and Mathieu are working at Apple."
word_tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(word_tokens)
ne_tree = ne_chunk(tagged_tokens)

iob_tagged = tree2conlltags(ne_tree)
print(iob_tagged)

In [0]:
# Rebuild a tree from the CoNLL-style tags produced above and print it.
ne_tree = conlltags2tree(iob_tagged)
print(ne_tree)

In [0]:
from nltk.tag import StanfordNERTagger
# Build the Stanford NER tagger from the classifier file and the jar that are
# expected under /content.
stanford_classifier = '/content/english.all.3class.distsim.crf.ser.gz'
stanford_jar = '/content/stanford-ner.jar'
st = StanfordNERTagger(stanford_classifier, stanford_jar, encoding='utf-8')

In [0]:
# Tag a whitespace-split sentence with the Stanford tagger; the cell echoes
# the result.
st.tag('Baptiste Capdeville is studying at Columbia University in NY'.split())