In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the NER training table; later cells read its "Sentence: #", "Word" and "Tag" columns.
data = pd.read_csv('ner_data_v1.csv')

In [4]:
def func(temp):
    """Return the (word, tag) pairs of one sentence group, in row order."""
    words = temp["Word"].values.tolist()
    tags = temp["Tag"].values.tolist()
    return list(zip(words, tags))


# One list of (word, tag) tuples per distinct "Sentence: #" value.
tagged_sentences = list(data.groupby("Sentence: #").apply(func))
print("전체 샘플 개수: {}".format(len(tagged_sentences)))

전체 샘플 개수: 1767380


In [5]:
# Split the first 22000 tagged sentences into two parallel lists:
# the words of each sentence and the tags of each sentence.
sentences = []
ner_tags = []
for tagged_sentence in tagged_sentences[:22000]:
    # zip(*pairs) transposes [(word, tag), ...] into (words...), (tags...)
    sentence, tag_info = map(list, zip(*tagged_sentence))
    sentences.append(sentence)
    ner_tags.append(tag_info)

In [6]:
# Word tokenizer: case is preserved and 'OOV' is registered as the out-of-vocabulary token.
# NOTE(review): num_words=5000 appears to cap only what texts_to_sequences emits (rarer
# words come out as 'OOV'); word_index itself still lists every word seen during fitting.
src_tokenizer = Tokenizer(num_words = 5000, oov_token='OOV', lower=False)
src_tokenizer.fit_on_texts(sentences)

# Tag tokenizer: no vocabulary cap, so every distinct tag gets its own index.
tar_tokenizer = Tokenizer(lower=False) 
tar_tokenizer.fit_on_texts(ner_tags)

In [7]:
# Sizes used for the embedding input and the CRF output; +1 accounts for index 0,
# which the tokenizers do not assign and which is used as the padding value.
# NOTE(review): vocab_size counts every fitted word, not the num_words cap given to
# src_tokenizer, so the embedding is much larger than the set of indices seen in training.
vocab_size = len(src_tokenizer.word_index) + 1
tag_size = len(tar_tokenizer.word_index) + 1

print('단어 집합의 크기 : {}'.format(vocab_size))
print('개체명 태깅 정보 집합의 크기 : {}'.format(tag_size))

단어 집합의 크기 : 136999
개체명 태깅 정보 집합의 크기 : 6


In [8]:
# Display the tag -> index mapping learned by the tag tokenizer.
tar_tokenizer.word_index

{'O': 1, '인물': 2, '기관/집단': 3, '장소': 4, '정당': 5}

In [9]:
# Encode words and tags as integer sequences with their respective tokenizers.
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

In [10]:
# Reverse lookups (index -> token) used for decoding.
index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word
# index_to_ner is the tokenizer's own dict, not a copy, so this assignment also adds
# 0 -> 'PAD' to tar_tokenizer.index_word (and to the tokenizer pickled later on).
index_to_ner[0]='PAD'

In [11]:
# Show the index -> tag mapping, including the 'PAD' entry added for index 0.
print(index_to_ner)

{1: 'O', 2: '인물', 3: '기관/집단', 4: '장소', 5: '정당', 0: 'PAD'}


In [12]:
# Sanity check: decode the first encoded sentence back to tokens so it can be
# compared with the original sentence.
decoded = [index_to_word[index] for index in X_train[0]]

print('기존의 문장 : {}'.format(sentences[0]))
print('디코딩 문장 : {}'.format(decoded))

기존의 문장 : ['[', '국회', '초선이', '바꾼다', ']', '⑥', '미래통합당', '김웅', '(', '서울', '송파갑', ')', '김종인', '비대위', '전환', '불가피', '판단', '“', '한국당엔', '정공법으로', '”', '합당', '지지', '“', '청년이', '주인', '되는', '정당', '만들고파', '”', '김웅', '미래통합당', '당선인']
디코딩 문장 : ['[', '국회', 'OOV', 'OOV', ']', 'OOV', 'OOV', 'OOV', '(', '서울', 'OOV', ')', '김종인', 'OOV', 'OOV', 'OOV', '판단', '“', 'OOV', 'OOV', '”', 'OOV', '지지', '“', 'OOV', 'OOV', '되는', '정당', 'OOV', '”', 'OOV', 'OOV', 'OOV']


In [13]:
# Bring every word/tag sequence to exactly max_len entries; padding is appended
# at the end ('post').
max_len = 128
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)

In [14]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle.
# X_train / y_train are rebound to the training portion, so re-running this cell
# would split the already-split data again.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=777)

In [15]:
# One-hot encode the tag indices into tag_size classes.
# y_train / y_test are rebound, so re-running this cell would encode the already
# one-hot arrays a second time.
y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)

In [16]:
# Report the shapes of the train/test inputs and their one-hot label tensors.
print('훈련 샘플 문장의 크기 : {}'.format(X_train.shape))
print('훈련 샘플 레이블의 크기 : {}'.format(y_train.shape))
print('테스트 샘플 문장의 크기 : {}'.format(X_test.shape))
print('테스트 샘플 레이블의 크기 : {}'.format(y_test.shape))

훈련 샘플 문장의 크기 : (41600, 128)
훈련 샘플 레이블의 크기 : (41600, 128, 6)
테스트 샘플 문장의 크기 : (10400, 128)
테스트 샘플 레이블의 크기 : (10400, 128, 6)


In [17]:
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

Using TensorFlow backend.


In [None]:
# BiLSTM-CRF tagger: embedding (index 0 masked) -> bidirectional LSTM ->
# per-timestep dense layer -> CRF over tag_size labels.
# `crf` stays a module-level name because its loss and accuracy are needed at compile time.
crf = CRF(tag_size)
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=20, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(50, activation="relu")),
    crf,
])

In [None]:
# The model has to be compiled before fit() can run; on a fresh kernel the previously
# commented-out compile call made this cell fail. With a CRF output layer, both the
# loss and the accuracy metric are taken from the layer itself.
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
# 10% of the training data is held back for per-epoch validation.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Train on 37440 samples, validate on 4160 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [30]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def sequences_to_tag(sequences, index_to_tag=None):
    """Convert per-token score vectors into tag strings.

    Args:
        sequences: iterable of sequences; each element of a sequence is a score or
            one-hot vector over the tag indices.
        index_to_tag: optional mapping from tag index to tag string. When omitted,
            the notebook-level ``index_to_ner`` mapping is used.

    Returns:
        A list with one list of tag strings per input sequence. The text 'PAD'
        is replaced by 'O' in every emitted tag.
    """
    if index_to_tag is None:
        index_to_tag = index_to_ner
    result = []
    for sequence in sequences:  # one sentence at a time
        tags = []
        for pred in sequence:  # one token's score vector at a time
            # e.g. [0, 0, 1, 0, 0] -> index 2
            pred_index = int(np.argmax(pred))
            # padding positions are reported as 'O'
            tags.append(index_to_tag[pred_index].replace("PAD", "O"))
        result.append(tags)
    return result

# Predict on the held-out set, then decode both the predictions and the one-hot
# gold labels into tag strings.
y_predicted = model.predict(X_test)
pred_tags = sequences_to_tag(y_predicted)
test_tags = sequences_to_tag(y_test)

# Report computed by seqeval over the decoded tag sequences.
print(classification_report(test_tags, pred_tags))

           precision    recall  f1-score   support

       인물       0.95      0.94      0.95      5962
       장소       0.96      0.83      0.89      1661
    기관/집단       0.90      0.84      0.87      1987
       정당       1.00      0.98      0.99       579

micro avg       0.95      0.90      0.93     10189
macro avg       0.95      0.90      0.92     10189



In [31]:
from bs4 import BeautifulSoup
from requests import get

# Fetch the Naver news search result page for the query and collect the article links.
url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query=윤미향+위안부'
# A timeout keeps the cell from hanging on a stalled connection, and raise_for_status
# surfaces HTTP errors instead of silently parsing an error page into an empty url_list.
response = get(url, timeout=10)
response.raise_for_status()

url_list = []
html_soup = BeautifulSoup(response.text, 'html.parser')
# Match on the class token itself; the previous value carried a leading space.
# NOTE(review): the selector depends on Naver's markup at the time of writing - confirm it still matches.
for a in html_soup.find_all('a', class_='_sp_each_title'):
    href = a.get('href')
    if href:  # skip anchors that carry no link instead of raising KeyError
        url_list.append(href)

In [32]:
from sklearn.cluster import KMeans
from newspaper import Article
from nltk.tokenize import word_tokenize

# Download each search hit and break its body text into rough sentence fragments.
news = []
for url in url_list:
    article = Article(url)
    article.download()
    article.parse()

    tokenized = ' '.join(word_tokenize(article.text))
    # '?' and '!' are folded into '.' so that a single split handles all terminators
    normalized = tokenized.replace('?', '.').replace('!', '.')
    news.append(normalized.split('.'))

# Drop articles that produced at most one fragment (e.g. an empty body).
news = list(filter(lambda n: len(n) > 1, news))

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Run the tagger over every sentence of every downloaded article and print, per article,
# the distinct entity words found for each tag.
for text in news:
    entity = []

    # Iterate the article's fragments directly instead of rebinding `sentences`,
    # which earlier cells use for the training sentences.
    for sent in text:
        # pad_sequences cuts over-long sequences from the front by default (as it did for
        # the training data), so keep the LAST max_len tokens; otherwise the words and the
        # predictions zipped below would be misaligned for long sentences.
        new_sentence = sent.split()[-max_len:]
        if not new_sentence:
            continue  # nothing to tag; avoids a pointless predict call

        # Encode exactly like the training data: texts_to_sequences applies the tokenizer's
        # num_words cap and maps every other word to the OOV index. Looking words up in
        # word_index directly would feed indices the model never saw during training.
        new_X = src_tokenizer.texts_to_sequences([new_sentence])
        pad_new = pad_sequences(new_X, padding="post", value=0, maxlen=max_len)
        p = np.argmax(model.predict(pad_new), axis=-1)

        for w, pred in zip(new_sentence, p[0]):
            label = tar_tokenizer.index_word.get(pred, 'O')
            # index 0 is padding, never a real entity
            if label not in ('O', 'PAD'):
                entity.append((w, label))

    found = set(entity)
    # (tag, maximum word length or None); person names are limited to 3 characters here.
    for tag, max_chars in (('인물', 3), ('정당', None), ('기관/집단', None), ('장소', None)):
        words = sorted({
            w for w, t in found
            if t == tag and len(w) > 1 and (max_chars is None or len(w) <= max_chars)
        })
        print('{}: {}'.format(tag, ', '.join(words)))

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from newspaper import Article

# Download a single article and run the trained tagger over each of its sentences.
article = Article('https://news.chosun.com/site/data/html_dir/2020/06/12/2020061202622.html')
article.download()
article.parse()
text = article.text
# '?' and '!' are folded into '.' so that a single split yields the sentence fragments.
sentences = ' '.join(word_tokenize(text)).replace('?', '.').replace('!', '.').split('.')
entity = []

for sent in sentences:
    # split() without an argument drops the empty tokens that split(' ') produced around
    # each fragment. Keeping the LAST max_len tokens mirrors pad_sequences' default
    # front-truncation, so words and predictions stay aligned for long sentences.
    new_sentence = sent.split()[-max_len:]
    if not new_sentence:
        continue  # nothing to tag

    # Encode exactly like the training data: texts_to_sequences applies the tokenizer's
    # num_words cap and maps every other word to the OOV index. Looking words up in
    # word_index directly would feed indices the model never saw during training.
    new_X = src_tokenizer.texts_to_sequences([new_sentence])
    pad_new = pad_sequences(new_X, padding="post", value=0, maxlen=max_len)
    p = np.argmax(model.predict(pad_new), axis=-1)

    for w, pred in zip(new_sentence, p[0]):
        label = tar_tokenizer.index_word.get(pred, 'O')
        # index 0 is padding, never a real entity
        if label not in ('O', 'PAD'):
            entity.append((w, label))

print(set(entity))

In [31]:
from bs4 import BeautifulSoup
from requests import get

# Fetch the Naver news search result page for the query and keep the first four article links.
url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query=자사고+폐지'
# A timeout keeps the cell from hanging on a stalled connection, and raise_for_status
# surfaces HTTP errors instead of silently parsing an error page into an empty url_list.
response = get(url, timeout=10)
response.raise_for_status()

url_list = []
html_soup = BeautifulSoup(response.text, 'html.parser')
# Match on the class token itself; the previous value carried a leading space.
# NOTE(review): the selector depends on Naver's markup at the time of writing - confirm it still matches.
for a in html_soup.find_all('a', class_='_sp_each_title')[:4]:
    href = a.get('href')
    if href:  # skip anchors that carry no link instead of raising KeyError
        url_list.append(href)

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from newspaper import Article

# Tag every sentence of the collected articles and print the distinct entity words per
# tag, pooled over all articles.
entity = []
for url in url_list:

    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    # '?' and '!' are folded into '.' so that a single split yields the sentence fragments.
    sentences = ' '.join(word_tokenize(text)).replace('?', '.').replace('!', '.').split('.')

    for sent in sentences:
        # split() without an argument drops the empty tokens that split(' ') produced.
        # Keeping the LAST max_len tokens mirrors pad_sequences' default front-truncation,
        # so words and predictions stay aligned for long sentences.
        new_sentence = sent.split()[-max_len:]
        if not new_sentence:
            continue  # nothing to tag

        # Encode exactly like the training data: texts_to_sequences applies the tokenizer's
        # num_words cap and maps every other word to the OOV index. Looking words up in
        # word_index directly would feed indices the model never saw during training.
        new_X = src_tokenizer.texts_to_sequences([new_sentence])
        pad_new = pad_sequences(new_X, padding="post", value=0, maxlen=max_len)
        p = np.argmax(model.predict(pad_new), axis=-1)

        for w, pred in zip(new_sentence, p[0]):
            label = tar_tokenizer.index_word.get(pred, 'O')
            # index 0 is padding, never a real entity
            if label not in ('O', 'PAD'):
                entity.append((w, label))

found = set(entity)
# NOTE(review): '직책' is not among the tags shown for tar_tokenizer earlier in this
# notebook, so its line prints empty unless a tokenizer that knows the tag is in use.
for tag in ('인물', '직책', '정당', '기관/집단', '장소'):
    words = sorted({w for w, t in found if t == tag and len(w) > 1})
    print('{}: {}'.format(tag, ', '.join(words)))

In [102]:
import pickle

# Persist the fitted word tokenizer so inference code can reuse the same word indices.
# Only unpickle this file from a trusted location: loading a pickle can execute code.
with open('politics_ner_src_tokenizer_v1.pickle', 'wb') as handle:
    pickle.dump(src_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [103]:
# Persist the fitted tag tokenizer (its index_word also holds the 0 -> 'PAD' entry added earlier).
with open('politics_ner_tar_tokenizer_v1.pickle', 'wb') as handle:
    pickle.dump(tar_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [104]:
# Save the trained model in HDF5 format.
# NOTE(review): reloading presumably needs the CRF layer, loss and metric passed as
# custom_objects - confirm against keras_contrib.
model.save("ner_model_v1.h5")