<a href="https://colab.research.google.com/github/bjh5098/Social-Network-Analysis-and-Text-Mining/blob/master/Textmining_4_PoS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# spaCy 품사 태깅(PoS tagging)

In [0]:
import spacy

In [0]:
# Load the pretrained English pipeline by its full package name; the 'en'
# shortcut link only exists in spaCy v2 and was removed in v3.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Parse the four example sentences with the loaded pipeline, in order.
sent_0, sent_1, sent_2, sent_3 = map(nlp, (
    'Mathieu and I went to the park.',
    'If Clement was asked to take out the garbage, he would refuse.',
    'Baptiste was in charge of the refuse treatment center.',
    'Marie took out her rather suspicious and fishy cat to fish for fish.',
))

In [0]:
# Print text, coarse-grained POS (pos_) and fine-grained tag (tag_) for every
# token of the four example sentences, one sentence after another.
for parsed in (sent_0, sent_1, sent_2, sent_3):
    for token in parsed:
        print(token.text, token.pos_, token.tag_)

In [0]:
# Collect the parsed example sentences so they can be iterated together.
sent_list = [sent_0, sent_1, sent_2, sent_3]

In [0]:
# Print a numbered separator line, then the token listing, for each sentence.
header = 'sent{}:---------------------------------------------------'
for i, sent in enumerate(sent_list):
    print(header.format(i))
    for token in sent:
        print(token.text, token.pos_, token.tag_)

*   Refuse : verb 동사
*   Clement: 쓰레기를 치우라는(take out) 요청을 거부(refuse)하는 주체(subject)

In [0]:
# Re-inspect sent_1, the sentence in which "refuse" is used as a verb.
for token in sent_1:
    print(token.text, token.pos_, token.tag_)

In [0]:
from spacy import displacy
# Draw the dependency parse ('dep' style) of sent_1 inline in the notebook.
displacy.render(sent_1, style='dep', jupyter=True)

*  fishy adj 형용사
*  to fish verb 동사
*  for fish noun 명사

In [0]:
# Re-inspect sent_3, which uses "fishy" / "fish" as adjective, verb and noun.
for token in sent_3:
    print(token.text, token.pos_, token.tag_)

In [0]:
from spacy import displacy
# Draw the dependency parse ('dep' style) of sent_3 inline in the notebook.
displacy.render(sent_3, style='dep', jupyter=True)

## Spacy 품사 태거 학습

In [0]:
import random
import spacy

***Tag 정의***

In [0]:
# Custom tag set for the toy tagger: each short tag maps to a dict whose
# 'pos' entry names the coarse-grained part of speech it stands for.
TAG_MAP = {
    'N': {'pos': 'NOUN'},
    'V': {'pos': 'VERB'},
    'J': {'pos': 'ADJ'}
}

***훈련 데이터***

In [0]:
# Toy training set of (text, annotations) pairs; annotations['tags'] holds one
# TAG_MAP key per word of the text, in order.
TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
    ("Eat blue ham", {'tags': ['V', 'J', 'N']})]

***Blank 언어 모델 정의, 품사태거 추가***

In [0]:
# Start from an empty English pipeline and create a tagger component for it.
# NOTE: this rebinds `nlp`, replacing the pretrained pipeline loaded earlier.
# NOTE(review): nlp.create_pipe looks like the spaCy v2 API — confirm the
# installed spaCy version before running.
nlp = spacy.blank('en')
tagger = nlp.create_pipe('tagger')

***학습용 라벨을 품사 태거에 추가***

In [0]:
# Register every custom tag together with its POS mapping on the tagger,
# then append the tagger to the pipeline.
# NOTE(review): add_label(tag, values) and add_pipe(component) follow the
# spaCy v2 signatures — confirm against the installed version.
for tag, values in TAG_MAP.items():
    tagger.add_label(tag, values)
nlp.add_pipe(tagger)

***품사 태거 학습***

In [0]:
# Set up training and obtain the optimizer that is passed to nlp.update.
optimizer = nlp.begin_training()

# 25 passes over the data, reshuffling the examples before each pass.
for i in range(25):
    random.shuffle(TRAIN_DATA)
    losses = {}  # fresh dict per pass, handed to nlp.update to fill in
    for text, annotations in TRAIN_DATA:
        # One example per update call.
        nlp.update([text], [annotations], sgd=optimizer, losses=losses)
    print(losses)

***학습한 품사 태거 테스트***

In [0]:
# Run the trained pipeline on a test sentence and show (text, tag, POS) for
# each token.
test_text = "I like blue eggs"
doc = nlp(test_text)
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

***학습한 품사 태거 저장***

In [0]:
# Write the trained pipeline to disk under /content.
nlp.to_disk('/content')

***학습한 품사 태거 불러오기 및 사용***

In [0]:
# Load the saved pipeline back from disk and tag the same test sentence.
nlp2 = spacy.load('/content')
doc2 = nlp2(test_text)
reloaded_tags = [(t.text, t.tag_, t.pos_) for t in doc2]
print('Tags', reloaded_tags)

# NLTK 품사 태깅(PoS tagging)

In [0]:
# Later cells call nltk.download(...) and nltk.pos_tag(...) through the
# package namespace, so the package itself must be imported as well;
# without it those calls raise NameError.
import nltk
from nltk import sent_tokenize, word_tokenize

***corpus 다운로드***

In [0]:
# Download the 'punkt' tokenizer resource.
nltk.download('punkt')

In [0]:
# Download the 'averaged_perceptron_tagger' resource
# (presumably the model behind nltk.pos_tag — confirm).
nltk.download('averaged_perceptron_tagger')

***단어 토큰화***

In [0]:
# Split the example sentence into word tokens.
text = word_tokenize("And now for something completely different")

In [0]:
# Show the tokenized sentence.
print(text)

***PoS tagging***

In [0]:
# Tag the tokens with NLTK's part-of-speech tagger.
nltk.pos_tag(text)

***nltk 품사 태깅 참조:*** https://www.nltk.org/book/ch05.html

# spaCy 품사 태그 활용 예

## 문장 내 특정 품사(동사, VERB)만 찾아서 대문자로 변환하기

In [0]:
def make_verb_upper(text, pos):
    """Return ``text`` upper-cased when ``pos`` is ``'VERB'``, else unchanged."""
    return text.upper() if pos == 'VERB' else text

In [0]:
# When the notebook runs top to bottom, `nlp` still refers to the toy tagger
# trained on two sentences. Reload the pretrained English pipeline so the
# usage examples rely on real POS predictions.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Tom ran swiftly and walked slowly')

In [0]:
# Rebuild the sentence from each token's text_with_ws, upper-casing only the
# tokens whose POS is VERB.
text = ''.join(make_verb_upper(w.text_with_ws, w.pos_) for w in doc)

In [0]:
# Show the rebuilt sentence.
print(text)

## 문장 내 특정 품사(명사, NOUN) 추출하기

In [0]:
def extract_noun(text, pos):
    """Return ``text`` when ``pos`` is ``'NOUN'``; otherwise return ``None``."""
    return text if pos == 'NOUN' else None

In [0]:
# Parse the example sentence used for noun extraction.
doc = nlp('Marie took out her rather suspicious and fishy cat to fish for fish.')

In [0]:
# List text, coarse POS and fine-grained tag for every token of the sentence.
for token in doc:
    print(token.text, token.pos_, token.tag_)

In [0]:
# Collect the text of every NOUN token in `doc`, stripping the surrounding
# whitespace that text_with_ws may carry.
noun_list = []

for w in doc:
    noun = extract_noun(w.text_with_ws, w.pos_)
    # extract_noun yields None for non-nouns; None is compared by identity.
    if noun is not None:
        noun_list.append(noun.strip())

In [0]:
# Display the extracted nouns.
noun_list

## 특정 품사의 발생 횟수 확인하기

In [0]:
import pandas as pd


In [0]:
# Read the whole text into memory. The context manager closes the file
# handle, and the explicit encoding avoids relying on the platform default.
# NOTE(review): assumes HP1.txt is UTF-8 encoded — confirm.
with open("HP1.txt", encoding="utf-8") as hp_file:
    harry_potter = hp_file.read()

In [0]:
# Print the raw text to check that it loaded.
print(harry_potter)

In [0]:
import spacy

In [0]:
# Load the pretrained English pipeline by its full package name (the 'en'
# shortcut link is spaCy v2-only) and parse the whole text into a single Doc.
nlp = spacy.load('en_core_web_sm')
hp = nlp(harry_potter)

***문장별 단어 수 확인***

In [0]:
# Build the list of sentence spans before indexing it: in top-to-bottom order
# this cell runs before the one that originally defined hpSents, which
# raised a NameError.
hpSents = list(hp.sents)
hpSents[0]

In [0]:
# Length of the first sentence.
len(hpSents[0])

In [0]:
# Materialise the document's sentences and record the length of each one.
hpSents = list(hp.sents)
hpSentenceLengths = [len(sent) for sent in hpSents]

In [0]:
# Display the per-sentence lengths.
hpSentenceLengths

***가장 긴 문장 찾기***

In [0]:
# Compute the maximum length once instead of re-evaluating max() for every
# sentence (which made the scan quadratic), then keep every sentence of that
# length. default=0 keeps the empty-input result an empty list.
longest_len = max(hpSentenceLengths, default=0)
[sent for sent in hpSents if len(sent) == longest_len]

In [0]:
# Relative frequency per POS: counts keyed by the POS attribute, divided by
# the number of tokens in the document.
hpPOS = pd.Series(hp.count_by(spacy.attrs.POS))/len(hp)

In [0]:
# Show the POS frequency series.
print(hpPOS)

In [0]:
# Map each token's POS ID (w.pos) to its string label (w.pos_).
tagDict = {w.pos: w.pos_ for w in hp}

In [0]:
# Show the ID-to-label mapping.
print(tagDict)

In [0]:
# One-row DataFrame built from the frequency series; the row is labelled
# 'Harry Potter'.
df = pd.DataFrame([hpPOS], index=['Harry Potter'])

In [0]:
# Replace the POS IDs used as column names with their readable labels.
df.columns = [tagDict[column] for column in df.columns]

In [0]:
# Show the labelled frequency table.
print(df)

In [0]:
# Bar chart of the transposed table (one bar per POS label).
df.T.plot(kind='bar')

In [0]:
# `plt` is used here but was never imported in the notebook, which raised a
# NameError; import it locally.
import matplotlib.pyplot as plt

# Set the default figure size for subsequent plots. The original first read
# the current value into fig_size and immediately overwrote it, so that
# read is dropped.
fig_size = [12, 8]
plt.rcParams["figure.figsize"] = fig_size

In [0]:
# Draw the same bar chart again after the figure-size change.
df.T.plot(kind='bar')

***특정 품사(대명사, PRON)의 단어별 빈도 출력***

In [0]:
# All tokens whose coarse POS label is PRON.
hpPRON = [w for w in hp if w.pos_=='PRON']

In [0]:
from collections import Counter

In [0]:
# Ten most frequent pronoun forms. Token.text replaces Token.string, which
# is deprecated in spaCy v2 and removed in v3; strip() is kept so the counted
# keys are unchanged.
Counter(w.text.strip() for w in hpPRON).most_common(10)