# 문장 토큰화 Sentence Tokenization

In [1]:
from nltk.tokenize import sent_tokenize

In [3]:
import nltk

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\semin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
text = "My email address is 'abcde@codeit.com'. Send it to Mr.Kim."

In [7]:
tokenized_sents = sent_tokenize(text)
tokenized_sents

["My email address is 'abcde@codeit.com'.", 'Send it to Mr.Kim.']

In [8]:
text = "Can you forward my email to Mr.Kim? Thank you!"

In [9]:
tokenized_sents = sent_tokenize(text)
tokenized_sents

['Can you forward my email to Mr.Kim?', 'Thank you!']

# 품사 태깅 Part of Speech Tagging (POS)
문장에 사용된 단어의 의미를 제대로 파악하려면 해당 단어가 어떤 품사로 사용되었는지 함께 알아야 합니다.  
품사 태깅(POS; Part of Speech Tagging) : 각 단어가 어떤 품사로 쓰였는지 표시하는 작업

In [10]:
from nltk.tag import pos_tag

In [11]:
text = "Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let\'s pool our money together and make a really bad movie!\" Or something like that."
pos_tagged_words = []

In [13]:
# Sentence tokenization: split the review text into a list of sentences.
tokenized_sents = sent_tokenize(text)
tokenized_sents

['Watching Time Chasers, it obvious that it was made by a bunch of friends.',
 'Maybe they were sitting around one day in film school and said, "Hey, let\'s pool our money together and make a really bad movie!"',
 'Or something like that.']

In [15]:
from nltk.tokenize import word_tokenize

In [19]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\semin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [23]:
# Tag every sentence separately and collect all (word, tag) pairs in one flat list.
for sentence in tokenized_sents:
    # Word tokenization: split the sentence into word/punctuation tokens.
    tokenized_words = word_tokenize(sentence)
    
    # POS tagging: pos_tag() yields (word, tag) pairs for the sentence's tokens.
    pos_tagged = pos_tag(tokenized_words)
    pos_tagged_words.extend(pos_tagged)

In [24]:
pos_tagged_words

[('Watching', 'VBG'),
 ('Time', 'NNP'),
 ('Chasers', 'NNPS'),
 (',', ','),
 ('it', 'PRP'),
 ('obvious', 'VBZ'),
 ('that', 'IN'),
 ('it', 'PRP'),
 ('was', 'VBD'),
 ('made', 'VBN'),
 ('by', 'IN'),
 ('a', 'DT'),
 ('bunch', 'NN'),
 ('of', 'IN'),
 ('friends', 'NNS'),
 ('.', '.'),
 ('Maybe', 'RB'),
 ('they', 'PRP'),
 ('were', 'VBD'),
 ('sitting', 'VBG'),
 ('around', 'IN'),
 ('one', 'CD'),
 ('day', 'NN'),
 ('in', 'IN'),
 ('film', 'NN'),
 ('school', 'NN'),
 ('and', 'CC'),
 ('said', 'VBD'),
 (',', ','),
 ('``', '``'),
 ('Hey', 'NNP'),
 (',', ','),
 ('let', 'VB'),
 ("'s", 'POS'),
 ('pool', 'VB'),
 ('our', 'PRP$'),
 ('money', 'NN'),
 ('together', 'RB'),
 ('and', 'CC'),
 ('make', 'VB'),
 ('a', 'DT'),
 ('really', 'RB'),
 ('bad', 'JJ'),
 ('movie', 'NN'),
 ('!', '.'),
 ("''", "''"),
 ('Or', 'CC'),
 ('something', 'NN'),
 ('like', 'IN'),
 ('that', 'DT'),
 ('.', '.')]

## Penn Treebank POS Tags
NLTK의 pos_tag() 함수는 Penn Treebank POS Tags를 기준으로 품사를 태깅합니다. 

# 표제어 추출(Lemmatization)
* 표제어(Lemma) : 단어의 사전적 기본형(사전에 표제어로 실리는 형태)  
서로 다른 단어라도 표제어는 같은 경우가 있습니다. 표제어를 기준으로 통합하면 단어가 정규화됩니다.   
* 예를 들어 am, are, is는 서로 다른 단어이지만 표제어는 동일하게 be입니다.

In [25]:
text = 'You are the happiest person.'

In [29]:
tokenized_words = word_tokenize(text)
tokenized_words

['You', 'are', 'the', 'happiest', 'person', '.']

In [30]:
tagged_words = pos_tag(tokenized_words)
tagged_words

[('You', 'PRP'),
 ('are', 'VBP'),
 ('the', 'DT'),
 ('happiest', 'JJS'),
 ('person', 'NN'),
 ('.', '.')]

표제어 추출에 사용되는 함수(WordNetLemmatizer.lemmatize())는 WordNet POS Tag를 사용합니다.  
* WordNet POS Tag : WordNet이라는 거대한 영어 어휘 데이터베이스에서 사용하는 품사 태그  
* pos_tag()로 태깅한 품사(Penn Treebank POS Tag)를 WordNet POS Tag에 맞게 변환해야 합니다.
* words_lemmatizer()는 (단어, 품사) 형태로 품사 태깅된 리스트를 파라미터로 받고, 표제어 추출이 된 결과를 반환합니다. 아래 셀에서는 같은 과정을 반복문으로 직접 수행합니다.

In [31]:
from nltk.corpus import wordnet as wn

In [32]:
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the leading letter of ``tag`` is inspected:
    ``J`` -> ``wn.ADJ``, ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``,
    ``V`` -> ``wn.VERB``.

    Returns:
        The WordNet POS constant for the tag, or ``None`` when the tag
        starts with none of the four prefixes above.
    """
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Look the constant up only on a match, so `wn` is not touched
            # for tags that have no WordNet equivalent.
            return getattr(wn, attr_name)
    return None

In [33]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\semin\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\semin\AppData\Roaming\nltk_data...


True

In [34]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = []

In [36]:
for word, tag in tagged_words:
    # Convert the Penn Treebank tag into its WordNet POS equivalent.
    wn_tag = penn_to_wn(tag)

    # Lemmatize only words whose tag maps to a WordNet POS
    # (noun, adjective, adverb, verb); keep every other token unchanged.
    lemmatized_words.append(
        lemmatizer.lemmatize(word, wn_tag)
        if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB)
        else word
    )

In [37]:
lemmatized_words

['You', 'be', 'the', 'happy', 'person', '.']

In [38]:
# Check the lemmatization result: print the tokens before and after lemmatization.
print('표제어 추출 전 :', tokenized_words)
print('표제어 추출 후 :', lemmatized_words)

표제어 추출 전 : ['You', 'are', 'the', 'happiest', 'person', '.']
표제어 추출 후 : ['You', 'be', 'the', 'happy', 'person', '.']
