In [None]:
# 토큰화

In [20]:
# 1. (영어) 단어 토큰화

# 패키지 불러들이기
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [23]:
# Sample English text shared by the word-tokenization examples.
# Built line by line so the leading and trailing newlines are explicit.
text = (
    "\n"
    "Don't be fooled by the dark sounding name.\n"
    "Mr.Jone's Orphange is as cheery as cheery goes\n"
    "for a pastry shop.\n"
)

In [21]:
# word_tokenize splits a single word that contains an apostrophe into
# separate tokens (e.g. "Don't" becomes "Do" + "n't"),
# even though "Don't" should ideally be treated as one word.
word_token = word_tokenize(text)
print(word_token)

['String', 'a', 'home-based', '.', 'Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', '.', 'Mr.Jone', "'s", 'Orphange', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [24]:
# Tokenize with WordPunctTokenizer, which splits on every punctuation mark
# (so "Don't" becomes "Don", "'", "t").
# The notebook imports the WordPunctTokenizer class, not the
# wordpunct_tokenize() helper, so calling wordpunct_tokenize() raised
# NameError on a fresh kernel; use the imported class instead.
word_token = WordPunctTokenizer().tokenize(text)
print(word_token)

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', '.', 'Mr', '.', 'Jone', "'", 's', 'Orphange', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [9]:
# Tokenize with the Keras helper. As the output shows, it lowercases the text
# and drops punctuation such as periods, but keeps apostrophes inside words
# ("don't", "jone's").
word_token = text_to_word_sequence(text)
print(word_token)

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphange', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [15]:
# 표준토큰화 방법
from nltk.tokenize import TreebankWordTokenizer

In [25]:
# Sample text for the Treebank tokenizer: same as before plus a leading
# sentence containing a hyphenated word ("home-based").
text = (
    "\n"
    "String a home-based. Don't be fooled by the dark sounding name.\n"
    "Mr.Jone's Orphange is as cheery as cheery goes\n"
    "for a pastry shop.\n"
)

In [26]:
# <TreebankWordTokenizer>
# One of the tokenization methods commonly used as a standard.
# It follows these rules:
# 1. A word joined by a hyphen (-) is kept as a single token.
# 2. A word joined by an apostrophe, such as "Don't", is split.
# NOTE(review): in the recorded output, sentence-internal periods stay
# attached ("home-based.", "name.") and only the final period is split off;
# presumably the tokenizer expects one sentence per call — confirm in NLTK docs.

tokenizer = TreebankWordTokenizer()
word_token = tokenizer.tokenize(text)
print(word_token)

['String', 'a', 'home-based.', 'Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name.', 'Mr.Jone', "'s", 'Orphange', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [28]:
# 2. (영어) 문장 토큰화

from nltk.tokenize import sent_tokenize

# Sample paragraph for sentence tokenization; the line breaks (and the
# leading space on the last line) are part of the string.
text = (
    "His barber kept his word. But keeping such a huge secret to himself was\n"
    "driving him crazy. Finally, the barber went up a mountain and almost to the edge of\n"
    "a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure\n"
    " no one was near."
)

# Split the text into sentences at the sentence-ending periods.
sent_token = sent_tokenize(text)
print(sent_token)

['His barber kept his word.', 'But keeping such a huge secret to himself was\ndriving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of\na cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure\n no one was near.']


In [29]:
# Same paragraph with an extra first sentence containing the abbreviation
# "Ph.D.", to check how periods inside abbreviations are handled.
text = (
    "I am actively looking for Ph.D. student.\n"
    "His barber kept his word. But keeping such a huge secret to himself was\n"
    "driving him crazy. Finally, the barber went up a mountain and almost to the edge of\n"
    "a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure\n"
    " no one was near."
)

# Split the text into sentences. Not every period is a boundary: as the
# output shows, the periods inside "Ph.D." do not end a sentence.
sent_token = sent_tokenize(text)
print(sent_token)

['I am actively looking for Ph.D. student.', 'His barber kept his word.', 'But keeping such a huge secret to himself was\ndriving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of\na cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure\n no one was near.']


In [33]:
# Packages needed to use kss: pip install mecab kss
import kss

# <kss (Korean Sentence Splitter)>
# Splits Korean text into sentences (a sentence splitter, not a morphological analyzer).
# Original note: kss is implemented in C++ — TODO confirm for the installed version.
# mecab is installed alongside it for faster processing.

In [40]:
# Korean sample text for sentence splitting. The first line ends with a
# space before the newline; it is kept explicitly here.
text = (
    "딥 러닝 자연어 처리가 재미있기는 합니다. 정말? 그런데 문제는 \n"
    "영어보다 한국어로 할 때 너무 어렵습니다. 이제 해보죠~! 어렵네요."
)

In [41]:
# Split the Korean text into a list of sentences with kss.
kss_token = kss.split_sentences(text)
print(kss_token)

['딥 러닝 자연어 처리가 재미있기는 합니다.', '정말? 그런데 문제는 \n영어보다 한국어로 할 때 너무 어렵습니다.', '이제 해보죠~!', '어렵네요.']


In [46]:
# NLTK 품사 토큰화하기

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [43]:
# Sample text for POS tagging; contains the abbreviation "Ph.D." twice and a
# line break after the first one.
text = (
    "I am actively looking for Ph.D.\n"
    "students. and you are a Ph.D. student."
)

In [44]:
# Tokenize the text into words.
tokenized_sentence = word_tokenize(text)
print(tokenized_sentence)

['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']


In [49]:
# POS tagging: pair every word token with its part-of-speech tag.
import nltk

# pos_tag needs the 'averaged_perceptron_tagger' model data. Make sure it is
# available before tagging so this cell also works on a fresh kernel or
# environment (Restart & Run All) instead of raising LookupError.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

pos_token = pos_tag(tokenized_sentence)
print(pos_token)

[('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]


In [48]:
# Download the 'averaged_perceptron_tagger' model data used by nltk's pos_tag.
# On a fresh environment this has to run before pos_tag is called.
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [None]:
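# Penn Treebank POS tags that appear in the tagging output:
# PRP : personal pronoun
# VBP : verb, non-3rd-person singular present
# RB  : adverb
# VBG : verb, gerund / present participle
# IN  : preposition or subordinating conjunction
# NNP : proper noun, singular
# NNS : noun, plural
# NN  : noun, singular
# CC  : coordinating conjunction
# DT  : determiner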

In [50]:
from konlpy.tag import Okt

In [53]:
# Korean sample sentence for the KoNLPy (Okt / Kkma) examples.
text = "열심히 코딩한 당신, 연휴에는 여행을 가보아요."

In [54]:
# Create the Okt analyzer and split the text into morphemes.
okt = Okt()
morphs_okt = okt.morphs(text)
print(morphs_okt)

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가보아요', '.']


In [55]:
# POS tagging: returns each morpheme paired with its part-of-speech tag.
post_okt = okt.pos(text)
print(post_okt)

[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가보아요', 'Verb'), ('.', 'Punctuation')]


In [56]:
# Extract only the nouns from the text.
nouns_okt = okt.nouns(text)
print(nouns_okt)

['코딩', '당신', '연휴', '여행']


In [57]:
from konlpy.tag import Kkma

In [58]:
# Create the Kkma analyzer.
kkma = Kkma()

# Morphological analysis: split the text into morphemes.
morphs_kkma = kkma.morphs(text)
print(morphs_kkma)

['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요', '.']


In [59]:
# POS tagging: (morpheme, tag) pairs using Kkma's own tag set.
pos_kkma = kkma.pos(text)
print(pos_kkma)

[('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN'), ('.', 'SF')]


In [60]:
# Extract only the nouns from the text.
nouns_kkma = kkma.nouns(text)
print(nouns_kkma)

['코딩', '당신', '연휴', '여행']
