<br></br>
# **Token의 활용**

## **1 Token**
어휘분석

In [1]:
# One-time setup: uncomment the line below to install NLTK.
# ! pip3 install nltk

In [2]:
# One-time setup: uncomment to download NLTK's 'punkt' resource
# (presumably required by the sentence/word tokenizers used below).
# import nltk
# nltk.download('punkt')

In [3]:
# Sample Korean text: one short line, then a second line holding two sentences.
text = (
    "오늘 금요일, 오후.\n"
    "오늘 조금만 버티면 주말입니다. 조금만 힘내세요"
)

In [4]:
from nltk import sent_tokenize

# Split the sample text into sentences and display the resulting list.
sentences = sent_tokenize(text)
sentences

['오늘 금요일, 오후.', '오늘 조금만 버티면 주말입니다.', '조금만 힘내세요']

In [5]:
from nltk import word_tokenize

# Split the text into word and punctuation tokens.
# NOTE: this rebinds `text` from the raw string to the token list; the
# FreqDist cell that follows reads `text` as that list of tokens.
tokens = word_tokenize(text)
text = tokens
text

['오늘', '금요일', ',', '오후', '.', '오늘', '조금만', '버티면', '주말입니다', '.', '조금만', '힘내세요']

In [6]:
from nltk import FreqDist

# Count the occurrences of each token and display them as a plain dict.
token_counts = FreqDist(text)
dict(token_counts)

{'오늘': 2,
 '금요일': 1,
 ',': 1,
 '오후': 1,
 '.': 2,
 '조금만': 2,
 '버티면': 1,
 '주말입니다': 1,
 '힘내세요': 1}

<br></br>
## **2 Re 를 사용한 Regex 정규식**

In [7]:
from nltk.tokenize import RegexpTokenizer

# Sample contact list: names followed by phone numbers.
# The first line ends with a space before the newline.
text = (
    "Park 010-1234-1234 Kim 010-8888-9999 \n"
    "Lee 010-2123-1299 한남충 010-222-9999 메갈녀 010-555-2345"
)

# Extract every run of consecutive digits.
re_capt = RegexpTokenizer(r'\d+')
re_capt.tokenize(text)

['010',
 '1234',
 '1234',
 '010',
 '8888',
 '9999',
 '010',
 '2123',
 '1299',
 '010',
 '222',
 '9999',
 '010',
 '555',
 '2345']

In [8]:
from nltk.tokenize import RegexpTokenizer

# Extract words that start with an ASCII letter (the English names in `text`).
# The class is spelled [A-Za-z] rather than [A-z]: the range A-z also covers
# the six ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so a token such
# as "_tmp" or "^abc" would otherwise be accepted as starting with a letter.
re_capt = RegexpTokenizer(r'[A-Za-z]\w+')
re_capt.tokenize(text)

['Park', 'Kim', 'Lee']

In [9]:
from nltk.tokenize import RegexpTokenizer

# Extract words that begin with a Hangul syllable (the Korean names in `text`).
hangul_word = r'[가-힣]\w+'
re_capt = RegexpTokenizer(hangul_word)
re_capt.tokenize(text)

['한남충', '메갈녀']

In [10]:
# Exercise: extract only the names, Korean and English together, e.g. [A-Za-z가-힣]
# (use A-Za-z, not A-z: the A-z range also includes the punctuation between 'Z' and 'a')
# Idea 1: extract only the content that is not a digit
# Idea 2: extract only Korean and English characters

<br></br>
# **Stemming / Tagging**

## **1 TreebankWordTokenizer**

In [11]:
# One-time setup: uncomment to download the 'averaged_perceptron_tagger' resource.
# import nltk
# nltk.download('averaged_perceptron_tagger')

In [12]:
# Splits text using the conventions of the Penn Treebank corpus,
# i.e. along grammatical boundaries.
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [13]:
%%time
# Tokenize an English sentence containing a contraction and print the tokens.
# `text` is rebound here to a new English sample string.
text = " Don't hesitate to ask questions"
token     = tokenizer.tokenize(text)
print(token)

['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
CPU times: user 179 µs, sys: 62 µs, total: 241 µs
Wall time: 248 µs


In [14]:
from nltk import pos_tag
# Tag each token in `token` with a part-of-speech label and display the pairs.
pos_tag(token)

[('Do', 'VBP'),
 ("n't", 'RB'),
 ('hesitate', 'VB'),
 ('to', 'TO'),
 ('ask', 'VB'),
 ('questions', 'NNS')]

<br></br>
## **2 WordPunctTokenizer**

In [15]:
from nltk.tokenize import WordPunctTokenizer
# Rebinds `tokenizer`, replacing the TreebankWordTokenizer instance created earlier.
tokenizer = WordPunctTokenizer()

In [16]:
%%time
# Tokenize the same `text` with the current `tokenizer` and print the tokens;
# `token` is overwritten with the new result.
token     = tokenizer.tokenize(text)
print(token)

['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']
CPU times: user 349 µs, sys: 114 µs, total: 463 µs
Wall time: 470 µs


In [17]:
from nltk import pos_tag
# Tag the tokens currently held in `token` and display the (token, tag) pairs.
pos_tag(token)

[('Don', 'NNP'),
 ("'", 'POS'),
 ('t', 'NN'),
 ('hesitate', 'NN'),
 ('to', 'TO'),
 ('ask', 'VB'),
 ('questions', 'NNS')]

<br></br>
## **3 Tag 설명**

In [18]:
# One-time setup: uncomment to download the 'tagsets' resource.
# import nltk
# nltk.download('tagsets')

In [19]:
import nltk.help as nltk_help
# Print the Penn Treebank tag-set description for the 'PRP' tag.
nltk_help.upenn_tagset('PRP')  # pronoun

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [20]:
# Print the Penn Treebank tag-set description for the 'JJ' tag.
nltk_help.upenn_tagset('JJ')  # adjective

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
