# **Chapter 1 자연어 다루기**
## **Token:** 어휘 추출하기

In [1]:
# Environment setup commands, kept commented out; uncomment to run them.
# The first two lines install system packages with apt, the last one installs
# the Python packages with pip3.
# ! apt-get update
# ! apt-get install g++ openjdk-8-jdk
# ! pip3  install  nltk konlpy wordcloud matplotlib gensim

In [2]:
# pip3 install nltk
import nltk
# nltk.download('punkt')
# nltk.download('tagsets')
# nltk.download('averaged_perceptron_tagger')
# Sample sentences used by the cells below: one English, one Korean.
text_eng = " Don't hesitate to ask questions"

# The Korean sample is assembled line by line so that the trailing space
# after the first sentence is visible in the source.
_kor_lines = (
    "삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. ",
    "홍보:유관순 010-8888-9999 031-444-5555.",
    "삼성 페이지 https://www.samsung.com/sec/index.html",
)
text_kor = "\n".join(_kor_lines)

text_kor

'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n홍보:유관순 010-8888-9999 031-444-5555.\n삼성 페이지 https://www.samsung.com/sec/index.html'

In [3]:
from nltk import sent_tokenize, word_tokenize, FreqDist
# Split the Korean sample into sentences; the recorded output shows one
# sentence per line of the sample text.
sent_tokenize(text_kor)

['삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.',
 '홍보:유관순 010-8888-9999 031-444-5555.',
 '삼성 페이지 https://www.samsung.com/sec/index.html']

In [4]:
# Word-level tokenization of the Korean sample; `tokens` is reused by the
# frequency count in a later cell.
# In the recorded output the phone numbers stay whole, while the URL is
# broken into 'https', ':' and the remainder.
tokens = word_tokenize(text_kor)
tokens

['삼성',
 '갤럭시',
 '(',
 'GalaxyNote',
 ')',
 '노트의',
 '신형을',
 '홍보',
 '합니다',
 '.',
 '홍보',
 ':',
 '유관순',
 '010-8888-9999',
 '031-444-5555',
 '.',
 '삼성',
 '페이지',
 'https',
 ':',
 '//www.samsung.com/sec/index.html']

In [5]:
# Count how often each token occurs and show the counts as a plain dict.
dict(FreqDist(tokens))

{'삼성': 2,
 '갤럭시': 1,
 '(': 1,
 'GalaxyNote': 1,
 ')': 1,
 '노트의': 1,
 '신형을': 1,
 '홍보': 2,
 '합니다': 1,
 '.': 2,
 ':': 2,
 '유관순': 1,
 '010-8888-9999': 1,
 '031-444-5555': 1,
 '페이지': 1,
 'https': 1,
 '//www.samsung.com/sec/index.html': 1}

<br/>
## **re 모듈을 사용한 정규식(Regex)**
https://news.v.daum.net/v/20190223110230553

In [11]:
import re

# Extract runs of Hangul only.
# The single range ㄱ-힣 spans U+3131..U+D7A3, which also contains the CJK
# ideograph blocks, so it would match Chinese characters as well. The class
# below lists the three Hangul ranges explicitly instead:
#   ㄱ-ㅎ  compatibility consonant jamo
#   ㅏ-ㅣ  compatibility vowel jamo
#   가-힣  precomposed syllables
tokenizer = re.compile(r'[ㄱ-ㅎㅏ-ㅣ가-힣]+')
tokenizer.findall(text_kor)

['삼성', '갤럭시', '노트의', '신형을', '홍보', '합니다', '홍보', '유관순', '삼성', '페이지']

In [9]:
# Phone-number shape (3 digits - 3 or 4 digits - 4 digits) written with the
# explicit [0-9] class throughout.
# The last group previously used \d, which on str patterns matches any
# Unicode decimal digit, so it accepted characters the first two groups
# rejected; all three groups now use the same ASCII-only class.
tokenizer = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}')
tokenizer.findall(text_kor)

['010-8888-9999', '031-444-5555']

In [8]:
# Phone-number shape (3 digits - 3 or 4 digits - 4 digits) written with the
# \d shorthand for a digit.
phone_pattern = r'\d{3}-\d{3,4}-\d{4}'
tokenizer = re.compile(phone_pattern)
tokenizer.findall(text_kor)

['010-8888-9999', '031-444-5555']

In [12]:
# Negated class: runs of characters that are neither a space nor a Hangul
# syllable in the range 가-힣. Newlines fall into this class too, as the
# recorded output shows.
non_hangul_pattern = r'[^ 가-힣]+'
tokenizer = re.compile(non_hangul_pattern)
tokenizer.findall(text_kor)

['(GalaxyNote)',
 '.',
 '\n',
 ':',
 '010-8888-9999',
 '031-444-5555.\n',
 'https://www.samsung.com/sec/index.html']

In [13]:
# Delete everything except spaces and Hangul syllables from the sample.
# The pattern is compiled in this cell instead of being read from the shared
# `tokenizer` name: several cells rebind that name to different patterns, so
# relying on it made the result depend on which cell happened to run last.
non_hangul = re.compile(r'[^ 가-힣]+')
non_hangul.sub("", text_kor)

'삼성 갤럭시노트의 신형을 홍보 합니다 홍보유관순  삼성 페이지 '

In [14]:
# Match URLs of the form https://www.<name>.<rest>.
# - The dots are escaped so they match a literal '.', not any character.
# - The letter class is spelled A-Za-z: the range A-z also covers the
#   punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
tokenizer = re.compile(r'https://w{3}\.[A-Za-z]+\.[./A-Za-z]+')
tokenizer.findall(text_kor)

['https://www.samsung.com/sec/index.html']

<br/>
# **Stemming / Tagging**
> **nltk**

In [15]:
# Lower-case the English sample in place; the tokenizer cell below reads
# the lower-cased `text_eng`.
text_eng = text_eng.lower()
text_eng

" don't hesitate to ask questions"

In [16]:
from nltk.tokenize import TreebankWordTokenizer
# Tokenize the English sample with the Treebank tokenizer. In the recorded
# output the contraction "don't" is split into 'do' and "n't".
# Note: `tokenizer` is rebound here from a compiled regex to a tokenizer object.
tokenizer = TreebankWordTokenizer()
token     = tokenizer.tokenize(text_eng)
token

['do', "n't", 'hesitate', 'to', 'ask', 'questions']

In [17]:
from nltk import pos_tag
# Attach a part-of-speech tag to every token from the previous cell; the
# result is a list of (token, tag) pairs.
pos_tag(token)

[('do', 'VBP'),
 ("n't", 'RB'),
 ('hesitate', 'VB'),
 ('to', 'TO'),
 ('ask', 'VB'),
 ('questions', 'NNS')]

In [18]:
import nltk.help as nltk_help
# Print the description and examples for a tag name.
nltk_help.upenn_tagset('PRP')  # pronoun

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [19]:
nltk_help.upenn_tagset('JJ')  # adjective

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


<br/>
# **Stemming / Tagging (한글)**
> **konlpy**

In [20]:
from konlpy.tag import Okt, Kkma, Hannanum

# Okt tagger and the Korean sample sentence; both names are reused by the
# cells that follow.
twitter = Okt()
text    = "파이썬을 활용하여 자연어 분석 특강입니다"

# `stem` is a boolean flag, so pass True. The string "true" only worked
# because any non-empty string is truthy -- "false" would have enabled
# stemming just the same.
print(twitter.pos(text, stem=True))

[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하다', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('이다', 'Adjective')]


In [21]:
# Same tagging without stemming: the recorded output keeps the surface forms
# ('하여', '입니다') where the stemmed run showed '하다' and '이다'.
print(twitter.pos(text))

[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하여', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('입니다', 'Adjective')]


In [22]:
%%time
# Tag the same sentence with Kkma. The tagger is constructed inside this
# timed cell, so the reported time covers construction as well as tagging.
kkma = Kkma()
print(kkma.pos(text))

[('파이', 'NNG'), ('썰', 'VV'), ('ㄴ', 'ETD'), ('을', 'NNG'), ('활용', 'NNG'), ('하', 'XSV'), ('여', 'ECS'), ('자연어', 'NNG'), ('분석', 'NNG'), ('특강', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EFN')]
CPU times: user 17.8 s, sys: 158 ms, total: 18 s
Wall time: 6.13 s


In [23]:
%%time
# Tag the same sentence with Hannanum. The tagger is constructed inside this
# timed cell, so the reported time covers construction as well as tagging.
han = Hannanum()
print(han.pos(text))

[('파이썬', 'N'), ('을', 'J'), ('활용', 'N'), ('하', 'X'), ('어', 'E'), ('자연어', 'N'), ('분석', 'N'), ('특강', 'N'), ('이', 'J'), ('ㅂ니다', 'E')]
CPU times: user 2.28 s, sys: 35.9 ms, total: 2.32 s
Wall time: 739 ms


In [24]:
from konlpy.tag import Mecab

In [25]:
# Instantiate the Mecab tagger imported in the previous cell.
mecab = Mecab()

In [26]:
# Tag the multi-line Korean sample. In the recorded output the phone numbers
# and the URL are broken into separate digit, symbol and letter tokens.
mecab.pos(text_kor)

[('삼성', 'NNP'),
 ('갤럭시', 'NNP'),
 ('(', 'SSO'),
 ('GalaxyNote', 'SL'),
 (')', 'SSC'),
 ('노트', 'NNG'),
 ('의', 'JKG'),
 ('신', 'XPN'),
 ('형', 'NNG'),
 ('을', 'JKO'),
 ('홍보', 'NNG'),
 ('합니다', 'XSV+EF'),
 ('.', 'SF'),
 ('홍보', 'NNG'),
 (':', 'SC'),
 ('유관순', 'NNP'),
 ('010', 'SN'),
 ('-', 'SY'),
 ('8888', 'SN'),
 ('-', 'SY'),
 ('9999', 'SN'),
 ('031', 'SN'),
 ('-', 'SY'),
 ('444', 'SN'),
 ('-', 'SY'),
 ('5555', 'SN'),
 ('.', 'SF'),
 ('삼성', 'NNP'),
 ('페이지', 'NNG'),
 ('https', 'SL'),
 (':', 'SC'),
 ('/', 'SC'),
 ('/', 'SC'),
 ('www', 'SL'),
 ('.', 'SY'),
 ('samsung', 'SL'),
 ('.', 'SY'),
 ('com', 'SL'),
 ('/', 'SC'),
 ('sec', 'SL'),
 ('/', 'SC'),
 ('index', 'SL'),
 ('.', 'SY'),
 ('html', 'SL')]