# **Chapter 1 자연어 다루기**
## **Token:** 어휘 추출하기

In [1]:
# One-time environment setup, kept commented out; uncomment to run in a fresh environment.
# The apt lines install g++ and JDK 8, the pip line installs the Python packages.
# ! apt-get update
# ! apt-get install g++ openjdk-8-jdk 
# ! pip3  install  nltk konlpy wordcloud matplotlib gensim 

In [2]:
import nltk
# NLTK data downloads, kept commented out; presumably needed on first run by the
# tokenizers, the tag-set help and the POS tagger used in later cells.
# nltk.download('punkt')
# nltk.download('tagsets')
# nltk.download('averaged_perceptron_tagger')
# English sample sentence (starts with a space and contains a contraction).
text_eng  = " Don't hesitate to ask questions"
# Korean sample text on three lines: Hangul mixed with Latin letters,
# parentheses, a phone number and a URL.
text_kor  = """삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. 
홍보:유관순 010-8888-9999.
삼성 페이지 https://www.samsung.com/sec/index.html"""

In [3]:
# Display the raw string (embedded newlines included) that the tokenizers receive.
text_kor

'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n홍보:유관순 010-8888-9999.\n삼성 페이지 https://www.samsung.com/sec/index.html'

In [4]:
# word_tokenize and FreqDist are imported here but first used in the following cells.
from nltk import sent_tokenize, word_tokenize, FreqDist
# Split the sample text into a list of sentences.
sent_tokenize(text_kor)

['삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.',
 '홍보:유관순 010-8888-9999.',
 '삼성 페이지 https://www.samsung.com/sec/index.html']

In [5]:
# Word-level tokenization; `tokens` is reused by the frequency count in the next cell.
# NOTE: per the displayed output, Korean particles stay attached ('노트의')
# and the URL is split at ':' into 'https', ':' and the remainder.
tokens = word_tokenize(text_kor)
tokens

['삼성',
 '갤럭시',
 '(',
 'GalaxyNote',
 ')',
 '노트의',
 '신형을',
 '홍보',
 '합니다',
 '.',
 '홍보',
 ':',
 '유관순',
 '010-8888-9999',
 '.',
 '삼성',
 '페이지',
 'https',
 ':',
 '//www.samsung.com/sec/index.html']

In [6]:
# Count occurrences of each token; dict() converts the FreqDist to a plain mapping for display.
dict(FreqDist(tokens))

{'삼성': 2,
 '갤럭시': 1,
 '(': 1,
 'GalaxyNote': 1,
 ')': 1,
 '노트의': 1,
 '신형을': 1,
 '홍보': 2,
 '합니다': 1,
 '.': 2,
 ':': 2,
 '유관순': 1,
 '010-8888-9999': 1,
 '페이지': 1,
 'https': 1,
 '//www.samsung.com/sec/index.html': 1}

<br/>
## **re 모듈을 사용한 정규식 (Regex)**
https://news.v.daum.net/v/20190223110230553

In [7]:
# Re-display the sample text for reference before applying the regular expressions below.
text_kor

'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n홍보:유관순 010-8888-9999.\n삼성 페이지 https://www.samsung.com/sec/index.html'

In [8]:
import re
# Match runs of characters in the range 가-힣 (Hangul syllables); Latin letters,
# digits, punctuation and whitespace end a run and are not returned.
tokenizer = re.compile(r'[가-힣]+')
tokenizer.findall(text_kor)

['삼성', '갤럭시', '노트의', '신형을', '홍보', '합니다', '홍보', '유관순', '삼성', '페이지']

In [9]:
# Phone-number pattern: 3 digits, 3-4 digits, 4 digits, separated by hyphens.
# The pattern is unanchored, so it can also match inside a longer run of digits.
tokenizer = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}')
tokenizer.findall(text_kor)

['010-8888-9999']

In [10]:
# Same phone-number pattern written with the \d shorthand.
# NOTE: for str patterns \d matches any Unicode decimal digit, not only 0-9,
# unless re.ASCII is set; on this sample text the result is the same.
tokenizer = re.compile(r'\d{3}-\d{3,4}-\d{4}')
tokenizer.findall(text_kor)

['010-8888-9999']

In [11]:
# Negated class: runs of anything that is neither a space nor a Hangul syllable.
# Newlines are part of the match, as the displayed output shows.
# The next cell reuses this `tokenizer` binding.
tokenizer = re.compile(r'[^ 가-힣]+')
tokenizer.findall(text_kor)

['(GalaxyNote)',
 '.',
 '\n',
 ':',
 '010-8888-9999.\n',
 'https://www.samsung.com/sec/index.html']

In [12]:
# Delete every non-Hangul, non-space run so only Hangul words and spaces remain.
# NOTE: relies on `tokenizer` still being the r'[^ 가-힣]+' pattern; the name is
# rebound in several other cells, so run this right after that cell.
# Append .split(" ") to the call to break the cleaned string on single spaces.
tokenizer.sub("", text_kor)

'삼성 갤럭시노트의 신형을 홍보 합니다 홍보유관순 삼성 페이지 '

In [13]:
# Match an https URL of the form https://www.<host>.<rest>.
# The dots are escaped so they match a literal '.' instead of any character,
# and the letter class is spelled [A-Za-z] because the range [A-z] also
# covers the punctuation between 'Z' and 'a' ([, \, ], ^, _ and `).
tokenizer = re.compile(r'https://w{3}\.[A-Za-z]+\.[./A-Za-z]+')
tokenizer.findall(text_kor)

['https://www.samsung.com/sec/index.html']

<br/>
# **Stemming / Tagging**
> **nltk**

In [14]:
# Lower-case the English sample before tokenizing. This overwrites the original
# mixed-case string; lower() gives the same result when the cell is re-run.
text_eng = text_eng.lower()
text_eng

" don't hesitate to ask questions"

In [15]:
from nltk.tokenize import TreebankWordTokenizer
# NOTE: `tokenizer` is rebound here from a compiled regex to a tokenizer object.
tokenizer = TreebankWordTokenizer()
# Despite the singular name, `token` holds the list of tokens (used by pos_tag
# in the next cell). Per the output, the contraction is split into 'do' + "n't".
token     = tokenizer.tokenize(text_eng)
token

['do', "n't", 'hesitate', 'to', 'ask', 'questions']

In [16]:
from nltk import pos_tag
# Attach a part-of-speech tag to each token, yielding (token, tag) pairs
# with tags such as VBP, RB and NNS (see the displayed output).
pos_tag(token)

[('do', 'VBP'),
 ("n't", 'RB'),
 ('hesitate', 'VB'),
 ('to', 'TO'),
 ('ask', 'VB'),
 ('questions', 'NNS')]

In [17]:
import nltk.help as nltk_help
# Print the description and example words for a tag.
nltk_help.upenn_tagset('PRP')  # personal pronoun

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [18]:
nltk_help.upenn_tagset('JJ')  # adjective

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


<br/>
# **Stemming / Tagging (한글)**
> **konlpy**

In [19]:
from konlpy.tag import Okt
# The instance keeps the name `twitter` because the next cell references it.
twitter = Okt()

# Stemming: with stem=True the tagger returns base forms, e.g. '하다' and '이다'
# here versus '하여' and '입니다' in the unstemmed run of the next cell.
# `stem` is a boolean flag, so pass True rather than the string "true",
# which only worked through implicit conversion.
text = "파이썬을 활용하여 자연어 분석 특강입니다"
print(twitter.pos(text, stem=True))

[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하다', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('이다', 'Adjective')]


In [20]:
# Same sentence without stemming: the surface forms ('하여', '입니다') are kept.
print(twitter.pos(text))

[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하여', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('입니다', 'Adjective')]


In [21]:
%%time
# The timing covers both constructing the Kkma tagger and tagging one sentence.
# Per the output, Kkma uses finer-grained tags (NNG, VV, ETD, ...) and splits
# '파이썬' into '파이' + '썰' + 'ㄴ'.
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.pos(text))

[('파이', 'NNG'), ('썰', 'VV'), ('ㄴ', 'ETD'), ('을', 'NNG'), ('활용', 'NNG'), ('하', 'XSV'), ('여', 'ECS'), ('자연어', 'NNG'), ('분석', 'NNG'), ('특강', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EFN')]
CPU times: user 16.8 s, sys: 223 ms, total: 17 s
Wall time: 5.88 s


In [22]:
%%time
# The timing covers both constructing the Hannanum tagger and tagging one sentence.
# Per the output, Hannanum reports single-letter tags (N, J, X, E).
from konlpy.tag import Hannanum
han = Hannanum()
print(han.pos(text))

[('파이썬', 'N'), ('을', 'J'), ('활용', 'N'), ('하', 'X'), ('어', 'E'), ('자연어', 'N'), ('분석', 'N'), ('특강', 'N'), ('이', 'J'), ('ㅂ니다', 'E')]
CPU times: user 7.59 s, sys: 82.9 ms, total: 7.68 s
Wall time: 2.19 s
