# 텍스트 전처리(Preprocessing)

### 1. 토큰화(Tokenization)

In [1]:
import nltk

# Tokenizer models used by word_tokenize / sent_tokenize.
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps the notebook runnable on old and new NLTK releases.
nltk.download('punkt_tab')
# English stop-word corpus.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 1) 단어 토큰화

In [2]:
# Example sentence containing apostrophes ("Don't", "Jone's") and an
# abbreviation ("Mr.") so the tokenizers below can be compared on them.
sample = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

In [3]:
from nltk.tokenize import word_tokenize

# NLTK's default word tokenizer. In the recorded output "Don't" becomes
# 'Do' + "n't" and 'Mr.' stays a single token.
word_tokens = word_tokenize(sample)
print(word_tokens)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [4]:
from nltk.tokenize import WordPunctTokenizer

# Punctuation-based tokenizer. In the recorded output every punctuation mark
# becomes its own token ("Don't" -> 'Don', "'", 't'; 'Mr.' -> 'Mr', '.').
wpt = WordPunctTokenizer()
wordpunct_tokens = wpt.tokenize(sample)
print(wordpunct_tokens)

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [5]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Keras tokenizer. In the recorded output tokens are lower-cased, commas and
# periods are dropped, and apostrophes are kept ("don't", "jone's").
keras_tokens = text_to_word_sequence(sample)
print(keras_tokens)

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [6]:
from nltk.tokenize import TreebankWordTokenizer

# Penn Treebank tokenizer. For this sample the recorded output is identical
# to the word_tokenize result.
tok = TreebankWordTokenizer()
treebank_tokens = tok.tokenize(sample)
print(treebank_tokens)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


### 2) 문장 토큰화

In [8]:
from nltk.tokenize import sent_tokenize

# Split a short paragraph into its individual sentences.
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
barber_sentences = sent_tokenize(text)
print(barber_sentences)

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [13]:
# Re-run the sentence splitter on each sentence of `text` and print the result.
# NOTE(review): this cell (In [13]) sits before In [12], and its recorded output
# matches the `text` assigned in that later cell, so it was executed out of
# order. On Restart & Run All it iterates over the barber paragraph instead.
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sent_tokenize(sentence))

["Since I'm actively looking for Ph.D. students, I get the same question a dozen times every year."]


In [12]:
# The periods inside "Ph.D." are not treated as sentence boundaries: the
# recorded output is a single sentence.
text = "Since I'm actively looking for Ph.D. students, I get the same question a dozen times every year."
phd_sentences = sent_tokenize(text)
print(phd_sentences)

["Since I'm actively looking for Ph.D. students, I get the same question a dozen times every year."]


In [14]:
# KSS(Korean Sentence Splitter) 설치
!pip install kss

Collecting kss
  Downloading kss-3.3.1.1.tar.gz (42.4 MB)
[K     |████████████████████████████████| 42.4 MB 1.3 MB/s 
[?25hCollecting emoji
  Downloading emoji-1.6.1.tar.gz (170 kB)
[K     |████████████████████████████████| 170 kB 59.0 MB/s 
[?25hBuilding wheels for collected packages: kss, emoji
  Building wheel for kss (setup.py) ... [?25l[?25hdone
  Created wheel for kss: filename=kss-3.3.1.1-py3-none-any.whl size=42449239 sha256=9d99387503b6455320fa30cc7641e244b7af520b3fd27b0d75d3e70214725e33
  Stored in directory: /root/.cache/pip/wheels/6e/9d/1d/52871154eff5273abb86b96f4f984c1cd67c5bde64239b060a
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.6.1-py3-none-any.whl size=169314 sha256=ae53ef130ca558d9f7ffd328c370c900746fbb285970405f3803925f50e0c996
  Stored in directory: /root/.cache/pip/wheels/ea/5f/d3/03d313ddb3c2a1a427bb4690f1621eea60fe6f2a30cc95940f
Successfully built kss emoji
Installing collected packages: emoji, kss

In [15]:
# KSS(Korean Sentence Splitter) 설치시 메시지를 보고 싶지 않을 때
!pip install kss > /dev/null

In [16]:
import kss

# Korean sentence splitting with KSS.
text = '딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어렵습니다. 이제 해보면 알걸요?'
korean_sentences = kss.split_sentences(text)
print('한국어 문장 토큰화 :', korean_sentences)

[Korean Sentence Splitter]: Initializing Pynori...


한국어 문장 토큰화 : ['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어렵습니다.', '이제 해보면 알걸요?']


### 3) 품사(POS : Part-Of-Speech) 태깅

In [17]:
# Word-tokenize a sentence containing "Ph.D."; the recorded output keeps
# 'Ph.D.' as one token.
text = "I am actively looking for Ph.D. students. and you are a Ph.D. student."
phd_tokens = word_tokenize(text)
print(phd_tokens)

['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']


In [18]:
# Perceptron POS-tagger model used by nltk.tag.pos_tag.
nltk.download('averaged_perceptron_tagger')
# NLTK 3.9+ loads the English tagger from 'averaged_perceptron_tagger_eng';
# fetching both keeps pos_tag working on old and new NLTK releases.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [19]:
from nltk.tag import pos_tag

# Tag each token of `text` with its part of speech, shown as (token, tag) pairs.
tagged_tokens = pos_tag(word_tokenize(text))
tagged_tokens

[('I', 'PRP'),
 ('am', 'VBP'),
 ('actively', 'RB'),
 ('looking', 'VBG'),
 ('for', 'IN'),
 ('Ph.D.', 'NNP'),
 ('students', 'NNS'),
 ('.', '.'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('a', 'DT'),
 ('Ph.D.', 'NNP'),
 ('student', 'NN'),
 ('.', '.')]

- 한글 (KoNLPy)

In [20]:
# KoNLPy 설치
!pip install KoNLPy

Collecting KoNLPy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.5 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 73.3 MB/s 
Installing collected packages: JPype1, KoNLPy
Successfully installed JPype1-1.3.0 KoNLPy-0.6.0


- okt(Open Korean Text)

In [22]:
# Morphological analysis with KoNLPy's Okt tagger.
from konlpy.tag import Okt

text = '열심히 코딩한 당신, 연휴에는 여행을 가봐요'
okt = Okt()
okt_morphs = okt.morphs(text)
okt_morphs

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']

In [23]:
# stem=True returns inflected words in their base form; in the recorded
# output '가봐요' becomes '가보다'.
okt.morphs(text, stem=True)

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가보다']

In [24]:
# Part-of-speech tagging: returns (morpheme, tag) pairs.
okt.pos(text)

[('열심히', 'Adverb'),
 ('코딩', 'Noun'),
 ('한', 'Josa'),
 ('당신', 'Noun'),
 (',', 'Punctuation'),
 ('연휴', 'Noun'),
 ('에는', 'Josa'),
 ('여행', 'Noun'),
 ('을', 'Josa'),
 ('가봐요', 'Verb')]

In [25]:
# Noun extraction: keeps only the nouns found in `text`.
okt.nouns(text)

['코딩', '당신', '연휴', '여행']

- 꼬꼬마(Kkma)

In [27]:
from konlpy.tag import Kkma

# Morphological analysis with the Kkma tagger on the same `text`. The recorded
# output is more fine-grained than Okt's (e.g. '하', 'ㄴ' and '가보', '아요').
kkma = Kkma()
kkma_morphs = kkma.morphs(text)
kkma_morphs

['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']

In [28]:
# Noun extraction with Kkma on the same `text`.
kkma.nouns(text)

['코딩', '당신', '연휴', '여행']

### 2. 정제(Cleaning)와 정규화(Normalization)

In [32]:
import re
# Example sentence for the short-word removal cells that follow.
text = "I was wondering if anyone out there could enlighten me on this car."

In [39]:
# Remove words of one or two characters with a regular expression. The pattern
# also consumes any non-word characters directly before each short word, so a
# short word at the very start leaves the following space in place (see the
# leading space in the recorded output).
shortword = re.compile(r'\W*\b\w{1,2}\b')
re.sub(shortword, '', text)

' was wondering anyone out there could enlighten this car.'

In [34]:
# Same clean-up by token length: keep only tokens longer than two characters.
list(filter(lambda token: len(token) > 2, word_tokenize(text)))

['was',
 'wondering',
 'anyone',
 'out',
 'there',
 'could',
 'enlighten',
 'this',
 'car']

In [37]:
# Join the tokens longer than two characters back into a single string.
long_tokens = filter(lambda token: len(token) > 2, word_tokenize(text))
clean_text = ' '.join(long_tokens)
clean_text

'was wondering anyone out there could enlighten this car'

### 3. 어간 추출(Stemming) 및 표제어 추출(Lemmatization)

#### 1) 표제어 추출(Lemmatization)

In [40]:
# WordNet corpus, downloaded before the WordNetLemmatizer cells that follow.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [45]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by the following cells.
lemma = WordNetLemmatizer()

In [47]:
# Lemmatize without a POS hint. In the recorded output verb forms such as
# 'doing' and 'watched' stay unchanged, and 'dies' / 'has' come out as
# 'dy' / 'ha'.
words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print(list(map(lemma.lemmatize, words)))

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [49]:
# Passing the verb POS tag ('v') gives the expected base forms for these words.
tuple(lemma.lemmatize(verb, 'v') for verb in ('doing', 'dies', 'watched', 'has'))

('do', 'die', 'watch', 'have')

In [50]:
# The same surface form lemmatizes differently as a verb ('v') and a noun ('n').
tuple(lemma.lemmatize('lives', pos) for pos in ('v', 'n'))

('live', 'life')

#### 2) 어간 추출(Stemming)

In [52]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance and the example sentence used by the
# following stemming cells.
ps = PorterStemmer()
text = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."

In [53]:
# Tokens before stemming.
tokens_before_stemming = word_tokenize(text)
print(tokens_before_stemming)

['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']


In [55]:
# Tokens after Porter stemming. The recorded stems are lower-cased and are not
# always dictionary words (e.g. 'thi', 'billi', 'copi').
print(list(map(ps.stem, word_tokenize(text))))

['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [56]:
# Porter stemming on words with derivational suffixes (-ize, -ance, -ical).
words = ['formalize', 'allowance', 'electrical']
suffix_stems = list(map(ps.stem, words))
print(suffix_stems)

['formal', 'allow', 'electr']


In [58]:
# Porter stemmer on a mixed list of nouns and inflected verbs.
words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
porter_stems = list(map(ps.stem, words))
print(porter_stems)

['polici', 'do', 'organ', 'have', 'go', 'love', 'live', 'fli', 'die', 'watch', 'ha', 'start']


In [63]:
# Lancaster stemmer applied to whatever list `words` currently holds, for
# comparison with the Porter results.
from nltk.stem import LancasterStemmer

ls = LancasterStemmer()
lancaster_stems = list(map(ls.stem, words))
print(lancaster_stems)

['policy', 'doing', 'org', 'hav', 'going', 'lov', 'liv', 'fly', 'die', 'watch', 'has', 'start']
