# 텍스트 처리

In [None]:
s = 'No pain no gain'

In [None]:
'pain' in s 

True

In [None]:
s.split()

['No', 'pain', 'no', 'gain']

In [None]:
s.split().index('gain')

3

In [None]:
s[-4::]

'gain'

In [None]:
s.split()[1]

'pain'

In [None]:
s.split()[1][::-1]

'niap'

In [None]:
s.split()[2][::-1]

'on'

In [None]:
s = '한글도 처리 가능'

In [None]:
'처리' in s

True

In [None]:
s.split()

['한글도', '처리', '가능']

In [None]:
s.split()[0]

'한글도'

# 영어 처리
* 대소문자를 통합하지 않으면, 같은 단어를 다르게 이해
* upper()와 lower()를 사용

In [None]:
# Mixed-case sample used to demonstrate case normalization.
s = 'AbCdeFgH'
# Build the all-lowercase and all-uppercase variants in one step.
str_lower, str_upper = s.lower(), s.upper()
print(str_lower, str_upper)

abcdefgh ABCDEFGH


## 정규화

In [None]:
s = 'I visited UK from US on 22-09-20'
print(s)

I visited UK from US on 22-09-20


In [None]:
new_s = s.replace("UK", "United Kingdom").replace("US", "United states").replace('-20', '-2020')
print(new_s)

I visited United Kingdom from United states on 22-09-2020


## 정규표현식
* 문자들을 편리하게 지정, 추가, 삭제 가능

## match

In [None]:
import re
# '.' matches any single character (except a newline by default), so 'ab.'
# needs three characters beginning with 'ab'. re.match only tries position 0.
check = 'ab.'
print(re.match(check, 'abc'))  # matches the whole string 'abc'
print(re.match(check, 'b'))  # None: the string does not start with 'a'
print(re.match(check, 'ab'))  # None: no third character left for '.'

<re.Match object; span=(0, 3), match='abc'>
None
None


## compile

In [None]:
import time

# Both timings must run the same pattern against the same target string,
# otherwise the comparison between the two approaches is meaningless.
pattern = 'ab.'
target = 'abc'
iterations = 1000

# Case 1: module-level re.match, which resolves the pattern on every call.
# perf_counter() is a monotonic, high-resolution clock meant for short intervals.
normal_s_time = time.perf_counter()
for _ in range(iterations):
    re.match(pattern, target)
print('일반 사용 소모 시간: ', time.perf_counter() - normal_s_time)

# Case 2: compile once, then reuse the compiled pattern object.
# The compile step itself is deliberately included in the measured time.
compile_s_time = time.perf_counter()
r = re.compile(pattern)
for _ in range(iterations):
    r.match(target)
print('컴파일 사용시 소모 시간 : ', time.perf_counter() - compile_s_time)

일반 사용 소모 시간:  0.0018124580383300781
컴파일 사용시 소모 시간 :  0.0006003379821777344


## search
* match와는 다르게 문자열의 전체를 검사

In [None]:
# NOTE(review): in this cell `check` is passed as the second argument, i.e. as
# the *string being scanned*; the literals in first position are the patterns.
# So 'ab?' is treated as plain text here, not as a regular expression.
# Presumably the intent was re.match(check, 'kkkab') vs re.search(check, 'kkkab')
# to contrast the two functions -- confirm before changing.
check = 'ab?'

print(re.search('a', check))  # 'a' is found at index 0 of 'ab?'
print(re.match('kkkab', check))  # None: 'ab?' does not start with 'kkkab'
print(re.search('kkkab', check))  # None: 'kkkab' occurs nowhere in 'ab?'
print(re.match('ab', check))  # 'ab' matches at the start of 'ab?'

<re.Match object; span=(0, 1), match='a'>
None
None
<re.Match object; span=(0, 2), match='ab'>


## split
* 정규 표현식에 해당하는 문자열을 기준으로 문자열을 나눔

In [None]:
r = re.compile(' ')
print(r.split('I am a car'))

['I', 'am', 'a', 'car']


In [None]:
r = re.compile('[1-9]')
print(r.split('slavr 2v4s 4sss 54f'))

['slavr ', 'v', 's ', 'sss ', '', 'f']


## findall
* 컴파일한 정규 표현식을 이용해 정규 표현식과 맞는 모든 문자열을 리스트화

In [None]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (it triggers a warning on recent Python versions). r'\d' passes the
# backslash through to the regex engine unchanged.
digits = re.findall(r'\d', '1ab 2cd 3ef 4g')
print(digits)

['1', '2', '3', '4']


## 토큰화

### 단어 토큰화
* 파이썬 내장 함수인 split으로도 가능하지만, 여기서는 NLTK의 word_tokenize를 사용

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

sentence = 'Time is Gold'
tokens = word_tokenize(sentence)
tokens

['Time', 'is', 'Gold']

### 문장 토큰화

In [None]:
# Two sentences separated by an explicit newline character.
sentences = 'The world is a beautiful book.\n But of little use to him who cannot read it'
print(sentences)

# Naive sentence tokenization: cut the text at every newline.
# The leading space of the second sentence is kept as-is.
token = sentences.split('\n')
print(token)

The world is a beautiful book.
 But of little use to him who cannot read it
['The world is a beautiful book.', ' But of little use to him who cannot read it']


In [None]:
# Use NLTK's sent_tokenize instead of splitting on '\n'
from nltk.tokenize import sent_tokenize

tokens = sent_tokenize(sentences)
tokens

['The world is a beautiful book.',
 'But of little use to him who cannot read it']

### 정규 표현식을 활용한 토큰화

In [None]:
from nltk.tokenize import RegexpTokenizer

sentences = 'Where where \'s a will, there\'s a way'
# Raw string so that '\w' is not parsed as an (invalid) string escape.
# The pattern keeps runs of word characters, so commas and apostrophes
# are dropped from the tokens.
tokenizer = RegexpTokenizer(r'[\w]+')
tokens = tokenizer.tokenize(sentences)
tokens

['Where', 'where', 's', 'a', 'will', 'there', 's', 'a', 'way']

In [None]:
# Raw string so that '\s' is not parsed as an (invalid) string escape.
# With gaps=True the pattern describes the separators (runs of whitespace)
# rather than the tokens, so punctuation stays attached to the words.
tokenizer = RegexpTokenizer(r'[\s]+', gaps=True)
tokens = tokenizer.tokenize(sentences)
tokens

['Where', 'where', "'s", 'a', 'will,', "there's", 'a', 'way']

### 케라스를 사용한 토큰화

In [None]:
from keras.preprocessing.text import text_to_word_sequence
sentence = 'Where there \'s a will, there \'s a way'
text_to_word_sequence(sentence)

['where', 'there', "'s", 'a', 'will', 'there', "'s", 'a', 'way']

### textblob 을 사용한 토큰화

In [None]:
from textblob import TextBlob
sentence = 'Where there \'s a will, there \'s a way'
blob =TextBlob(sentence)
blob.words

WordList(['Where', 'there', "'s", 'a', 'will', 'there', "'s", 'a', 'way'])

## n-gram 추출
* n-gram은 n개의 어절이나 음절을 연쇄적으로 분류해 그 빈도를 분석

In [None]:
from nltk import ngrams
sentence = 'Where there a will, there a way'
biagram = list(ngrams(sentence.split(),2))
print(biagram)

[('Where', 'there'), ('there', 'a'), ('a', 'will,'), ('will,', 'there'), ('there', 'a'), ('a', 'way')]


In [None]:
trigram = list(ngrams(sentence.split(), 3))
print(trigram)

[('Where', 'there', 'a'), ('there', 'a', 'will,'), ('a', 'will,', 'there'), ('will,', 'there', 'a'), ('there', 'a', 'way')]


In [None]:
from textblob import TextBlob
blob = TextBlob(sentence)
blob.ngrams(n = 2)

[WordList(['Where', 'there']),
 WordList(['there', 'a']),
 WordList(['a', 'will']),
 WordList(['will', 'there']),
 WordList(['there', 'a']),
 WordList(['a', 'way'])]

In [None]:
blob.ngrams(n = 3)

[WordList(['Where', 'there', 'a']),
 WordList(['there', 'a', 'will']),
 WordList(['a', 'will', 'there']),
 WordList(['will', 'there', 'a']),
 WordList(['there', 'a', 'way'])]

## Pos 태깅
* POS(Part of Speech)는 품사를 의미하며, POS 태깅은 문장 내 각 단어에 해당하는 품사를 태깅하는 작업

In [None]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
words = word_tokenize('Where is my car')
words

['Where', 'is', 'my', 'car']

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('car', 'NN')]

In [None]:
nltk.pos_tag(word_tokenize('A rollong stone gathers no mass'))

[('A', 'DT'),
 ('rollong', 'JJ'),
 ('stone', 'NN'),
 ('gathers', 'NNS'),
 ('no', 'DT'),
 ('mass', 'NN')]

## 불용어 제거

In [None]:
stop_words = 'on in the'
stop_words = stop_words.split(' ')
stop_words

['on', 'in', 'the']

In [None]:
sentence = 'singer on the stage'
sentence = sentence.split(' ')
# Keep only the words that are absent from the hand-made stop-word list
# (`stop_words` is the list built in the previous cell).
nouns = [word for word in sentence if word not in stop_words]
print(nouns)

['singer', 'stage']


### nltk 패키지의 불용어 리스트를 사용

In [None]:
import nltk
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
s = 'If you do not walk today, you will have to run tomorrow'

words = word_tokenize(s)
print(words)

['If', 'you', 'do', 'not', 'walk', 'today', ',', 'you', 'will', 'have', 'to', 'run', 'tomorrow']


In [None]:
# Drop every token that appears in the stop-word list loaded above.
# NOTE(review): the membership test is case-sensitive and the list printed
# above is all lowercase, so a capitalized stop word such as 'If' is kept.
# Compare w.lower() instead if that is not intended.
no_stopwords = []
for w in words:
  if w not in stop_words:
    no_stopwords.append(w)
print(no_stopwords)

['If', 'walk', 'today', ',', 'run', 'tomorrow']


## 철자 교정

In [None]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l[K     |▌                               | 10 kB 26.6 MB/s eta 0:00:01[K     |█                               | 20 kB 9.6 MB/s eta 0:00:01[K     |█▋                              | 30 kB 8.2 MB/s eta 0:00:01[K     |██                              | 40 kB 7.3 MB/s eta 0:00:01[K     |██▋                             | 51 kB 4.2 MB/s eta 0:00:01[K     |███▏                            | 61 kB 4.4 MB/s eta 0:00:01[K     |███▊                            | 71 kB 4.4 MB/s eta 0:00:01[K     |████▏                           | 81 kB 5.0 MB/s eta 0:00:01[K     |████▊                           | 92 kB 3.9 MB/s eta 0:00:01[K     |█████▎                          | 102 kB 4.1 MB/s eta 0:00:01[K     |█████▉                          | 112 kB 4.1 MB/s eta 0:00:01[K     |██████▎                         | 122 kB 4.1 MB/s eta 0:00:01[K     |██████▉                         | 133 kB 4.1 MB/s eta 0:00:01[K    

In [None]:
from autocorrect import Speller

In [None]:
spell = Speller('en')
print(spell('people'))
print(spell('peope'))
print(spell('peopae'))

people
people
people


In [None]:
s = word_tokenize('Earlly biird catchess the womm')
print(s)

# Correct each token on its own, then rebuild the sentence.
# The loop variable gets its own name so it no longer shadows the list `s`.
ss = ' '.join(spell(word) for word in s)
print(ss)

['Earlly', 'biird', 'catchess', 'the', 'womm']
Early bird catches the worm


## 언어의 단수화 복수화

In [None]:
from textblob import TextBlob

words = 'apples bananas oranges'
tb =TextBlob(words)

print(tb.words)
print(tb.words.singularize())

['apples', 'bananas', 'oranges']
['apple', 'banana', 'orange']


In [None]:
words = 'car train airplane'
tb =TextBlob(words)

print(tb)
print(tb.words.pluralize())

car train airplane
['cars', 'trains', 'airplanes']


## 어간 추출

In [None]:
import nltk

stemmer = nltk.stem.PorterStemmer()

In [None]:
stemmer.stem('application')

'applic'

In [None]:
stemmer.stem('beginning')

'begin'

## 표제어 추출

In [None]:
import nltk
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
lemmatizer.lemmatize('application')

'application'

In [None]:
lemmatizer.lemmatize('beginning')

'beginning'

## 개체명 인식

In [None]:
import nltk 
from nltk import word_tokenize

nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
s = 'Rome was not built in a day'

In [None]:
tags = nltk.pos_tag(word_tokenize(s))
print(tags)

[('Rome', 'NNP'), ('was', 'VBD'), ('not', 'RB'), ('built', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('day', 'NN')]


In [None]:
entites = nltk.ne_chunk(tags, binary = True)
print(entites)

(S (NE Rome/NNP) was/VBD not/RB built/VBN in/IN a/DT day/NN)


# 한국어 처리
* 한국어는 자음과 모음이 결합해 한 글자(음절)를 이루므로, 정규 표현식으로 처리할 때 주의

## match

In [None]:
import re

# One or more consecutive standalone Hangul consonants in the range ㄱ-ㅎ.
check = '[ㄱ-ㅎ]+'

print(re.match(check, 'ㅎ 안녕하세요'))
print(re.match(check, '안녕하세요 ㅎ')) # No match: the string starts with a composed syllable (consonant + vowel), not a standalone consonant

<re.Match object; span=(0, 1), match='ㅎ'>
None


## search
* match와는 달리 문자열 전체를 검사

In [None]:
# Character class covering standalone consonants (ㄱ-ㅎ) and vowels (ㅏ-ㅣ).
# There is no '|' between the two ranges: inside [...] a pipe is a literal
# character, not alternation, so including it would also match '|'.
check = '[ㄱ-ㅎㅏ-ㅣ]'
print(re.search(check, 'ㄱㅏ 안녕하세요'))
print(re.search(check, '안 ㄱ ㅏ'))


<re.Match object; span=(0, 1), match='ㄱ'>
<re.Match object; span=(2, 3), match='ㄱ'>


## 토큰화

### 한국어 자연어 처리 konlpy와 형태소 분석기 Mecab 설치

In [None]:
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

### 단어 토큰화

In [None]:
from konlpy.tag import Mecab
tagger = Mecab()

In [None]:
sentence = '언제나 현재에 집중할 수 있다면 행복할 것이다.'
tagger.pos(sentence)

[('언제나', 'MAG'),
 ('현재', 'NNG'),
 ('에', 'JKB'),
 ('집중', 'NNG'),
 ('할', 'XSV+ETM'),
 ('수', 'NNB'),
 ('있', 'VV'),
 ('다면', 'EC'),
 ('행복', 'NNG'),
 ('할', 'XSV+ETM'),
 ('것', 'NNB'),
 ('이', 'VCP'),
 ('다', 'EF'),
 ('.', 'SF')]

In [None]:
# Use tagger.morphs() when only tokenization (no POS tags) is needed
tagger.morphs(sentence)

['언제나', '현재', '에', '집중', '할', '수', '있', '다면', '행복', '할', '것', '이', '다', '.']

In [None]:
# Extract only the nouns from the sentence
tagger.nouns(sentence)

['현재', '집중', '수', '행복', '것']

## 문장 토큰화

In [None]:
! pip install kss

Collecting kss
  Downloading kss-3.3.1.1.tar.gz (42.4 MB)
[K     |████████████████████████████████| 42.4 MB 1.5 MB/s 
[?25hCollecting emoji
  Downloading emoji-1.6.3.tar.gz (174 kB)
[K     |████████████████████████████████| 174 kB 48.2 MB/s 
[?25hBuilding wheels for collected packages: kss, emoji
  Building wheel for kss (setup.py) ... [?25l[?25hdone
  Created wheel for kss: filename=kss-3.3.1.1-py3-none-any.whl size=42449241 sha256=b130e2b0cf2ab204426927ed4ea9f7b2c2d83d508856ab62c67f2f35d949905a
  Stored in directory: /root/.cache/pip/wheels/6e/9d/1d/52871154eff5273abb86b96f4f984c1cd67c5bde64239b060a
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.6.3-py3-none-any.whl size=170298 sha256=41d033b543ee1a033494ba33af37a7bc64a1b7798921b82bb4c161a939af97bd
  Stored in directory: /root/.cache/pip/wheels/03/8b/d7/ad579fbef83c287215c0caab60fb0ae0f30c4d7ce5f580eade
Successfully built kss emoji
Installing collected packages: emoji, kss

In [None]:
import kss 
text = '진짜? 내일 뭐하지. 이렇게 애매모호한 문장도? 밥은 먹었어? 나는 ...'
print(kss.split_sentences(text))



['진짜? 내일 뭐하지.', '이렇게 애매모호한 문장도? 밥은 먹었어?', '나는 ...']


## 정규 표현식을 이용한 토큰화

In [None]:
from nltk.tokenize import RegexpTokenizer
sentence = "안녕하세요 ㅋㅋ 저는 자연어 처리(NLP)를 배우고 있습니다!! ㅋㅋㅋ"

tokenizer = RegexpTokenizer('[가-힣]+')
tokens = tokenizer.tokenize(sentence)
tokens

['안녕하세요', '저는', '자연어', '처리', '를', '배우고', '있습니다']

## 케라스를 이용한 토큰화

In [None]:
from keras.preprocessing.text import text_to_word_sequence

sentence = "성공의 비결은 단 한가지, 잘할 수 있는 일에 광적으로 집중하는 것이다"

text_to_word_sequence(sentence)

['성공의', '비결은', '단', '한가지', '잘할', '수', '있는', '일에', '광적으로', '집중하는', '것이다']

## Bag of Words 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['Think like a man of action and act like man of thought']

vector = CountVectorizer()
bow = vector.fit_transform(corpus)

print(bow.toarray())
print(vector.vocabulary_)

[[1 1 1 2 2 2 1 1]]
{'think': 6, 'like': 3, 'man': 4, 'of': 5, 'action': 1, 'and': 2, 'act': 0, 'thought': 7}


In [None]:
corpus = ['평생 살 것처럼 꿈을 꾸어라. 그리고 내일 죽을 것처럼 오늘을 살아라.']

vector = CountVectorizer()
bow = vector.fit_transform(corpus)

print(bow.toarray())
print(vector.vocabulary_)

[[2 1 1 1 1 1 1 1 1]]
{'평생': 8, '것처럼': 0, '꿈을': 3, '꾸어라': 2, '그리고': 1, '내일': 4, '죽을': 7, '오늘을': 6, '살아라': 5}


In [None]:
import re
from konlpy.tag import Mecab

corpus = '평생 살 것처럼 꿈을 꾸어라. 그리고 내일 죽을 것처럼 오늘을 살아라.'
# Strip the periods, then split into morphemes. The pattern is a raw string
# so that '\.' is not parsed as an (invalid) string escape.
# `tagger` is the Mecab instance created in an earlier cell.
tokens = tagger.morphs(re.sub(r"\.", "", corpus))

# vocab maps each distinct token to its index, in order of first appearance;
# bow[index] holds how many times that token occurs.
vocab = {}
bow = []

for tok in tokens:
    if tok in vocab:
        # Token already seen: bump its counter.
        bow[vocab[tok]] += 1
    else:
        # New token: it gets the next free index and a count of 1.
        vocab[tok] = len(vocab)
        bow.append(1)

print(bow)
print(vocab)

[1, 2, 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1]
{'평생': 0, '살': 1, '것': 2, '처럼': 3, '꿈': 4, '을': 5, '꾸': 6, '어라': 7, '그리고': 8, '내일': 9, '죽': 10, '오늘': 11, '아라': 12}


## 문서 단어 행렬

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["Think like a man of action and act like man of thoughts ",
          "Try not to become a man of success but rather try to become a man of value"]

vector = CountVectorizer(stop_words = 'english')
bow = vector.fit_transform(corpus)

print(bow.toarray())
print(vector.vocabulary_)

[[1 1 2 2 0 1 1 0 0]
 [0 0 0 2 1 0 0 2 1]]
{'think': 5, 'like': 2, 'man': 3, 'action': 1, 'act': 0, 'thoughts': 6, 'try': 7, 'success': 4, 'value': 8}


## 어휘 빈도-문서 역빈도(TF-IDF) 분석

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer(stop_words = 'english').fit(corpus)

print(tfidv.transform(corpus).toarray())
print(tfidv.vocabulary_)

[[0.3158336  0.3158336  0.6316672  0.44943642 0.         0.3158336
  0.3158336  0.         0.        ]
 [0.         0.         0.         0.50232878 0.35300279 0.
  0.         0.70600557 0.35300279]]
{'think': 5, 'like': 2, 'man': 3, 'action': 1, 'act': 0, 'thoughts': 6, 'try': 7, 'success': 4, 'value': 8}


### dataframe으로 변환

In [None]:
import pandas as pd

# vocabulary_ maps term -> column index. Sorting the terms by that index
# gives the column labels in the same order as the TF-IDF matrix columns.
columns = sorted(tfidv.vocabulary_, key=tfidv.vocabulary_.get)
pd.DataFrame(tfidv.transform(corpus).toarray(), columns = columns)

Unnamed: 0,act,action,like,man,success,think,thoughts,try,value
0,0.315834,0.315834,0.631667,0.449436,0.0,0.315834,0.315834,0.0,0.0
1,0.0,0.0,0.0,0.502329,0.353003,0.0,0.0,0.706006,0.353003
