# 목차
### 1. NLTK자연어처리 패키지
### 2. KoNLPy 한국어처리 패키지
### 3. Scikit-Learn 문서전처리 기능

---
# 1. NLTK자연어처리 패키지 

In [2]:
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download("gutenberg")
nltk.download('punkt')
nltk.download('reuters')
nltk.download("stopwords")
nltk.download("webtext")
nltk.download("wordnet")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\webtext.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\creal\AppData\Roaming\nltk_data...
[nltk_dat

True

In [3]:
#예를 들어 저작권이 말소된 문학작품을 포함하는 gutenberg 말뭉치에는 다음과 같은 작품이 샘플로 포함되어 있다.

nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [8]:
doc_raw = nltk.corpus.gutenberg.raw("austen-persuasion.txt")
print(doc_raw[:500])

[Persuasion by Jane Austen 1818]


Chapter 1


Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,
for his own amusement, never took up any book but the Baronetage;
there he found occupation for an idle hour, and consolation in a
distressed one; there his faculties were roused into admiration and
respect, by contemplating the limited remnant of the earliest patents;
there any unwelcome sensations, arising from domestic affairs
changed naturally into pity and contempt as he turn


In [10]:
# 토큰생성----------------------------------------------

from nltk.tokenize import sent_tokenize
print(sent_tokenize(doc_raw[:1500])[3])

of South Park, in the county of
Gloucester, by which lady (who died 1800) he has issue Elizabeth,
born June 1, 1785; Anne, born August 9, 1787; a still-born son,
November 5, 1789; Mary, born November 20, 1791."


In [12]:
from nltk.tokenize import RegexpTokenizer

# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# The pattern itself is unchanged: one or more word characters.
t = RegexpTokenizer(r"[\w]+")
# Tokenize characters 50..99 of the raw text.
t.tokenize(doc_raw[50:100])

['Walter', 'Elliot', 'of', 'Kellynch', 'Hall', 'in', 'Somersetshire']

In [14]:
# 형태소 분석--(어간추출stemming, 원형복원lemmatizing, 품사부착Part Of Speech tagging)-----------

# -- 어간추출
words = ['lives', 'dies', 'flies', 'died']

from nltk.stem import PorterStemmer
st = PorterStemmer()
[st.stem(w) for w in words]

['live', 'die', 'fli', 'die']

In [15]:
from nltk.stem import LancasterStemmer
st = LancasterStemmer()
[st.stem(w) for w in words]

['liv', 'die', 'fli', 'died']

In [17]:
# -- 원형복원(lemmatizing): 어간추출과 달리 사전에 등재된 기본형으로 복원한다
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
[lm.lemmatize(w) for w in words]

['life', 'dy', 'fly', 'died']

In [20]:
lm.lemmatize("died", pos="v")

'die'

In [21]:
#POS tagging(품사-태깅)

# NN명사
# PRP인칭대명사
# CD기수(숫자)
# DT한정사
# VBP 동사 현재형

# 국내 태그세트는 21세기 세종계획 품사태그가 있다.

from nltk.tag import pos_tag
x = ["volumne", "I", "chapter", "1", "I", "am", "a", "boy", "."]
tagged_list = pos_tag(x)
tagged_list

[('volumne', 'NN'),
 ('I', 'PRP'),
 ('chapter', 'VBP'),
 ('1', 'CD'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('boy', 'NN'),
 ('.', '.')]

In [22]:
from nltk.tag import untag
untag(tagged_list)

['volumne', 'I', 'chapter', '1', 'I', 'am', 'a', 'boy', '.']

In [23]:
# 자연어분석을 할때 새로운 토큰을 만들어 사용하기도 한다
def tokenizer(doc):
    """Return one "word/TAG" string per token of *doc*.

    *doc* is passed straight to ``pos_tag``; each (word, tag) pair it
    returns is joined with a slash.
    """
    tagged_pairs = pos_tag(doc)
    return list(map("/".join, tagged_pairs))

tokenizer(x)

['volumne/NN',
 'I/PRP',
 'chapter/VBP',
 '1/CD',
 'I/PRP',
 'am/VBP',
 'a/DT',
 'boy/NN',
 './.']

# 2. KoNLPy 한국어처리 패키지 (코앤엘 파이)

- https://datascienceschool.net/view-notebook/70ce46db4ced4a999c6ec349df0f4eb0/


In [34]:
# 한국어 말뭉치

from konlpy.corpus import kolaw
kolaw.fileids()

['constitution.txt']

In [35]:
c = kolaw.open('constitution.txt').read()
print(c[:66])

대한민국헌법

유구한 역사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 대한민국임시정부의 법통과 불의에 항거한


In [36]:
from konlpy.corpus import kobill
kobill.fileids()

['1809890.txt',
 '1809891.txt',
 '1809892.txt',
 '1809893.txt',
 '1809894.txt',
 '1809895.txt',
 '1809896.txt',
 '1809897.txt',
 '1809898.txt',
 '1809899.txt']

In [37]:
d = kobill.open('1809890.txt').read()
print(d[:100])

지방공무원법 일부개정법률안

(정의화의원 대표발의 )

 의 안
 번 호

9890

발의연월일 : 2010.  11.  12.  

발  의  자 : 정의화․이명수․김을동 

이


In [71]:
# 형태소 분석----------------------------------

# Import only the tagger classes this cell uses instead of a wildcard import,
# so the names it brings into the notebook namespace are explicit.
# The captured error for this cell is "No module named 'jpype'", i.e. the
# JPype package KoNLPy depends on must be installed before this will run.
from konlpy.tag import Hannanum, Kkma, Twitter

hannanum = Hannanum()
kkma = Kkma()
# NOTE(review): newer KoNLPy releases rename Twitter to Okt — confirm against
# the installed version before switching.
twitter = Twitter()


ModuleNotFoundError: No module named 'jpype'

In [None]:
# 오류가 있어서 중단... 위의 사이트 링크에서 하는 방법 참조바람

# 3. Scikit-Learn 문서전처리 기능
- https://datascienceschool.net/view-notebook/3e7aadbf88ed4f0d87a76f9ddc925d69/

In [39]:
# BOW(Bag of Words)
# -- DictVectorizer

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse = False)
D = [{'A':1, 'B':2}, {'B':3, 'C':1}]
X = v.fit_transform(D)
X

array([[ 1.,  2.,  0.],
       [ 0.,  3.,  1.]])

In [41]:
v.feature_names_

['A', 'B', 'C']

In [43]:
v.transform({'C':4, 'D':3})

array([[ 0.,  0.,  4.]])

In [44]:
# CountVectorizer
# 1. 문서를 토큰 리스트로 변환한다.
# 2. 각 문서에서 토큰의 출현 빈도를 센다.
# 3. 각 문서를 BOW 인코딩 벡터로 변환한다.

from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?'
]
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'last': 4,
 'one': 5,
 'second': 6,
 'the': 7,
 'third': 8,
 'this': 9}

In [45]:
vect.transform(['This is the second document.']).toarray()

array([[0, 1, 0, 1, 0, 0, 1, 1, 0, 1]], dtype=int64)

In [46]:
vect.transform(['something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [48]:
vect.transform(corpus).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [49]:
# Stop Words (문서에서 단어장 생성시 무시할 수 있는 단어를 말함)
vect = CountVectorizer(stop_words=["and","is","the","this"]).fit(corpus)
vect.vocabulary_

{'document': 0, 'first': 1, 'last': 2, 'one': 3, 'second': 4, 'third': 5}

In [50]:
vect = CountVectorizer(stop_words="english").fit(corpus)
vect.vocabulary_

{'document': 0, 'second': 1}

In [51]:
# 토큰

vect = CountVectorizer(analyzer="char").fit(corpus)
vect.vocabulary_

{' ': 0,
 '.': 1,
 '?': 2,
 'a': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'h': 8,
 'i': 9,
 'l': 10,
 'm': 11,
 'n': 12,
 'o': 13,
 'r': 14,
 's': 15,
 't': 16,
 'u': 17}

In [52]:
# Custom token pattern: a "t" followed by one or more word characters.
# Written as a raw string so the regex escape "\w" is not parsed as an
# (invalid) string escape sequence.
vect = CountVectorizer(token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'the': 0, 'third': 1, 'this': 2}

In [54]:
import nltk
vect = CountVectorizer(tokenizer=nltk.word_tokenize).fit(corpus)
vect.vocabulary_

{'.': 0,
 '?': 1,
 'and': 2,
 'document': 3,
 'first': 4,
 'is': 5,
 'last': 6,
 'one': 7,
 'second': 8,
 'the': 9,
 'third': 10,
 'this': 11}

In [56]:
# n그램 (단어장 생성에 사용할 토큰의 크기를 결정한다.)
vect = CountVectorizer(ngram_range=(2,2)).fit(corpus)
vect.vocabulary_

{'and the': 0,
 'first document': 1,
 'is the': 2,
 'is this': 3,
 'last document': 4,
 'second document': 5,
 'second second': 6,
 'the first': 7,
 'the last': 8,
 'the second': 9,
 'the third': 10,
 'third one': 11,
 'this is': 12,
 'this the': 13}

In [57]:
# Unigrams and bigrams built from tokens matching "t" + word characters.
# Raw string keeps the regex escape "\w" from being treated as an
# (invalid) string escape sequence.
vect = CountVectorizer(ngram_range=(1, 2), token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'the': 0, 'the third': 1, 'third': 2, 'this': 3, 'this the': 4}

In [58]:
# 빈도수
vect = CountVectorizer(max_df=4, min_df=2).fit(corpus)
vect.vocabulary_, vect.stop_words_

({'document': 0, 'first': 1, 'is': 2, 'this': 3},
 {'and', 'last', 'one', 'second', 'the', 'third'})

In [59]:
vect.transform(corpus).toarray().sum(axis=0)

array([4, 2, 3, 3], dtype=int64)

In [61]:
# TF-IDF(Term Frequency – Inverse Document Frequency) 

from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer().fit(corpus)
tfidv.transform(corpus).toarray()

array([[ 0.        ,  0.38947624,  0.55775063,  0.4629834 ,  0.        ,
         0.        ,  0.        ,  0.32941651,  0.        ,  0.4629834 ],
       [ 0.        ,  0.24151532,  0.        ,  0.28709733,  0.        ,
         0.        ,  0.85737594,  0.20427211,  0.        ,  0.28709733],
       [ 0.55666851,  0.        ,  0.        ,  0.        ,  0.        ,
         0.55666851,  0.        ,  0.26525553,  0.55666851,  0.        ],
       [ 0.        ,  0.38947624,  0.55775063,  0.4629834 ,  0.        ,
         0.        ,  0.        ,  0.32941651,  0.        ,  0.4629834 ],
       [ 0.        ,  0.45333103,  0.        ,  0.        ,  0.80465933,
         0.        ,  0.        ,  0.38342448,  0.        ,  0.        ]])

In [62]:
# Hashing trick
# --해시 함수를 사용하여 단어에 대한 인덱스 번호를 생성하기 때문에 메모리 및 실행 시간을 줄일 수 있다.

from sklearn.datasets import fetch_20newsgroups
twenty = fetch_20newsgroups()
len(twenty.data)
# 주의... 이거 시간좀 걸린다.

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [63]:
%time CountVectorizer().fit(twenty.data).transform(twenty.data);

Wall time: 7.47 s


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [64]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)

In [65]:
%time hv.transform(twenty.data);

Wall time: 3.14 s


<11314x10 sparse matrix of type '<class 'numpy.float64'>'
	with 112863 stored elements in Compressed Sparse Row format>

### Example

In [66]:
import json
import string
from urllib.request import urlopen

from konlpy.tag import Hannanum
from konlpy.utils import pprint

# The captured error for this cell is "No module named 'jpype'": the JPype
# package KoNLPy depends on must be installed for Hannanum() to work.
hannanum = Hannanum()

# Download the notebook and parse it. The response is closed as soon as it
# has been read, and the parsed document gets its own name so the ``json``
# module is not overwritten by the data it produced.
with urlopen("https://www.datascienceschool.net/download-notebook/708e711429a646818b9dcbb581e0c10a/") as f:
    notebook = json.loads(f.read())

# Text of every markdown cell, one string per cell.
cell = ["\n".join(c["source"]) for c in notebook["cells"] if c["cell_type"] == "markdown"]

# Nouns extracted from all markdown text, dropping those whose first
# character is numeric or ASCII punctuation.
docs = [
    w
    for w in hannanum.nouns(" ".join(cell))
    if not w[0].isnumeric() and w[0] not in string.punctuation
]

ModuleNotFoundError: No module named 'jpype'

In [67]:
# numpy is used below but is not imported anywhere else in this notebook.
import numpy as np

# NOTE(review): ``plt`` is not imported anywhere in this notebook either;
# presumably matplotlib.pyplot is supplied by the notebook environment —
# confirm, or add ``import matplotlib.pyplot as plt``.

# Count how often each vocabulary term occurs across all extracted nouns.
vect = CountVectorizer().fit(docs)
count = vect.transform(docs).toarray().sum(axis=0)

# Order terms from most to least frequent.
idx = np.argsort(-count)
count = count[idx]

# vocabulary_ maps each term to its column index, so sorting the terms by
# that index yields the feature names in column order. This avoids
# get_feature_names(), which newer scikit-learn releases no longer provide.
feature_name = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))[idx]

plt.bar(range(len(count)), count)
plt.show()


NameError: name 'docs' is not defined

In [68]:
pprint(list(zip(feature_name, count)))

NameError: name 'feature_name' is not defined