<a href="https://colab.research.google.com/github/bjh5098/Social-Network-Analysis-and-Text-Mining/blob/master/Textmining_10_Korean_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 한국어 텍스트 마이닝 - 기초

## Colab에서 Mecab-ko 설치: https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [0]:
# Clone the Mecab-ko-for-Google-Colab repository; its install script is run in a later cell.
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [0]:
cd Mecab-ko-for-Google-Colab/

In [0]:
# Run the repository's Colab install script for Mecab-ko (relative to the directory entered above).
! bash install_mecab-ko_on_colab190912.sh

### MeCab 활용

In [0]:
# Parse a sample sentence with the raw MeCab bindings and print the tagger's output.
import MeCab
m = MeCab.Tagger()
sentence = '데이터 과학 이상의 것'
result = m.parse(sentence)
print(result)

In [0]:
# Same sample sentence, this time through KoNLPy's Mecab wrapper.
from konlpy.tag import Mecab
mecab = Mecab()
sentence = '데이터 과학 이상의 것'
result_pos = mecab.pos(sentence)  # part-of-speech tagging
print(result_pos)

In [0]:
# Morpheme-level output of the Mecab wrapper for the same sentence.
result_mor = mecab.morphs(sentence)
print(result_mor)

In [0]:
# Noun-only output of the Mecab wrapper for the same sentence.
result_noun = mecab.nouns(sentence)
print(result_noun)

***Mecab 품사 태그 - https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265***

## KoNLPy & 오픈 소스 한국어 처리기(Open-source Korean Text Processor)

***출처: https://konlpy-ko.readthedocs.io/ko/v0.4.3/api/konlpy.tag/***

In [0]:
pip install konlpy

In [0]:
# Okt is KoNLPy's tagger class for the Open-source Korean Text Processor;
# the `okt` instance created here is reused by the cells below.
from konlpy.tag import Okt
okt = Okt()

In [0]:
# Tokenize `sentence` (assigned in the Mecab cells above) with Okt for comparison.
okt_result = okt.morphs(sentence)
print(okt_result)

### 어간 추출

In [0]:
# Morphemes of an unspaced, informal sentence with stemming turned off (stem=False);
# serves as the baseline for the stem=True cell that follows.
sentence_ko = '괜찮네요오랜만포켓몬스터잼밌어요'
okt_morphs_result = okt.morphs(sentence_ko, stem=False)
print(okt_morphs_result)

In [0]:
# Same sentence with stem=True, to compare against the stem=False output.
sentence_ko = '괜찮네요오랜만포켓몬스터잼밌어요'
okt_morphs_result = okt.morphs(sentence_ko, stem=True)
print(okt_morphs_result)

### 품사 태깅 및 선별

In [0]:
# Part-of-speech tagging of the sample sentence with stemming turned off.
sentence_ko = '괜찮네요오랜만포켓몬스터잼밌어요'
okt_pos_result = okt.pos(sentence_ko, stem=False)
print(okt_pos_result)

In [0]:
# Part-of-speech tagging with stem=True; this `okt_pos_result` is what the
# filtering cell below consumes.
sentence_ko = '괜찮네요오랜만포켓몬스터잼밌어요'
okt_pos_result = okt.pos(sentence_ko, stem=True)
print(okt_pos_result)

In [0]:
# Keep the token text (index 0) of every tagged pair whose tag (index 1)
# is Noun, Verb or Adjective.
clean_words = [
    pair[0]
    for pair in okt_pos_result
    if pair[1] in ('Noun', 'Verb', 'Adjective')
]

In [0]:
# Display the tokens that survived the POS filter.
clean_words

In [0]:
# Join the kept tokens into a single space-separated string.
document = ' '.join(clean_words)

In [0]:
# Print the space-joined string built in the previous cell.
print(document)

### 불용어 처리

한국어 불용어: https://www.ranks.nl/stopwords/korean

In [0]:
# Sample sentence for the stop-word demo: the earlier sentence with two extra
# leading words and trailing punctuation, all of which appear in `stop_words` below.
sentence_ko_n = '나 너 괜찮네요오랜만포켓몬스터잼밌어요.!'

In [0]:
# Hand-written stop-word list for this demo (two words plus two punctuation marks).
stop_words = ['나', '너', '.', '!']

In [0]:
# POS-tag the stop-word sample sentence without stemming.
okt_pos_result_n = okt.pos(sentence_ko_n, stem=False)
print(okt_pos_result_n)

In [0]:
# Keep the token text (index 0) of every tagged pair whose tag (index 1)
# is Noun, Verb or Adjective.
clean_words = [
    pair[0]
    for pair in okt_pos_result_n
    if pair[1] in ('Noun', 'Verb', 'Adjective')
]

In [0]:
# Display the POS-filtered tokens before stop words are removed.
clean_words

In [0]:
# Drop every token listed in stop_words. `document` is a list of tokens here,
# not the joined string that the earlier cell stored under the same name.
document = [token for token in clean_words if token not in stop_words]

In [0]:
# Display the token list left after stop-word removal.
document

### 문서 특징 추출하기

#### 데이터 불러오기

In [0]:
import pandas as pd

In [0]:
# Load the tab-separated, cp949-encoded review file.
# NOTE(review): the absolute /content path assumes the file was already uploaded
# to the Colab session — confirm before running elsewhere.
naver_movies = pd.read_csv('/content/naver_movie_reviews_sub.txt', sep='\t', encoding='cp949', low_memory=False)

In [0]:
# Preview the first five rows.
naver_movies.head(5)

In [0]:
# Column dtypes and non-null counts.
naver_movies.info()

In [0]:
# Summary statistics of the frame.
naver_movies.describe()

#### 리뷰 필드 선택

In [0]:
# Pull the 'document' column (the review text) out as a plain Python list.
reviews = naver_movies['document'].values.tolist()

In [0]:
# Preview the first few reviews instead of dumping the whole list into the cell output.
reviews[:5]

In [0]:
# Reduce every review to its nouns and re-join them with spaces, giving one
# string per review for the vectorizers below.
# A single Okt instance serves all reviews: building the tagger does not depend
# on the review, so it is created once rather than once per loop iteration.
okt = Okt()
new_doc = [' '.join(okt.nouns(review)) for review in reviews]

In [0]:
# Preview the first few noun-only documents instead of dumping the whole list.
new_doc[:5]

In [0]:
# Display new_doc (the preceding cell displays the same variable).
new_doc

#### Document-Term Matrix - Term Frequency

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# Learn the vocabulary from the noun-only documents with default CountVectorizer settings.
vectorizer = CountVectorizer()
vectorizer.fit(new_doc)

In [0]:
# Turn the documents into the term-frequency document-term matrix.
dtm = vectorizer.transform(new_doc)

In [0]:
# Matrix shape: (number of documents, number of features).
print(dtm.shape)

In [0]:
# Same shape, printed with the document and feature counts labelled.
print('fit_transform, (Document {}, feature {})'.format(dtm.shape[0], dtm.shape[1]))

In [0]:
# Show the matrix converted with toarray().
dtm.toarray()

In [0]:
# Show the matrix converted with todense(), for comparison with toarray() above.
dtm.todense()

In [0]:
# Vocabulary in column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so .tolist() keeps `terms` a plain
# list as before.
terms = vectorizer.get_feature_names_out().tolist()

In [0]:
# Display the feature names taken from the vectorizer.
terms

#### Document-Term Matrix - Tf-Idf

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Rebinds `vectorizer`: from here on the name refers to this TfidfVectorizer,
# not the CountVectorizer created earlier.
vectorizer = TfidfVectorizer(use_idf=True)

In [0]:
# Fit the TF-IDF vectorizer on the noun-only documents and build the weighted matrix in one step.
dtm_tfidf = vectorizer.fit_transform(new_doc)

In [0]:
# Matrix shape: (number of documents, number of features).
print(dtm_tfidf.shape)

In [0]:
# Same shape, printed with the document and feature counts labelled.
print('fit_transform, (Document {}, feature {})'.format(dtm_tfidf.shape[0], dtm_tfidf.shape[1]))

In [0]:
# Print the TF-IDF matrix object itself (not a dense conversion).
print(dtm_tfidf)

In [0]:
# Row at index 2 of the converted matrix, i.e. the third document's weights.
dtm_tfidf.toarray()[2]

In [0]:
# Row at index 2 of the converted matrix (the preceding cell evaluates the same expression).
dtm_tfidf.toarray()[2]

In [0]:
# Vocabulary in column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so .tolist() keeps `terms` a plain
# list as before.
terms = vectorizer.get_feature_names_out().tolist()

In [0]:
# Display the feature names taken from the vectorizer.
terms

### 문서 군집화

In [0]:
# Pull the 'label' column out as a plain Python list, in the same row order as `reviews`.
labels = naver_movies['label'].values.tolist()

In [0]:
# Use the number of distinct label values as the cluster count for k-means.
import numpy as np
true_k = len(np.unique(labels))
true_k

In [0]:
from sklearn.cluster import KMeans

In [0]:
# Cluster the term-frequency matrix into true_k groups.
# With n_init=1 there is a single k-means++ initialisation, so the result
# depends on the random draw; random_state pins it so that re-running the
# notebook reproduces the same clusters.
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
km.fit(dtm)

In [0]:
# Number of clusters the model was configured with.
km.n_clusters

In [0]:
# For each cluster, feature indices sorted by centroid weight, heaviest first.
# argsort() sorts ascending, so the columns are reversed; without the reversal
# the `[:10]` slice taken below would list the ten lowest-weight terms of each
# cluster instead of the ten most characteristic ones.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [0]:
# Feature names used to label the centroid columns.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps `terms` a plain list as before.
# NOTE(review): `vectorizer` is the TfidfVectorizer at this point, while `km`
# was fit on the count matrix `dtm`. Both vectorizers were fit on new_doc, so
# the column order presumably matches — confirm if either one's options change.
terms = vectorizer.get_feature_names_out().tolist()

In [0]:
# For every cluster, print its label followed by the terms found at the first
# ten positions of that cluster's row in order_centroids, all on one line.
for cluster_id in range(true_k):
    listed_terms = ''.join(' %s' % terms[idx] for idx in order_centroids[cluster_id, :10])
    print("Cluster %d:" % cluster_id + listed_terms)

### 문서 분류

In [0]:
# Fit a Gaussian naive Bayes classifier on the term-frequency matrix;
# toarray() converts the matrix before it is passed to fit().
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
naive_model = gnb.fit(dtm.toarray(), labels)

In [0]:
# Predict a label for every document in the term-frequency matrix.
# The original referenced `X`, which is not defined anywhere in the visible
# notebook and raised NameError. `dtm` is the matrix the model was fit on, and
# its rows line up with `labels`, which the evaluation cells compare against.
# Because this is the training data, the scores below are training-set scores.
naive_y_hat = naive_model.predict(dtm.toarray())

In [0]:
# Display the predicted labels.
naive_y_hat

#### 성능평가

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [0]:
# Accuracy of the naive Bayes predictions against the true labels.
print("나이브 베이즈 정확도:", accuracy_score(labels, naive_y_hat))  # compare predicted values with actual values

In [0]:
# Per-class precision/recall report. target_names is paired with labels=[1, 0]
# by position, so label 1 is reported as '긍정' and label 0 as '부정'.
print(classification_report(labels, naive_y_hat, labels=[1, 0], target_names=['긍정','부정']))

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
# Confusion matrix with rows and columns ordered as [1, 0], matching the report above.
confusion_matrix(labels, naive_y_hat, labels=[1,0])