In [1]:
from tqdm import tqdm_notebook
from konlpy.tag import Mecab
import string
import warnings
from gensim import corpora
from gensim import models

import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
def read_document(input_file_name):
    '''Load a pickled collection of pages and flatten it into one list.

    The pickle is expected to hold an iterable of pages, each page itself
    being an iterable of items; the items of every page are concatenated in
    order into a single list.

    NOTE: pickle.load can execute arbitrary code while loading, so only use
    this on files from a trusted source.
    '''
    with open(input_file_name, 'rb') as f:
        pages = pickle.load(f)

    # Flatten one level: page order and within-page order are preserved.
    return [item for page in pages for item in page]

def text_cleaning(docs):
    '''Strip every character that is not Hangul or a space from each document.

    Returns a new list with one cleaned string per input document, in the
    same order as the input.
    '''
    # Anything outside Hangul jamo / syllables and the space character is dropped.
    non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")

    return [non_korean.sub("", text) for text in docs]

def define_stopwords(path):
    '''Build the stopword set used to filter tokens.

    The set contains every ASCII punctuation character plus one stopword per
    non-empty line of the file at ``path``.

    Args:
        path: path to a text file with one stopword per line
            (assumed to be UTF-8 encoded).

    Returns:
        set of stopword strings.
    '''
    SW = set(string.punctuation)

    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # list holds Korean text.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Iterating a file yields lines with their trailing newline; without
            # stripping it, no stopword would ever equal a token.
            word = line.strip()
            if word:
                SW.add(word)

    return SW

def text_tokenizing(corpus, tokenizer, stopwords=None):
    '''Tokenize every document, dropping stopwords and single-character tokens.

    Args:
        corpus: sequence of document strings.
        tokenizer: tokenization mode -
            'noun'  : nouns extracted by Mecab,
            'morph' : morphemes extracted by Mecab,
            'word'  : whitespace-separated words.
        stopwords: optional collection of tokens to discard. When None, the
            notebook-level ``SW`` set is used, as in the original version.

    Returns:
        list of token lists, one per document, in input order.

    Raises:
        ValueError: if ``tokenizer`` is not one of the supported modes.
    '''
    if stopwords is None:
        stopwords = SW

    # Choose the tokenizing callable once; Mecab is only constructed when a
    # Mecab-based mode is actually requested.
    if tokenizer == 'noun':
        tokenize = Mecab().nouns
    elif tokenizer == 'morph':
        tokenize = Mecab().morphs
    elif tokenizer == 'word':
        tokenize = str.split
    else:
        raise ValueError(
            "tokenizer must be 'noun', 'morph' or 'word', got {!r}".format(tokenizer)
        )

    token_corpus = list()

    for doc in tqdm_notebook(corpus, desc='Preprocessing'):
        token_text = [
            word for word in tokenize(doc)
            if word not in stopwords and len(word) > 1
        ]
        token_corpus.append(token_text)

    return token_corpus

# Pickled news content to load.
input_file_name = '/naver_news_content.pk'
# The pickle loader defined in this cell is read_document (singular);
# read_documents is not defined at this point on a fresh kernel.
documents = read_document(input_file_name)
# SW is read as a global by text_tokenizing, so it must exist before tokenizing.
SW = define_stopwords('/stopwords-ko.txt')
cleaned_text = text_cleaning(documents)
tokenized_text = text_tokenizing(cleaned_text, tokenizer = 'noun')

In [None]:
# Preview only the first few tokenized documents instead of dumping the whole corpus.
tokenized_text[:5]

#### 토픽 모델링에 사용할 함수들 확인하기

In [None]:
# Build the document-term matrix
# Learn the vocabulary from the tokenized documents
dictionary = corpora.Dictionary(tokenized_text)

# Create the document-term matrix: one bag-of-words entry per document
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

In [None]:
# Inspect the Dictionary built from the tokenized documents
print(dictionary)

In [None]:
# Inspect the corpus: first five entries of the first document's bag-of-words
corpus[0][:5]

In [None]:
# Create the TF-IDF document-term matrix:
# fit a TfidfModel on the bag-of-words corpus, then apply it to the same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Show the first five entries of the first transformed document
print(corpus_tfidf[0][:5])

In [None]:
# Build the LDA model.
# LDA training is stochastic; a fixed random_state makes the topics reproducible
# across kernel restarts.
model = models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, random_state=42)

In [None]:
# Inspect the LDA result: the 10 highest-weighted words of topic 0
model.show_topic(topicid=0, topn=10)

#### 토픽 모델링을 추가하여 코드 완성하기

In [None]:
# Variables that set the number of topics and the number of keywords shown per topic
NUM_TOPICS = 3
NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    '''Build the bag-of-words corpus and its gensim Dictionary.

    Args:
        documents: tokenized documents (each document is a sequence of tokens).

    Returns:
        (corpus, dictionary): the per-document doc2bow results and the
        Dictionary that produced them.
    '''
    vocab = corpora.Dictionary(documents)
    bow_corpus = list(map(vocab.doc2bow, documents))

    return bow_corpus, vocab

def print_topic_word(model):
    '''Print the top NUM_TOPIC_WORDS words and their weights for every topic.

    For each topic id in range(model.num_topics) a header line is printed,
    followed by one tab-indented "word<TAB>weight" line per entry returned by
    model.show_topic, and then a blank separator. Returns None.
    '''
    header_template = 'Topic ID: {}'
    row_template = '\t{}\t{}'

    for idx in range(model.num_topics):
        top_words = model.show_topic(idx, NUM_TOPIC_WORDS)
        print(header_template.format(idx))

        for term, weight in top_words:
            print(row_template.format(term, weight))

        # Visual separator between topics.
        print('\n')

# Build the document-term matrix and its dictionary
corpus, dictionary = build_doc_term_mat(tokenized_text)
# Run LDA; alpha/eta are learned from the data ('auto').
# A fixed random_state keeps the resulting topics reproducible across re-runs.
model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha = 'auto', eta='auto', random_state=42)
# Print the result
print_topic_word(model)

#### pyLDAvis를 통한 토픽 모델링 결과 시각화하기

In [2]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
Collecting joblib>=0.8.4
  Downloading https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py): started
  Building wheel for pyLDAvis (setup.py): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97717 sha256=019661e0c0478a57fc80ec7f1217e11e306988aeccfecc22370d08d6fd8fb920
  Stored in directory: C:\Users\민아영\AppData\Local\pip\Cache\wheels\98\71\24\513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building whee

In [None]:
# Import pyLDAvis
# NOTE(review): the `pyLDAvis.gensim` module name matches the 2.1.2 release shown in the
# install log; newer releases appear to expose it as `pyLDAvis.gensim_models` - confirm
# against the installed version.
import pyLDAvis
import pyLDAvis.gensim

# Enable pyLDAvis rendering inside the Jupyter notebook
pyLDAvis.enable_notebook()

# Run pyLDAvis on the fitted model, its corpus and dictionary; the last expression displays it
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data

#### NSMC 불러와서 정제하기

In [None]:
def read_documents(filename):
    '''Read a UTF-8 tab-separated text file into a list of rows.

    Every line of the file becomes one list of its tab-separated fields.
    All lines are returned, in file order.
    '''
    with open(filename, encoding = 'utf-8') as f:
        raw_text = f.read()

    rows = []
    for line in raw_text.splitlines():
        rows.append(line.split('\t'))

    return rows

# Load the train and test rating files; each result is a list of tab-split rows.
train_docs = read_documents('ratings_train.txt')
test_docs  = read_documents('ratings_test.txt')

In [None]:
# Number of rows read from the train and test files
print(len(train_docs))
print(len(test_docs))

In [None]:
def text_cleaning(docs):
    '''Remove every character that is not Hangul or a space from each document.

    NOTE: this redefines the text_cleaning function from an earlier cell with
    the same behaviour.

    Returns a new list holding one cleaned string per input document.
    '''
    return [re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", text) for text in docs]

def define_stopwords(path):
    '''Build the stopword set used to filter tokens.

    NOTE: this redefines the define_stopwords function from an earlier cell.

    The set contains every ASCII punctuation character plus one stopword per
    non-empty line of the file at ``path``.

    Args:
        path: path to a text file with one stopword per line
            (assumed to be UTF-8 encoded).

    Returns:
        set of stopword strings.
    '''
    SW = set(string.punctuation)

    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # list holds Korean text.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Lines read from a file keep their trailing newline; strip it so the
            # stored stopword can actually match a token.
            word = line.strip()
            if word:
                SW.add(word)

    return SW