# 데이터 불러오기

In [1]:
import numpy

In [2]:
# The .npy file holds a pickled sparse matrix wrapped in a 0-d object array,
# so unpickling has to be enabled explicitly: numpy.load defaults to
# allow_pickle=False (since NumPy 1.16.3) and would raise ValueError here.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
tdm = numpy.load('tdm_small.npy', allow_pickle=True)

In [3]:
# Unwrap the single object held by the 0-d array that numpy.load returned.
tdm = tdm.item()

In [4]:
# Display the matrix summary (shape, dtype, stored elements, storage format).
tdm

<5347x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 652672 stored elements in Compressed Sparse Column format>

In [5]:
# Read the whole vocabulary file, then split it into one entry per line;
# the position of each entry in the list is used as its term id.
with open('words_small.txt', encoding='utf8') as vocab_file:
    vocab_text = vocab_file.read()
words = vocab_text.splitlines()

# gensim 포맷으로 바꾸기

In [6]:
from gensim.matutils import Sparse2Corpus

In [7]:
# tdm stores one document per row, while Sparse2Corpus reads one document per
# column (documents_columns=True is its default), hence the transpose.
corpus = Sparse2Corpus(tdm.transpose(), documents_columns=True)

In [8]:
# Display the corpus wrapper object (only its repr; the documents are not listed).
corpus

<gensim.matutils.Sparse2Corpus at 0x140f87788d0>

# Latent Dirichlet Allocation

[LdaModel 문서](https://radimrehurek.com/gensim/models/ldamodel.html)

In [9]:
from gensim.models.ldamodel import LdaModel

In [10]:
# Map term ids (column indices of the matrix) to vocabulary strings so that
# topics are reported as words instead of integer ids.
id2word = dict(enumerate(words))

# Train a 100-topic model. The seed is fixed because LDA training is randomly
# initialised: without it every run numbers the topics differently and the
# results shown in this notebook cannot be reproduced.
lda = LdaModel(corpus=corpus, num_topics=100, id2word=id2word, random_state=0)



# Topic 보기

In [11]:
# List the ten highest-weighted (word, probability) pairs of topic 0.
lda.show_topic(0, topn=10)

[('스마트폰', 0.042452875495726509),
 ('기능', 0.032534346829516544),
 ('카메라', 0.028653800755759858),
 ('배터리', 0.020069203905506705),
 ('아이폰', 0.019579249382138866),
 ('촬영', 0.018935981378525327),
 ('탑재', 0.018715866072837293),
 ('충전', 0.016290749972750582),
 ('화면', 0.015666593168371452),
 ('출시', 0.014099861505872885)]

# 문서의 topic 확인

## 문서 변환

In [12]:
# Bag-of-words for document 0 as (term_id, count) pairs.
# The row is densified once instead of doing one sparse lookup per term, the
# vocabulary size comes from the data rather than a hard-coded 1000, and
# zero-count terms are dropped (a bag-of-words lists only the terms that occur).
first_doc_counts = tdm[0].toarray().ravel()
doc = [(term_id, count) for term_id, count in enumerate(first_doc_counts) if count]

## 문서에서 가장 많이 쓰인 단어들 보기

In [13]:
from operator import itemgetter

In [14]:
# Replace each term id with its vocabulary string, keeping the count.
doc_words = [(words[term_id], count) for term_id, count in doc]

In [15]:
# Order the (word, count) pairs by count, highest first, and show the top ten.
words_by_count = sorted(doc_words, key=itemgetter(1), reverse=True)
words_by_count[:10]

[('게임', 37),
 ('이벤트', 31),
 ('아이템', 26),
 ('진행', 16),
 ('제공', 13),
 ('접속', 9),
 ('최대', 8),
 ('동안', 8),
 ('지급', 8),
 ('이상', 7)]

## 문서의 topic

In [16]:
# Infer the topic mixture of the document as (topic_id, probability) pairs.
# NOTE(review): only a few of the 100 topics appear in the recorded output --
# presumably low-probability topics are filtered out; confirm in the gensim docs.
lda.get_document_topics(doc)

[(5, 0.068525550876230412),
 (64, 0.018509048296726751),
 (81, 0.080450911280735352),
 (97, 0.82655401188529598)]

In [19]:
# Show the document's most probable topic. The id is looked up rather than
# hard-coded because topic numbering changes whenever the model is retrained.
dominant_topic_id, _ = max(lda.get_document_topics(doc), key=itemgetter(1))
lda.show_topic(dominant_topic_id)

[('이벤트', 0.058251655862943612),
 ('진행', 0.029977027851854299),
 ('아이템', 0.024603099846347086),
 ('이용자', 0.018033304024576691),
 ('홈페이지', 0.017683767204613808),
 ('증정', 0.016989031303940368),
 ('레벨', 0.013084670021021368),
 ('게임', 0.012549454502525155),
 ('이번', 0.012209486740002965),
 ('신규', 0.011969554498051534)]

In [20]:
# Show the document's second most probable topic. The id is looked up rather
# than hard-coded because topic numbering changes whenever the model is
# retrained; if only one topic is reported, that one is shown instead.
ranked_topics = sorted(lda.get_document_topics(doc), key=itemgetter(1), reverse=True)
runner_up_id = ranked_topics[min(1, len(ranked_topics) - 1)][0]
lda.show_topic(runner_up_id)

[('게임', 0.12330254514489764),
 ('순위', 0.023298899253070285),
 ('무료', 0.017076709820247202),
 ('캐릭터', 0.016855596799610541),
 ('출시', 0.015186933976921214),
 ('인기', 0.01317232086962767),
 ('부문', 0.012740983173297752),
 ('유저', 0.012000747237174815),
 ('차지', 0.011345821678336372),
 ('스토리', 0.011212868544093632)]

# LDA 모델 저장하기

In [21]:
# Write the trained model to disk so it can be reloaded later without retraining.
lda.save('20160507.lda')

# LDA 모델 불러오기

In [22]:
# Restore the model from the file written by lda.save, replacing the in-memory one.
# NOTE(review): gensim model files are presumably pickle-based -- confirm in the
# gensim docs and only load files from a trusted source.
lda = LdaModel.load('20160507.lda')

In [23]:
# Sanity check: topic 0 of the reloaded model should match the one shown before saving.
lda.show_topic(0, topn=10)

[('스마트폰', 0.042452875495726509),
 ('기능', 0.032534346829516544),
 ('카메라', 0.028653800755759858),
 ('배터리', 0.020069203905506705),
 ('아이폰', 0.019579249382138866),
 ('촬영', 0.018935981378525327),
 ('탑재', 0.018715866072837293),
 ('충전', 0.016290749972750582),
 ('화면', 0.015666593168371452),
 ('출시', 0.014099861505872885)]