# 1. Preprocessing

1. Load documents

English

In [1]:
from nltk.corpus import reuters
docs_en = [reuters.words(i) for i in reuters.fileids()]

Korean

In [6]:
from konlpy.corpus import kobill
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]

2. Tokenize

English

In [7]:
texts_en = docs_en # because we loaded tokenized documents in step 1
print(texts_en[0])

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]


Korean

In [8]:
from konlpy.tag import Twitter; t = Twitter()
pos = lambda d: ['/'.join(p) for p in t.pos(d, stem=True, norm=True)]
texts_ko = [pos(doc) for doc in docs_ko]
print(texts_ko[0])

['지방공무원법/Noun', '일부/Noun', '개정/Noun', '법률/Noun', '안/Noun', '\n\n/Foreign', '(/Punctuation', '정의화/Noun', '의원/Noun', '대표/Noun', '발의/Noun', ')/Punctuation', '\n\n /Foreign', '의/Noun', '안/Noun', '\n /Foreign', '번/Noun', '호/Noun', '\n\n/Foreign', '9890/Number', '\n\n/Foreign', '발의/Noun', '연월일/Noun', ':/Punctuation', '2010/Number', './Punctuation', '11/Number', './Punctuation', '12/Number', './Punctuation', '발/Noun', '의/Noun', '자/Noun', ':/Punctuation', '정의화/Noun', '․/Foreign', '이명수/Noun', '․/Foreign', '김을동/Noun', '이사철/Noun', '․/Foreign', '여상규/Noun', '․/Foreign', '안규백/Noun', '\n\n/Foreign', '황영철/Noun', '․/Foreign', '박영아/Noun', '․/Foreign', '김정훈/Noun', '\n\n/Foreign', '김학송/Noun', '의원/Noun', '(/Punctuation', '10/Number', '인/Noun', ')/Punctuation', '\n\n/Foreign', '제안/Noun', '이유/Noun', '및/Noun', '주요/Modifier', '내용/Noun', '\n\n  /Foreign', '초등학교/Noun', '저학년/Noun', '의/Josa', '경우/Noun', '에도/Josa', '부모/Noun', '의/Josa', '따뜻하다/Adjective', '사랑/Noun', '과/Josa', '보살피다/Verb', '필요/Noun', '\n\n/Foreign', '

3. Encode tokens to integers

English

In [9]:
from gensim import corpora
dictionary_en = corpora.Dictionary(texts_en)
dictionary_en.save('en.dict')  # save dictionary to file for future use

Korean

In [10]:
from gensim import corpora
dictionary_ko = corpora.Dictionary(texts_ko)
dictionary_ko.save('ko.dict')  # save dictionary to file for future use

4. Calculate TF-IDF

English

In [11]:
from gensim import models
tf_en = [dictionary_en.doc2bow(text) for text in texts_en]
tfidf_model_en = models.TfidfModel(tf_en)
tfidf_en = tfidf_model_en[tf_en]
corpora.MmCorpus.serialize('en.mm', tfidf_en) # save corpus to file for future use

# print first 10 elements of first document's tf-idf vector
print(tfidf_en.corpus[0][:10])
# print top 10 elements of first document's tf-idf vector
print(sorted(tfidf_en.corpus[0], key=lambda x: x[1], reverse=True)[:10])
# print token of most frequent element
print(dictionary_en.get(9))

[(0, 7), (1, 3), (2, 13), (3, 2), (4, 1), (5, 1), (6, 20), (7, 6), (8, 10), (9, 62)]
[(9, 62), (363, 32), (276, 30), (371, 26), (6, 20), (96, 19), (112, 19), (326, 16), (118, 14), (2, 13)]
.


Korean

In [12]:
from gensim import models
tf_ko = [dictionary_ko.doc2bow(text) for text in texts_ko]
tfidf_model_ko = models.TfidfModel(tf_ko)
tfidf_ko = tfidf_model_ko[tf_ko]
corpora.MmCorpus.serialize('ko.mm', tfidf_ko) # save corpus to file for future use

# print first 10 elements of first document's tf-idf vector
print(tfidf_ko.corpus[0][:10])
# print top 10 elements of first document's tf-idf vector
print(sorted(tfidf_ko.corpus[0], key=lambda x: x[1], reverse=True)[:10])
# print token of most frequent element
print(dictionary_ko.get(414))

[(0, 8), (1, 12), (2, 1), (3, 127), (4, 1), (5, 3), (6, 1), (7, 27), (8, 1), (9, 26)]
[(3, 127), (437, 60), (20, 49), (330, 38), (336, 37), (334, 28), (7, 27), (9, 26), (289, 22), (374, 22)]
초등학교/Noun


# 2. Train topic models

1. LDA

English

In [16]:
ntopics, nwords = 3, 5

In [17]:
import numpy as np; np.random.seed(42)  # optional
lda_en = models.ldamodel.LdaModel(tfidf_en, id2word=dictionary_en, num_topics=ntopics)
print(lda_en.print_topics(num_topics=ntopics, num_words=nwords))

[(0, '0.005*"the" + 0.003*"pct" + 0.003*"to" + 0.003*"in" + 0.003*"of"'), (1, '0.005*"cts" + 0.005*"Record" + 0.005*"div" + 0.005*"Pay" + 0.004*"Qtly"'), (2, '0.010*"vs" + 0.006*"000" + 0.006*"mln" + 0.005*"loss" + 0.005*"cts"')]


Korean

In [18]:
import numpy as np; np.random.seed(42)  # optional
lda_ko = models.ldamodel.LdaModel(tfidf_ko, id2word=dictionary_ko, num_topics=ntopics)
print(lda_ko.print_topics(num_topics=ntopics, num_words=nwords))

[(0, '0.003*"육아휴직/Noun" + 0.002*"만/Noun" + 0.001*"×/Foreign" + 0.001*"고용/Noun" + 0.001*"예고/Noun"'), (1, '0.002*"육아휴직/Noun" + 0.001*"손해/Noun" + 0.001*"사업자/Noun" + 0.001*"×/Foreign" + 0.001*"자녀/Noun"'), (2, '0.001*"파견/Noun" + 0.001*"부대/Noun" + 0.001*"결혼/Noun" + 0.001*"소말리아/Noun" + 0.001*"\n\n    /Foreign"')]


# 3. Scoring documents

English

In [20]:
bow = tfidf_model_en[dictionary_en.doc2bow(texts_en[0])]
sorted(lda_en[bow], key=lambda x: x[1], reverse=True)

[(2, 0.8003078), (0, 0.1792411), (1, 0.020451067)]

In [21]:
bow = tfidf_model_en[dictionary_en.doc2bow(texts_en[1])]
sorted(lda_en[bow], key=lambda x: x[1], reverse=True)

[(0, 0.7392981), (2, 0.21646468), (1, 0.04423724)]

Korean

In [22]:
bow = tfidf_model_ko[dictionary_ko.doc2bow(texts_ko[0])]
sorted(lda_ko[bow], key=lambda x: x[1], reverse=True)

[(0, 0.93739796), (1, 0.03285161), (2, 0.02975037)]

In [23]:
bow = tfidf_model_ko[dictionary_ko.doc2bow(texts_ko[1])]
sorted(lda_ko[bow], key=lambda x: x[1], reverse=True)

[(0, 0.93692136), (1, 0.03301964), (2, 0.030058995)]