# LDA (Latent Dirichlet Allocation)
주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지에 대한 확률 모형

## Latent Dirichlet Allocation (LDA) with Python ... review
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [2]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# --- sample corpus ---------------------------------------------------------
# Five short English sentences used as toy documents.
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# Collect the documents in a fixed order (a..e).
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# --- preprocessing tools ---------------------------------------------------
# Tokenizer that keeps runs of word characters (punctuation is dropped).
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list.
en_stop = get_stop_words('en')

# Porter stemmer instance used to reduce tokens to their stems.
p_stemmer = PorterStemmer()

# Filled by the preprocessing loop: one list of stemmed tokens per document.
texts = []


In [3]:
# Preprocess every document: lowercase -> tokenize -> drop stop words -> stem.
# The stop words are put in a set once so each membership test is O(1)
# instead of a scan over the whole list for every token.
en_stop_set = set(en_stop)

for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in en_stop_set]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add this document's stemmed tokens to the corpus-wide list
    texts.append(stemmed_tokens)

In [4]:
# Show the stemmed, stop-word-free tokens of the first document.
print(texts[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [5]:
# Show the stemmed, stop-word-free tokens of the second document.
print(texts[1])

['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic']


In [67]:
# Build the id <-> term dictionary from the tokenized documents; every
# distinct stemmed token gets an integer id.
dictionary = corpora.Dictionary(texts)

corpora.Dictionary를 하면 어떻게 될까?

결과가 궁금하잖아요?

### corpora.dictionary – Construct word<->id mappings
https://radimrehurek.com/gensim/corpora/dictionary.html

In [90]:
# Display every (term_id, term) pair held by the dictionary.
# list() copies the items directly; the identity comprehension was redundant.
list(dictionary.items())

[(5, 'mother'),
 (0, 'like'),
 (3, 'eat'),
 (26, 'feel'),
 (11, 'practic'),
 (19, 'blood'),
 (12, 'spend'),
 (16, 'expert'),
 (30, 'say'),
 (14, 'may'),
 (8, 'time'),
 (24, 'often'),
 (4, 'good'),
 (22, 'perform'),
 (31, 'profession'),
 (18, 'health'),
 (13, 'pressur'),
 (1, 'brother'),
 (2, 'brocolli'),
 (10, 'lot'),
 (27, 'well'),
 (21, 'increas'),
 (6, 'drive'),
 (23, 'school'),
 (7, 'basebal'),
 (15, 'suggest'),
 (28, 'better'),
 (9, 'around'),
 (17, 'tension'),
 (25, 'seem'),
 (29, 'never'),
 (20, 'caus')]

In [10]:
# Convert each tokenized document into its bag-of-words form: a list of
# (term_id, count) pairs. Together these rows form the document-term matrix.
corpus = [dictionary.doc2bow(text) for text in texts]

In [92]:
# Show the bag-of-words (term_id, count) pairs of the first document.
print(corpus[0])

[(0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 1)]


In [12]:
# Print the first document's tokens again to compare against its bag-of-words row.
print(texts[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


이런 느낌이겠네요

(like - 1), (brother - 1), (brocolli - 2), ...

In [107]:
# Fit a two-topic LDA model on the bag-of-words corpus, making 20 passes
# over the corpus during training.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=2,
    passes=20,
)

In [108]:
# Print the three highest-weighted words of every topic.
# The model was trained with two topics, so ask for exactly the number of
# topics it has instead of a hard-coded 3 that exceeds it.
print(ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=3))

[(0, '0.058*"brocolli" + 0.058*"good" + 0.057*"mother"'), (1, '0.074*"health" + 0.045*"pressur" + 0.044*"drive"')]


### 참고로 ldamodel을 새로 학습할 때마다 결과가 달라요... (`LdaModel(..., random_state=정수)`로 시드를 고정하면 결과를 재현할 수 있어요)

그래서 이 밑에도 매번 달라요...

### models.ldamodel – Latent Dirichlet Allocation
https://radimrehurek.com/gensim/models/ldamodel.html

In [57]:
# Query the topics associated with term id 0 (id 0 maps to 'like' in the
# dictionary listing shown earlier); the result is (topic_id, value) pairs.
ldamodel.get_term_topics(0)

[(0, 0.019716285673529226)]

In [58]:
# Same query using the term string instead of its id; the recorded output
# matches the id-based lookup for id 0.
ldamodel.get_term_topics('like')

[(0, 0.019716285673529226)]

In [93]:
# Query the topics associated with the term 'good'.
ldamodel.get_term_topics('good')

[(0, 0.057498119807046166)]

In [80]:
# Same query by id: id 4 maps to 'good' in the dictionary listing shown
# earlier, and the recorded output matches the string-based lookup.
ldamodel.get_term_topics(4)

[(0, 0.057498119807046166)]

### 저는 get_term_topics에서 나오는 결과와 print_topics로 나오는 결과가 같을 줄 알았어요.

In [81]:
# List the top terms of topic 0 as (term_id, probability) pairs; with no
# count given, the recorded output holds 10 entries.
ldamodel.get_topic_terms(0)

[(4, 0.066167506523890821),
 (2, 0.066166423202059232),
 (18, 0.066160493327345027),
 (13, 0.047241145589364181),
 (3, 0.047231029040298987),
 (6, 0.046898627821264922),
 (5, 0.046873646654270983),
 (1, 0.046869765381129726),
 (23, 0.02831345916223307),
 (25, 0.028313237617031993)]

In [29]:
# Number of distinct terms in the dictionary (the vocabulary size).
# Dictionary supports len() directly, so no items view is needed.
len(dictionary)

32

In [85]:
# List every term of topic 0 as (term_id, probability) pairs.
# The count is taken from the dictionary size instead of a hard-coded 32,
# so the cell still covers the whole vocabulary if the documents change.
ldamodel.get_topic_terms(0, topn=len(dictionary))

[(4, 0.066167506523890821),
 (2, 0.066166423202059232),
 (18, 0.066160493327345027),
 (13, 0.047241145589364181),
 (3, 0.047231029040298987),
 (6, 0.046898627821264922),
 (5, 0.046873646654270983),
 (1, 0.046869765381129726),
 (23, 0.02831345916223307),
 (25, 0.028313237617031993),
 (28, 0.028313233175789976),
 (27, 0.0283131807788474),
 (26, 0.028313168743974312),
 (24, 0.02831262685919838),
 (22, 0.028312387000101014),
 (29, 0.028311956167798611),
 (14, 0.028297799292713754),
 (17, 0.028296543968432359),
 (16, 0.028295786511430283),
 (19, 0.028295008377450519),
 (0, 0.028294632195803134),
 (21, 0.028293595394571797),
 (20, 0.028292767304627191),
 (15, 0.028292763325622972),
 (30, 0.028267903678537074),
 (31, 0.028267707278266666),
 (9, 0.0094991239283826953),
 (7, 0.0094989906818570731),
 (8, 0.0094989372184822052),
 (12, 0.0094988804743121741),
 (11, 0.0094988736788426745),
 (10, 0.0094987996460688204)]

#### get_topic_terms로 나온 단어의 수치는 모두 더하면 1. 보정(정규화)이 들어간 게 아닐까 짐작...

In [86]:
# Render topic 0 as a readable 'weight*"word" + ...' string.
ldamodel.print_topic(0)

'0.066*"good" + 0.066*"brocolli" + 0.066*"health" + 0.047*"pressur" + 0.047*"eat" + 0.047*"drive" + 0.047*"mother" + 0.047*"brother" + 0.028*"school" + 0.028*"seem"'

In [87]:
# Topic distribution of the first document as (topic_id, probability) pairs.
ldamodel[corpus[0]]

[(0, 0.94317242265993106), (1, 0.056827577340068915)]

In [89]:
# Topic distribution of every document in the corpus, one list per document.
[ldamodel[c] for c in corpus]

[[(0, 0.94317058299760514), (1, 0.05682941700239489)],
 [(0, 0.05624642575151062), (1, 0.94375357424848938)],
 [(0, 0.94867615728035593), (1, 0.051323842719644054)],
 [(0, 0.95321989050939382), (1, 0.046780109490606153)],
 [(0, 0.92474622253430605), (1, 0.075253777465693991)]]

#### 얘네들도 다 더하면 1...

무튼

임의의 새로운 문장으로 테스트를 해보고 싶다...

In [109]:
# Infer the topic mixture of an unseen sentence. It goes through the same
# preprocessing as the training documents: lowercase, tokenize, drop stop
# words, stem.
test = 'i like eat'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [token for token in tokens if token not in en_stop]
stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
# Convert to bag-of-words with the training dictionary, then query the model.
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.82634397257500436), (1, 0.17365602742499564)]

In [110]:
# Infer the topic mixture of a sentence that shares no words with the
# training documents, using the same preprocessing: lowercase, tokenize,
# drop stop words, stem.
test = 'the quick brown fox jumps over the lazy dog choker'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [token for token in tokens if token not in en_stop]
stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
# None of these stems are in the training dictionary, so the bag-of-words is
# empty and the recorded output is an even split across the two topics.
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.5), (1, 0.5)]

In [112]:
# Show the bag-of-words of the last test sentence; the recorded output is an
# empty list because none of its stems are in the dictionary.
dictionary.doc2bow(stemmed_tokens)

[]