# LDA (Latent Dirichlet Allocation)
주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지에 대한 확률 모형

## Latent Dirichlet Allocation (LDA) with Python ... review
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [45]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []


In [46]:
# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [47]:
print(doc_a)
print(texts[0])

Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [48]:
print(doc_b)
print(texts[1])

My mother spends a lot of time driving my brother around to baseball practice.
['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic']


In [49]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

corpora.Dictionary를 하면 어떻게 될까?

결과가 궁금하잖아요?

### corpora.dictionary – Construct word<->id mappings
https://radimrehurek.com/gensim/corpora/dictionary.html

In [50]:
[d for d in dictionary.items()]

[(6, 'lot'),
 (13, 'suggest'),
 (10, 'practic'),
 (0, 'brother'),
 (21, 'blood'),
 (15, 'tension'),
 (4, 'eat'),
 (11, 'basebal'),
 (30, 'say'),
 (22, 'seem'),
 (8, 'drive'),
 (25, 'well'),
 (2, 'like'),
 (24, 'often'),
 (17, 'increas'),
 (23, 'feel'),
 (16, 'health'),
 (9, 'around'),
 (19, 'pressur'),
 (28, 'school'),
 (12, 'spend'),
 (20, 'expert'),
 (7, 'time'),
 (18, 'may'),
 (14, 'caus'),
 (29, 'perform'),
 (1, 'brocolli'),
 (5, 'mother'),
 (27, 'never'),
 (26, 'better'),
 (3, 'good'),
 (31, 'profession')]

In [51]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [52]:
print(corpus[0])

[(0, 1), (1, 2), (2, 1), (3, 2), (4, 2), (5, 1)]


In [53]:
print(texts[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


이런 느낌이겠네요

(brother - 1), (eat - 2), (like - 1), ...

In [54]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [55]:
ldamodel.show_topics()

[(0,
  '0.086*"brocolli" + 0.086*"good" + 0.085*"health" + 0.061*"eat" + 0.037*"caus" + 0.037*"blood" + 0.037*"expert" + 0.037*"suggest" + 0.037*"increas" + 0.037*"tension"'),
 (1,
  '0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur" + 0.040*"seem" + 0.040*"school" + 0.040*"never" + 0.040*"better" + 0.040*"feel" + 0.040*"well"')]

In [56]:
ldamodel.get_term_topics('drive')

[(0, 0.025045144791897074), (1, 0.055808155820715681)]

In [57]:
ldamodel.get_topic_terms(0)

[(1, 0.085579683223977285),
 (3, 0.085574378399249335),
 (16, 0.08509070446042967),
 (4, 0.061190311662440787),
 (14, 0.036692281986397228),
 (21, 0.036689721445841358),
 (20, 0.03668840960026757),
 (13, 0.036686753140169795),
 (17, 0.036685157780891556),
 (15, 0.036684294058133428)]

In [58]:
len(dictionary.items())

32

In [59]:
ldamodel.get_topic_terms(0, 32)

[(1, 0.085579683223977285),
 (3, 0.085574378399249335),
 (16, 0.08509070446042967),
 (4, 0.061190311662440787),
 (14, 0.036692281986397228),
 (21, 0.036689721445841358),
 (20, 0.03668840960026757),
 (13, 0.036686753140169795),
 (17, 0.036685157780891556),
 (15, 0.036684294058133428),
 (18, 0.036682845501099091),
 (2, 0.036677474204594378),
 (19, 0.036544783158643693),
 (31, 0.036221908494992391),
 (30, 0.03621331528634688),
 (8, 0.036063797184707098),
 (0, 0.035824499545712214),
 (5, 0.03580616669728437),
 (7, 0.012320027979602062),
 (10, 0.012319447243565613),
 (6, 0.012319323989188662),
 (12, 0.0123189031015627),
 (11, 0.012318754934455238),
 (9, 0.01231866701230932),
 (24, 0.012313344409831038),
 (29, 0.012311102140010105),
 (25, 0.01231102875054267),
 (23, 0.01231091688037461),
 (26, 0.012310811339833123),
 (27, 0.012310705680216425),
 (28, 0.012310407817150417),
 (22, 0.01231007289017986)]

#### get_topic_terms로 나온 단어의 수치는 모두 더하면 1. 보정이 들어간걸 아닐까 짐작...

In [60]:
ldamodel.print_topic(0)

'0.086*"brocolli" + 0.086*"good" + 0.085*"health" + 0.061*"eat" + 0.037*"caus" + 0.037*"blood" + 0.037*"expert" + 0.037*"suggest" + 0.037*"increas" + 0.037*"tension"'

In [61]:
corpus[0]

[(0, 1), (1, 2), (2, 1), (3, 2), (4, 2), (5, 1)]

In [62]:
texts[0]

['brocolli',
 'good',
 'eat',
 'brother',
 'like',
 'eat',
 'good',
 'brocolli',
 'mother']

In [63]:
doc_a

'Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.'

In [64]:
ldamodel[corpus[0]]

[(0, 0.93725599057037667), (1, 0.062744009429623299)]

In [65]:
print(corpus[0])
print(texts[0])

[(0, 1), (1, 2), (2, 1), (3, 2), (4, 2), (5, 1)]
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [66]:
[ldamodel[c] for c in corpus]

[[(0, 0.93725173274028184), (1, 0.062748267259718132)],
 [(0, 0.053889343446389182), (1, 0.94611065655361082)],
 [(0, 0.94629628296338597), (1, 0.053703717036614036)],
 [(0, 0.0420481760448752), (1, 0.95795182395512479)],
 [(0, 0.92655632658998055), (1, 0.073443673410019422)]]

#### 얘네들도 다 더하면 1...

음..

임의의 새로운 문장으로 테스트를 해보고 싶다...

In [67]:
test = 'i like eat brocolli'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in en_stop]
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.87146651527326535), (1, 0.12853348472673468)]

In [68]:
ldamodel.print_topic(1)

'0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur" + 0.040*"seem" + 0.040*"school" + 0.040*"never" + 0.040*"better" + 0.040*"feel" + 0.040*"well"'

In [69]:
for s in stemmed_tokens:
    print(ldamodel.get_term_topics(s))

[(0, 0.025647010412016405)]
[(0, 0.050038331126688554)]
[(0, 0.074569584621203375)]


In [70]:
test = 'the quick brown fox jumps over the lazy dog choker'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in en_stop]
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.5), (1, 0.5)]

In [71]:
stemmed_tokens

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'choker']

In [22]:
dictionary.doc2bow(stemmed_tokens)

[]

기존의 사전에 test 문장을 함께 넣어서 돌려볼 순 없을까?

In [72]:
corpus = dictionary.doc2bow(stemmed_tokens, True)

In [73]:
corpus

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [74]:
len(dictionary.items())

39

In [75]:
[d for d in dictionary.items()]

[(6, 'lot'),
 (13, 'suggest'),
 (10, 'practic'),
 (0, 'brother'),
 (21, 'blood'),
 (15, 'tension'),
 (4, 'eat'),
 (11, 'basebal'),
 (30, 'say'),
 (22, 'seem'),
 (8, 'drive'),
 (37, 'lazi'),
 (32, 'brown'),
 (25, 'well'),
 (2, 'like'),
 (24, 'often'),
 (17, 'increas'),
 (38, 'choker'),
 (23, 'feel'),
 (16, 'health'),
 (35, 'dog'),
 (34, 'jump'),
 (9, 'around'),
 (19, 'pressur'),
 (28, 'school'),
 (12, 'spend'),
 (20, 'expert'),
 (7, 'time'),
 (18, 'may'),
 (14, 'caus'),
 (29, 'perform'),
 (33, 'quick'),
 (1, 'brocolli'),
 (5, 'mother'),
 (27, 'never'),
 (26, 'better'),
 (36, 'fox'),
 (3, 'good'),
 (31, 'profession')]

In [76]:
corpus.append(dictionary.doc2bow(stemmed_tokens))

In [77]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

TypeError: 'int' object is not iterable

In [78]:
corpus[-1]

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [30]:
corpus

[[(0, 1), (1, 2), (2, 1), (3, 2), (4, 2), (5, 1)],
 [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(0, 1),
  (5, 1),
  (8, 1),
  (19, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(1, 1), (3, 1), (16, 2), (30, 1), (31, 1)],
 [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]

In [32]:
ldamodel[corpus[5]]

[(0, 0.067383135323088775), (1, 0.93261686467691129)]

In [33]:
[ldamodel[c] for c in corpus]

[[(0, 0.94496553585273335), (1, 0.055034464147266537)],
 [(0, 0.056663698982859764), (1, 0.94333630101714028)],
 [(0, 0.056138887395792821), (1, 0.94386111260420724)],
 [(0, 0.042829049640003551), (1, 0.95717095035999644)],
 [(0, 0.92567659933095392), (1, 0.074323400669046136)],
 [(0, 0.06738116854813378), (1, 0.93261883145186619)]]