# LDA (Latent Dirichlet Allocation)
주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지에 대한 확률 모형

## Latent Dirichlet Allocation (LDA) with Python ... review
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [1]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Word-level tokenizer: each run of \w characters becomes one token.
tokenizer = RegexpTokenizer(r"\w+")

# English stop-word list used to filter tokens.
en_stop = get_stop_words("en")

# Porter stemmer instance used to stem the filtered tokens.
p_stemmer = PorterStemmer()

# Five short sample documents that make up the toy corpus.
doc_a = (
    "Brocolli is good to eat. "
    "My brother likes to eat good brocolli, but not my mother."
)
doc_b = (
    "My mother spends a lot of time driving "
    "my brother around to baseball practice."
)
doc_c = (
    "Some health experts suggest that driving "
    "may cause increased tension and blood pressure."
)
doc_d = (
    "I often feel pressure to perform well at school, "
    "but my mother never seems to drive my brother to do better."
)
doc_e = "Health professionals say that brocolli is good for your health."

# All sample documents, in order.
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# Receives one list of processed tokens per document.
texts = list()




In [2]:
# Preprocess every sample document (lowercase -> tokenize -> drop stop
# words -> stem) and collect the resulting token list in `texts`.
for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    # (distinct names keep the document and its tokens from sharing `i`)
    stopped_tokens = [token for token in tokens if token not in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

In [3]:
# Show the raw first document next to its processed tokens.
print(doc_a, texts[0], sep="\n")

Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [4]:
# Show the raw second document next to its processed tokens.
print(doc_b, texts[1], sep="\n")

My mother spends a lot of time driving my brother around to baseball practice.
['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic']


In [5]:
# Build an id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

corpora.Dictionary를 하면 어떻게 될까?

결과가 궁금하잖아요?

### corpora.dictionary – Construct word<->id mappings
https://radimrehurek.com/gensim/corpora/dictionary.html

In [6]:
# List every (id, term) pair held by the dictionary.
list(dictionary.items())

[(14, 'pressur'),
 (2, 'brocolli'),
 (26, 'never'),
 (10, 'drive'),
 (4, 'eat'),
 (27, 'perform'),
 (5, 'mother'),
 (31, 'profession'),
 (18, 'health'),
 (11, 'practic'),
 (22, 'well'),
 (24, 'seem'),
 (12, 'lot'),
 (19, 'suggest'),
 (30, 'say'),
 (23, 'better'),
 (16, 'may'),
 (8, 'time'),
 (7, 'spend'),
 (20, 'increas'),
 (25, 'feel'),
 (1, 'like'),
 (28, 'often'),
 (0, 'brother'),
 (13, 'expert'),
 (15, 'caus'),
 (6, 'around'),
 (9, 'basebal'),
 (3, 'good'),
 (17, 'blood'),
 (29, 'school'),
 (21, 'tension')]

In [7]:
# Convert each tokenized document into its bag-of-words form; together
# these make up the document-term matrix used to train the model.
corpus = [dictionary.doc2bow(text) for text in texts]

In [8]:
# Bag-of-words of the first document as (token_id, count) pairs.
print(corpus[0])

[(0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 1)]


In [9]:
# Processed tokens of the first document, for comparison with its bag-of-words.
print(texts[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


이런 느낌이겠네요

(brother - 1), (eat - 2), (like - 1), ...

In [10]:
# generate LDA model
# LDA training is randomized; fixing random_state makes the learned topics
# (and every score derived from them) reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [11]:
# Show the three strongest terms of every topic. Ask for exactly as many
# topics as the model was trained with instead of a larger hard-coded count.
print(ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=3))

[(0, '0.068*"mother" + 0.068*"brother" + 0.068*"drive"'), (1, '0.086*"good" + 0.086*"brocolli" + 0.086*"health"')]


### 참고로 LdaModel은 random_state를 고정하지 않으면 학습할 때마다 결과가 달라요...

그래서 이 밑에도 매번 달라요...

### models.ldamodel – Latent Dirichlet Allocation
https://radimrehurek.com/gensim/models/ldamodel.html

In [12]:
# Query by integer term id; in the dictionary listing, id 0 is 'brother'.
ldamodel.get_term_topics(0)

[(0, 0.056204679555064758), (1, 0.024720948224110938)]

In [13]:
# Same query, using the token string instead of its id.
ldamodel.get_term_topics('brother')

[(0, 0.056204679555064758), (1, 0.024720948224110938)]

In [14]:
# Topic relevance for the token 'brocolli'.
ldamodel.get_term_topics('brocolli')

[(1, 0.074625352654320784)]

In [15]:
# Topic relevance for term id 3.
# NOTE(review): in the dictionary listing id 3 is 'good' while 'brocolli'
# is id 2 -- confirm which term this cell was meant to query.
ldamodel.get_term_topics(3)

[(1, 0.074626532112599536)]

### 저는 get_term_topics에서 나오는 결과와 print_topics로 나오는 결과가 같을 줄 알았어요.

In [16]:
# (term_id, weight) pairs for topic 0; with no count given, the recorded
# output lists 10 terms.
ldamodel.get_topic_terms(0)

[(5, 0.068372181637743584),
 (0, 0.068370114993124273),
 (10, 0.06831588171140332),
 (14, 0.04078045314346522),
 (22, 0.040302113092391782),
 (26, 0.040302038799766078),
 (25, 0.040301899241261561),
 (28, 0.040301840379021611),
 (24, 0.04030173190041194),
 (23, 0.040301661532591238)]

In [17]:
# Number of distinct terms in the dictionary.
len(dictionary)

32

In [18]:
# Weights of every dictionary term under topic 0. The count is derived from
# the dictionary size rather than hard-coded.
ldamodel.get_topic_terms(0, topn=len(dictionary))

[(5, 0.068372181637743584),
 (0, 0.068370114993124273),
 (10, 0.06831588171140332),
 (14, 0.04078045314346522),
 (22, 0.040302113092391782),
 (26, 0.040302038799766078),
 (25, 0.040301899241261561),
 (28, 0.040301840379021611),
 (24, 0.04030173190041194),
 (23, 0.040301661532591238),
 (27, 0.040301527231785719),
 (29, 0.040301204599197553),
 (11, 0.04028797501358073),
 (12, 0.040287974734594126),
 (6, 0.040287946553964976),
 (9, 0.040287912750353018),
 (7, 0.040287753191403798),
 (8, 0.040287706244392915),
 (17, 0.013593030237955836),
 (21, 0.013588769698228727),
 (20, 0.013587339766063083),
 (15, 0.01358478852007482),
 (16, 0.013582499220089021),
 (19, 0.013580501238146782),
 (13, 0.013576181130293789),
 (18, 0.013573994270470232),
 (2, 0.013563332413300339),
 (4, 0.013562768371730084),
 (3, 0.01356204716374014),
 (1, 0.013559185286415457),
 (30, 0.013553029488594355),
 (31, 0.013552616444443772)]

#### get_topic_terms로 나온 단어의 수치는 모두 더하면 1. 보정이 들어간 게 아닐까 짐작... (토픽 안에서의 단어 확률분포로 정규화된 값이라 합이 1이 되는 듯)

In [19]:
# Topic 0 rendered as a weighted sum of its top terms.
ldamodel.print_topic(0)

'0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur" + 0.040*"well" + 0.040*"never" + 0.040*"feel" + 0.040*"often" + 0.040*"seem" + 0.040*"better"'

In [20]:
# Topic distribution inferred for the first document, as
# (topic_id, probability) pairs.
ldamodel[corpus[0]]

[(0, 0.062831962215687626), (1, 0.93716803778431235)]

In [21]:
# First document: bag-of-words followed by its processed tokens.
print(corpus[0], texts[0], sep="\n")

[(0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 1)]
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [22]:
# Topic distribution for every document in the corpus.
[ldamodel[c] for c in corpus]

[[(0, 0.062834443472397708), (1, 0.93716555652760225)],
 [(0, 0.94614609418810625), (1, 0.05385390581189374)],
 [(0, 0.053863133580453963), (1, 0.94613686641954609)],
 [(0, 0.95798153498980432), (1, 0.042018465010195627)],
 [(0, 0.07325345356753199), (1, 0.92674654643246801)]]

#### 얘네들도 다 더하면 1...

음..

임의의 새로운 문장으로 테스트를 해보고 싶다...

In [23]:
# Run an unseen sentence through the same preprocessing used for the
# training documents, then infer its topic distribution.
test = 'i like eat brocolli'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [token for token in tokens if token not in en_stop]
stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.1285381782102015), (1, 0.87146182178979847)]

In [24]:
# Topic 1 rendered as a weighted sum of its top terms.
ldamodel.print_topic(1)

'0.086*"good" + 0.086*"brocolli" + 0.086*"health" + 0.061*"eat" + 0.037*"profession" + 0.037*"say" + 0.037*"like" + 0.037*"expert" + 0.037*"suggest" + 0.037*"may"'

In [25]:
# Print the topic relevance of each processed token, one per line.
for term in stemmed_tokens:
    print(ldamodel.get_term_topics(term))

[(1, 0.025621179586479078)]
[(1, 0.049989804903969412)]
[(1, 0.074625352654320784)]


In [26]:
# Same preprocessing and inference for a sentence whose words do not occur
# in the sample documents.
test = 'the quick brown fox jumps over the lazy dog choker'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [token for token in tokens if token not in en_stop]
stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.5), (1, 0.5)]

In [27]:
# None of these tokens appear in the dictionary listing, and the recorded
# bag-of-words is empty.
dictionary.doc2bow(stemmed_tokens)

[]

기존의 사전에 test 문장을 함께 넣어서 돌려볼 순 없을까?

In [28]:
# Build the bag-of-words again, this time letting the dictionary register
# the unseen tokens. The flag is passed by keyword so its meaning is visible.
add_corpus = dictionary.doc2bow(stemmed_tokens, allow_update=True)

In [29]:
# Inspect the bag-of-words built for the new sentence.
add_corpus

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [30]:
# Number of distinct terms in the dictionary after the update.
len(dictionary)

39

In [31]:
# List every (id, term) pair, now including the newly added terms.
list(dictionary.items())

[(14, 'pressur'),
 (34, 'dog'),
 (2, 'brocolli'),
 (26, 'never'),
 (10, 'drive'),
 (4, 'eat'),
 (27, 'perform'),
 (36, 'quick'),
 (35, 'jump'),
 (5, 'mother'),
 (31, 'profession'),
 (18, 'health'),
 (11, 'practic'),
 (22, 'well'),
 (24, 'seem'),
 (12, 'lot'),
 (19, 'suggest'),
 (30, 'say'),
 (23, 'better'),
 (16, 'may'),
 (8, 'time'),
 (7, 'spend'),
 (20, 'increas'),
 (25, 'feel'),
 (37, 'lazi'),
 (1, 'like'),
 (28, 'often'),
 (0, 'brother'),
 (13, 'expert'),
 (32, 'brown'),
 (15, 'caus'),
 (33, 'fox'),
 (6, 'around'),
 (9, 'basebal'),
 (3, 'good'),
 (17, 'blood'),
 (38, 'choker'),
 (29, 'school'),
 (21, 'tension')]

In [32]:
# Append the new sentence's bag-of-words to the corpus as one more document.
corpus.append(dictionary.doc2bow(stemmed_tokens))

In [33]:
# Retrain on the extended corpus. LDA training is randomized; fixing
# random_state makes the learned topics reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [34]:
# Check that the last corpus entry is the newly added sentence.
corpus[-1]

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [35]:
# Dump the whole corpus, one bag-of-words per document.
corpus

[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 1)],
 [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(10, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(0, 1),
  (5, 1),
  (10, 1),
  (14, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(2, 1), (3, 1), (18, 2), (30, 1), (31, 1)],
 [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]

In [36]:
# Topic distribution of the added sentence (index 5) under the retrained model.
ldamodel[corpus[5]]

[(0, 0.064101678032143533), (1, 0.93589832196785649)]

In [38]:
# Topic distribution for every document, including the added one.
[ldamodel[c] for c in corpus]

[[(0, 0.94541390561280314), (1, 0.054586094387196749)],
 [(0, 0.059740091494544054), (1, 0.94025990850545593)],
 [(0, 0.95056810279616821), (1, 0.049431897203831766)],
 [(0, 0.95625637164399246), (1, 0.043743628356007563)],
 [(0, 0.92579987839057964), (1, 0.074200121609420272)],
 [(0, 0.064101304550023525), (1, 0.9358986954499765)]]