# LDA (Latent Dirichlet Allocation)
주어진 문서 집합에서 각 문서에 어떤 주제(topic)들이 존재하는지를 추정하는 확률 모형

## Latent Dirichlet Allocation (LDA) with Python ... review
https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [1]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# --- Sample corpus: five short English documents -------------------------
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# Gather the documents into one list so they can be processed in a loop.
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# --- Preprocessing tools --------------------------------------------------
# Tokenizer that extracts runs of word characters (the \w+ pattern).
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list from the `stop_words` package.
en_stop = get_stop_words('en')

# Porter stemmer used to reduce each token to its stem.
p_stemmer = PorterStemmer()

# Accumulator for the preprocessed documents (one token list per document).
texts = list()




In [2]:
# Preprocess every document: lowercase -> tokenize -> drop stop words -> stem.
# The stop words are copied into a set once, outside the loop, so that each
# membership test is O(1) instead of a scan over the whole list.
stop_word_set = set(en_stop)

for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stop_word_set]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add the preprocessed token list for this document
    texts.append(stemmed_tokens)

In [3]:
# Show the raw first document next to its preprocessed token list.
print(doc_a, texts[0], sep="\n")

Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [4]:
# Show the raw second document next to its preprocessed token list.
print(doc_b, texts[1], sep="\n")

My mother spends a lot of time driving my brother around to baseball practice.
['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic']


In [5]:
# Build the id <-> term dictionary from the tokenized documents.
# Each distinct stemmed token in `texts` is assigned an integer id.
dictionary = corpora.Dictionary(texts)

corpora.Dictionary를 만들면 어떻게 될까?

결과가 궁금하잖아요?

### corpora.dictionary – Construct word<->id mappings
https://radimrehurek.com/gensim/corpora/dictionary.html

In [6]:
# Display every (token_id, token) pair held by the dictionary.
list(dictionary.items())

[(18, 'pressur'),
 (22, 'feel'),
 (21, 'blood'),
 (5, 'brother'),
 (10, 'drive'),
 (11, 'lot'),
 (27, 'better'),
 (13, 'increas'),
 (2, 'like'),
 (15, 'caus'),
 (24, 'school'),
 (3, 'eat'),
 (25, 'often'),
 (12, 'basebal'),
 (23, 'seem'),
 (9, 'time'),
 (6, 'spend'),
 (0, 'brocolli'),
 (30, 'profession'),
 (8, 'practic'),
 (1, 'good'),
 (20, 'expert'),
 (4, 'mother'),
 (29, 'never'),
 (14, 'may'),
 (7, 'around'),
 (31, 'say'),
 (16, 'health'),
 (17, 'tension'),
 (19, 'suggest'),
 (26, 'perform'),
 (28, 'well')]

In [7]:
# Turn each tokenized document into its bag-of-words vector
# (a list of (token_id, count) pairs), giving the document-term matrix.
corpus = list(map(dictionary.doc2bow, texts))

In [8]:
# Bag-of-words vector of the first document as (token_id, count) pairs.
print(corpus[0])

[(0, 2), (1, 2), (2, 1), (3, 2), (4, 1), (5, 1)]


In [9]:
# Preprocessed tokens of the first document, for comparison with its
# bag-of-words vector.
print(texts[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [10]:
# Train a 2-topic LDA model on the bag-of-words corpus, making 20 passes
# over the documents.
# LDA training is randomized: without a seed, the learned topics (and even
# which topic gets id 0 or 1) change from run to run. Fixing random_state
# makes the results repeatable when the notebook is re-run from the top.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=0,
)

In [11]:
# List the topics as (topic_id, 'weight*"term" + ...') pairs.
ldamodel.show_topics()

[(0,
  '0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur" + 0.040*"never" + 0.040*"seem" + 0.040*"perform" + 0.040*"school" + 0.040*"better" + 0.040*"often"'),
 (1,
  '0.086*"brocolli" + 0.086*"good" + 0.086*"health" + 0.061*"eat" + 0.037*"may" + 0.037*"increas" + 0.037*"blood" + 0.037*"caus" + 0.037*"expert" + 0.037*"tension"')]

In [12]:
# Topics associated with the stemmed term 'drive', as
# (topic_id, probability) pairs.
ldamodel.get_term_topics('drive')

[(0, 0.055900620338255325), (1, 0.025010456487683234)]

In [13]:
# Highest-weighted terms of topic 0 as (token_id, probability) pairs.
ldamodel.get_topic_terms(0)

[(4, 0.068411066180151456),
 (5, 0.068405040800852746),
 (10, 0.06807568097107275),
 (18, 0.040640562425953645),
 (29, 0.04032596401266815),
 (23, 0.040325730866286311),
 (26, 0.040324734416517356),
 (24, 0.040324653302638569),
 (27, 0.04032437787965612),
 (25, 0.040324174723175232)]

In [14]:
# Number of distinct terms in the dictionary.
len(dictionary)

32

In [15]:
# Weights of *all* terms in topic 0 as (token_id, probability) pairs.
# The model's own vocabulary size is used instead of a hard-coded 32, so the
# call stays correct if the corpus (and therefore the vocabulary) changes.
ldamodel.get_topic_terms(0, topn=ldamodel.num_terms)

[(4, 0.068411066180151456),
 (5, 0.068405040800852746),
 (10, 0.06807568097107275),
 (18, 0.040640562425953645),
 (29, 0.04032596401266815),
 (23, 0.040325730866286311),
 (26, 0.040324734416517356),
 (24, 0.040324653302638569),
 (27, 0.04032437787965612),
 (25, 0.040324174723175232),
 (28, 0.040323967166280165),
 (22, 0.040323363094020445),
 (12, 0.040309707599858045),
 (8, 0.040307914461435647),
 (9, 0.040307631029270111),
 (7, 0.040306581992733986),
 (11, 0.040306544498672062),
 (6, 0.040305256940337812),
 (16, 0.013635893942367526),
 (31, 0.01361672638883236),
 (30, 0.013613061933776607),
 (1, 0.013593043439626813),
 (0, 0.013591577902299345),
 (3, 0.013573538292039434),
 (2, 0.01356971664640136),
 (19, 0.013548335192016156),
 (17, 0.013547910342011504),
 (20, 0.013547699727420485),
 (15, 0.013547694680633594),
 (21, 0.013547550319250062),
 (13, 0.013547441695836911),
 (14, 0.01354685713590722)]

#### get_topic_terms로 나온 단어의 수치는 모두 더하면 1. 각 토픽의 단어 분포가 확률 분포가 되도록 정규화(보정)가 들어간 게 아닐까 짐작...

In [16]:
# Topic 0 rendered as a single 'weight*"term" + ...' string.
ldamodel.print_topic(0)

'0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur" + 0.040*"never" + 0.040*"seem" + 0.040*"perform" + 0.040*"school" + 0.040*"better" + 0.040*"often"'

In [17]:
# Bag-of-words vector of the first document: (token_id, count) pairs.
corpus[0]

[(0, 2), (1, 2), (2, 1), (3, 2), (4, 1), (5, 1)]

In [18]:
# Preprocessed (stop-word-filtered, stemmed) tokens of the first document.
texts[0]

['brocolli',
 'good',
 'eat',
 'brother',
 'like',
 'eat',
 'good',
 'brocolli',
 'mother']

In [19]:
# Raw text of the first document.
doc_a

'Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.'

In [20]:
# Topic distribution inferred for the first document, as
# (topic_id, probability) pairs.
ldamodel[corpus[0]]

[(0, 0.062843053584255973), (1, 0.9371569464157441)]

In [21]:
# Show the first document's bag-of-words vector next to its token list.
print(corpus[0], texts[0], sep="\n")

[(0, 2), (1, 2), (2, 1), (3, 2), (4, 1), (5, 1)]
['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [22]:
# Inferred topic distribution for every document in the corpus.
[ldamodel[bow] for bow in corpus]

[[(0, 0.062843167000328812), (1, 0.93715683299967123)],
 [(0, 0.94612976564255313), (1, 0.05387023435744686)],
 [(0, 0.053740842332185186), (1, 0.94625915766781477)],
 [(0, 0.95797163976641564), (1, 0.042028360233584389)],
 [(0, 0.073274273848296476), (1, 0.92672572615170346)]]

#### 얘네들도 (문서별로) 다 더하면 1...

음..

임의의 새로운 문장으로 테스트를 해보고 싶다...

In [23]:
# Run an unseen sentence through the same pipeline as the training
# documents (lowercase -> tokenize -> drop stop words -> stem), then infer
# its topic distribution with the trained model.
test = 'i like eat brocolli'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [word for word in tokens if word not in en_stop]
stemmed_tokens = [p_stemmer.stem(word) for word in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.12853150564100865), (1, 0.87146849435899132)]

In [24]:
# Topic 1 rendered as a single 'weight*"term" + ...' string.
ldamodel.print_topic(1)

'0.086*"brocolli" + 0.086*"good" + 0.086*"health" + 0.061*"eat" + 0.037*"may" + 0.037*"increas" + 0.037*"blood" + 0.037*"caus" + 0.037*"expert" + 0.037*"tension"'

In [25]:
# For each stemmed token of the test sentence, print the topics associated
# with that term.
for term in stemmed_tokens:
    term_topics = ldamodel.get_term_topics(term)
    print(term_topics)

[(1, 0.025603595793383886)]
[(1, 0.049955999945680285)]
[(1, 0.074559363580739099)]


In [26]:
# Same pipeline, this time with a sentence whose words do not occur in the
# training documents, to see how the model treats unknown vocabulary.
test = 'the quick brown fox jumps over the lazy dog choker'
raw = test.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [word for word in tokens if word not in en_stop]
stemmed_tokens = [p_stemmer.stem(word) for word in stopped_tokens]
ldamodel[dictionary.doc2bow(stemmed_tokens)]

[(0, 0.5), (1, 0.5)]

In [27]:
# Stemmed tokens left after preprocessing the test sentence.
stemmed_tokens

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'choker']

In [28]:
# Convert the test tokens to a bag-of-words vector using the existing
# dictionary; the result shows which of the tokens the dictionary knows.
dictionary.doc2bow(stemmed_tokens)

[]

기존의 사전에 test 문장을 함께 넣어서 돌려볼 순 없을까?

In [29]:
# allow_update=True makes doc2bow add previously unseen tokens to the
# dictionary (assigning them new ids) while building the vector.
# NOTE: this mutates `dictionary` as a side effect.
dictionary.doc2bow(stemmed_tokens, allow_update=True)

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [30]:
# Number of distinct terms now held by the dictionary.
len(dictionary)

39

In [31]:
# Display every (token_id, token) pair currently in the dictionary.
list(dictionary.items())

[(18, 'pressur'),
 (22, 'feel'),
 (21, 'blood'),
 (36, 'quick'),
 (34, 'fox'),
 (5, 'brother'),
 (10, 'drive'),
 (11, 'lot'),
 (27, 'better'),
 (13, 'increas'),
 (2, 'like'),
 (15, 'caus'),
 (24, 'school'),
 (3, 'eat'),
 (25, 'often'),
 (12, 'basebal'),
 (37, 'choker'),
 (35, 'brown'),
 (23, 'seem'),
 (9, 'time'),
 (6, 'spend'),
 (0, 'brocolli'),
 (30, 'profession'),
 (8, 'practic'),
 (1, 'good'),
 (32, 'dog'),
 (38, 'jump'),
 (20, 'expert'),
 (4, 'mother'),
 (29, 'never'),
 (14, 'may'),
 (7, 'around'),
 (33, 'lazi'),
 (31, 'say'),
 (16, 'health'),
 (17, 'tension'),
 (19, 'suggest'),
 (26, 'perform'),
 (28, 'well')]

In [32]:
# Add the test sentence to the corpus as one more bag-of-words document.
# NOTE: re-running this cell appends the same document again.
test_doc_bow = dictionary.doc2bow(stemmed_tokens)
corpus.append(test_doc_bow)

In [33]:
# Retrain the 2-topic LDA model on the corpus in its current state,
# making 20 passes over the documents.
# LDA training is randomized: without a seed, the learned topics (and which
# topic gets id 0 or 1) change from run to run. Fixing random_state makes
# the results repeatable when the notebook is re-run from the top.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=0,
)

In [34]:
# Bag-of-words vector of the most recently appended document.
corpus[-1]

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]

In [35]:
# The full corpus: one bag-of-words vector per document.
corpus

[[(0, 2), (1, 2), (2, 1), (3, 2), (4, 1), (5, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(10, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(4, 1),
  (5, 1),
  (10, 1),
  (18, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(0, 1), (1, 1), (16, 2), (30, 1), (31, 1)],
 [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]

In [36]:
# Topic distribution inferred for the document at index 5 of the corpus.
ldamodel[corpus[5]]

[(0, 0.065637993327628777), (1, 0.93436200667237124)]

In [37]:
# Inferred topic distribution for every document in the corpus, using the
# current model.
[ldamodel[bow] for bow in corpus]

[[(0, 0.9477329044937195), (1, 0.052267095506280524)],
 [(0, 0.074196756225930097), (1, 0.92580324377406986)],
 [(0, 0.050196127775037673), (1, 0.94980387222496221)],
 [(0, 0.95648686315369491), (1, 0.043513136846305041)],
 [(0, 0.37301360286521867), (1, 0.62698639713478133)],
 [(0, 0.065636419724926967), (1, 0.93436358027507305)]]