### Topic modeling with LDA (Latent Dirichlet Allocation)

In [100]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim, nltk, re
from gensim import corpora,models

In [101]:
# Toy corpus: five short sentences, just enough to get familiar with the model.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# Keep the individual documents addressable by name as well.
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [102]:
# clean the text: lowercase, strip punctuation, tokenize, drop English stop words

# The stop-word list is identical for every document, so load it once outside
# the loop; a set makes each membership test O(1) instead of a list scan.
stop_words = set(stopwords.words("english"))

texts = []
for doc in doc_set:
    # lowercase, then remove every character that is neither a word character nor whitespace
    raw = re.sub(r"[^\w\s]", "", doc.lower())
    tokens = nltk.word_tokenize(raw)

    # keep only the tokens that are not stop words
    stopped_tokens = [token for token in tokens if token not in stop_words]

    # one token list per document, in the same order as doc_set
    texts.append(stopped_tokens)

In [104]:
# what the first document looks like after cleaning and tokenizing
texts[0]

['brocolli',
 'good',
 'eat',
 'brother',
 'likes',
 'eat',
 'good',
 'brocolli',
 'mother']

In [105]:
# build an id <-> term dictionary from the tokenized documents
# (the last line only displays the object's repr)
dictionary = corpora.Dictionary(texts)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1a1758f978>

In [124]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [125]:
# the corpus shows which terms appear in each document and how many times: (word id, frequency) pairs
corpus

[[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)],
 [(1, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(1, 1),
  (5, 1),
  (19, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(0, 1), (3, 1), (16, 2), (31, 1), (32, 1)]]

In [126]:
# look up the term that a given word id stands for
dictionary[0]

'brocolli'

In [128]:
# Same corpus with each word id replaced by its term: (term, frequency) pairs per document
[
    [(dictionary[word_id], count) for word_id, count in doc_bow]
    for doc_bow in corpus
]

[[('brocolli', 2),
  ('brother', 1),
  ('eat', 2),
  ('good', 2),
  ('likes', 1),
  ('mother', 1)],
 [('brother', 1),
  ('mother', 1),
  ('around', 1),
  ('baseball', 1),
  ('driving', 1),
  ('lot', 1),
  ('practice', 1),
  ('spends', 1),
  ('time', 1)],
 [('driving', 1),
  ('blood', 1),
  ('cause', 1),
  ('experts', 1),
  ('health', 1),
  ('increased', 1),
  ('may', 1),
  ('pressure', 1),
  ('suggest', 1),
  ('tension', 1)],
 [('brother', 1),
  ('mother', 1),
  ('pressure', 1),
  ('better', 1),
  ('drive', 1),
  ('feel', 1),
  ('never', 1),
  ('often', 1),
  ('perform', 1),
  ('school', 1),
  ('seems', 1),
  ('well', 1)],
 [('brocolli', 1),
  ('good', 1),
  ('health', 2),
  ('professionals', 1),
  ('say', 1)]]

In [80]:
# "num_topics" specifies how many topics to generate; "passes" is the number of
# training passes over the corpus.
# LDA training is stochastic, so random_state is pinned: without it the topics
# (and every probability printed further down) change on each re-run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=20, random_state=42)

In [81]:
# show the three topics, each with its three highest-weighted words
print(ldamodel.print_topics(num_topics=3, num_words=3))

# Optional: uncomment the loop below to print the contribution (weight) of every word in each topic
# for i in ldamodel.show_topics(formatted=False,num_topics=ldamodel.num_topics,num_words=len(ldamodel.id2word)):
    # print ("{}\n".format(i))

[(0, '0.073*"good" + 0.073*"brocolli" + 0.073*"brother"'), (1, '0.067*"driving" + 0.067*"mother" + 0.067*"baseball"'), (2, '0.124*"health" + 0.049*"pressure" + 0.049*"increased"')]


In [112]:
# we can also see the topic probability distribution of each document
# get_document_topics(..., minimum_probability=0.0) is used instead of ldamodel[bow]:
# plain indexing omits topics whose probability is below the model's
# minimum_probability threshold, which would leave the distribution incomplete.
for doc_index, doc_bow in enumerate(corpus):
    doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.0)
    print("The topic probability distribution of document {} is {}".format(doc_index, doc_topics))

The topic probability distribution of document 0 is [(0, 0.9309951), (1, 0.034331925), (2, 0.034673043)]
The topic probability distribution of document 1 is [(0, 0.0349706), (1, 0.9311617), (2, 0.033867665)]
The topic probability distribution of document 2 is [(0, 0.030807817), (1, 0.031153841), (2, 0.9380383)]
The topic probability distribution of document 3 is [(0, 0.94744784), (1, 0.026369436), (2, 0.026182756)]
The topic probability distribution of document 4 is [(0, 0.053381566), (1, 0.047964066), (2, 0.8986544)]


In [98]:
# we can also infer the topic mix of a new, unseen document
# (the tokens should be cleaned the same way as the training documents)
doc = ['brocolli', 'good', 'eat']

# encode with the model's own dictionary; tokens it has never seen are ignored by doc2bow
bow = ldamodel.id2word.doc2bow(doc)

# minimum_probability=0.0 returns every topic; ldamodel[bow] would omit
# topics that fall below the model's minimum_probability threshold
topic_analysis = ldamodel.get_document_topics(bow, minimum_probability=0.0)

In [99]:
# (topic id, probability) pairs: how strongly the new document belongs to each topic
topic_analysis

[(0, 0.82752615), (1, 0.08377978), (2, 0.088694096)]

References: <br>
https://stackoverflow.com/questions/45310925/how-to-get-a-complete-topic-distribution-for-a-document-using-gensim-lda <br>
https://svn.spraakdata.gu.se/repos/richard/pub/statnlp2016_web/vgassignment1.html<br>
https://radimrehurek.com/gensim/models/ldamodel.html<br>
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/