In [1]:
!pip install gensim



In [1]:
# Toy corpus: five short documents used throughout the notebook.
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
# Adjacent string literals are concatenated by the parser. The explicit space
# after the comma keeps "school," and "but" apart; a backslash continuation
# inside the literal had glued them together as "school,but".
doc4 = (
    "Sometimes I feel pressure to perform well at school, "
    "but my father never seems to drive my sister to do better."
)
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# Collect the documents into a single corpus list.
doc_complete = [doc1, doc2, doc3, doc4, doc5]

In [2]:
import re
import string
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

In [3]:
def clean_txt(sent):
    """Normalize one document into a space-separated string of stemmed tokens.

    The text is lower-cased, every character other than ``a-z`` and space is
    replaced by a space, and the result is tokenized with ``word_tokenize``.
    Stop words, punctuation, a few extra filler words, and tokens of two
    characters or fewer are dropped; the remaining tokens are stemmed with the
    English Snowball stemmer.

    Parameters
    ----------
    sent : str
        Raw document text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty if none survive).
    """
    stemmer_s = SnowballStemmer("english")
    # str.replace() matches its argument literally, so the character class has
    # to go through re.sub(). Substituting a space (not "") keeps words that
    # were separated only by punctuation, e.g. "school,but", from being fused.
    normalized = re.sub(r"[^a-z ]", " ", sent.lower())
    tokens = word_tokenize(normalized)
    # A set gives constant-time membership checks in the filter below.
    stop_updated = set(stopwords.words("english")) | set(punctuation) | {
        "...", "would", "could", "told", "subject",
    }
    text = [
        stemmer_s.stem(term)
        for term in tokens
        if term not in stop_updated and len(term) > 2
    ]
    return " ".join(text)
# Clean every document and split the result into its list of stems.
doc_clean = [clean_txt(raw_doc).split() for raw_doc in doc_complete]

In [4]:
# Each inner list holds the stemmed tokens of one document, in corpus order.
print(doc_clean)

[['sugar', 'bad', 'consum', 'sister', 'like', 'sugar', 'father'], ['father', 'spend', 'lot', 'time', 'drive', 'sister', 'around', 'danc', 'practic'], ['doctor', 'suggest', 'drive', 'may', 'caus', 'increas', 'stress', 'blood', 'pressur'], ['sometim', 'feel', 'pressur', 'perform', 'well', 'school', 'father', 'never', 'seem', 'drive', 'sister', 'better'], ['health', 'expert', 'say', 'sugar', 'good', 'lifestyl']]


In [5]:
# Importing Gensim
import gensim
from gensim import corpora

In [6]:
# Create the term dictionary of our corpus,
# where every unique term is assigned an integer id.

dictionary = corpora.Dictionary(doc_clean)

In [7]:
# Show the dictionary's (id, token) pairs, i.e. the vocabulary (not yet a bag-of-words).
print(list(dictionary.items()))

[(0, 'bad'), (1, 'consum'), (2, 'father'), (3, 'like'), (4, 'sister'), (5, 'sugar'), (6, 'around'), (7, 'danc'), (8, 'drive'), (9, 'lot'), (10, 'practic'), (11, 'spend'), (12, 'time'), (13, 'blood'), (14, 'caus'), (15, 'doctor'), (16, 'increas'), (17, 'may'), (18, 'pressur'), (19, 'stress'), (20, 'suggest'), (21, 'better'), (22, 'feel'), (23, 'never'), (24, 'perform'), (25, 'school'), (26, 'seem'), (27, 'sometim'), (28, 'well'), (29, 'expert'), (30, 'good'), (31, 'health'), (32, 'lifestyl'), (33, 'say')]


In [8]:
# Turn each tokenized document into its bag-of-words form: a list of
# (token id, count) pairs looked up in the term dictionary.
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in doc_clean]

In [9]:
# One list per document of (token id, count) pairs.
print(doc_term_matrix)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(2, 1), (4, 1), (8, 1), (18, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(5, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]]


In [10]:
# Show the first three documents with token ids replaced by the tokens themselves.
[
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in doc_term_matrix[:3]
]

[[('bad', 1),
  ('consum', 1),
  ('father', 1),
  ('like', 1),
  ('sister', 1),
  ('sugar', 2)],
 [('father', 1),
  ('sister', 1),
  ('around', 1),
  ('danc', 1),
  ('drive', 1),
  ('lot', 1),
  ('practic', 1),
  ('spend', 1),
  ('time', 1)],
 [('drive', 1),
  ('blood', 1),
  ('caus', 1),
  ('doctor', 1),
  ('increas', 1),
  ('may', 1),
  ('pressur', 1),
  ('stress', 1),
  ('suggest', 1)]]

In [11]:
#importing the LDA model
from gensim.models.ldamodel import LdaModel

# Train a two-topic LDA model on the document-term matrix,
# making 50 passes over the corpus with a fixed random_state of 42.
ldamodel = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=2,
    passes=50,
    random_state=42,
)

In [12]:
# List each topic's terms with their weights (up to 50 terms per topic).
ldamodel.print_topics(num_words=50)

[(0,
  '0.083*"sugar" + 0.060*"sister" + 0.060*"father" + 0.036*"seem" + 0.036*"well" + 0.036*"sometim" + 0.036*"school" + 0.036*"perform" + 0.036*"never" + 0.036*"feel" + 0.036*"better" + 0.036*"bad" + 0.036*"consum" + 0.036*"like" + 0.036*"good" + 0.036*"expert" + 0.036*"say" + 0.036*"lifestyl" + 0.036*"health" + 0.036*"pressur" + 0.035*"drive" + 0.012*"time" + 0.012*"spend" + 0.012*"danc" + 0.012*"practic" + 0.012*"around" + 0.012*"lot" + 0.012*"suggest" + 0.012*"may" + 0.012*"caus" + 0.012*"doctor" + 0.012*"blood" + 0.012*"increas" + 0.012*"stress"'),
 (1,
  '0.072*"drive" + 0.043*"pressur" + 0.043*"increas" + 0.043*"may" + 0.043*"doctor" + 0.043*"caus" + 0.043*"blood" + 0.043*"stress" + 0.043*"suggest" + 0.043*"around" + 0.043*"practic" + 0.043*"spend" + 0.043*"danc" + 0.043*"lot" + 0.043*"time" + 0.042*"father" + 0.042*"sister" + 0.014*"good" + 0.014*"health" + 0.014*"expert" + 0.014*"say" + 0.014*"lifestyl" + 0.014*"sugar" + 0.014*"consum" + 0.014*"bad" + 0.014*"like" + 0.014*"b

In [13]:
# The 20 highest-weighted (token id, probability) pairs for topic 0.
topn = list(ldamodel.get_topic_terms(topicid=0, topn=20))
topn

[(5, 0.08331017),
 (4, 0.059885785),
 (2, 0.059885763),
 (26, 0.035673775),
 (28, 0.03567377),
 (22, 0.03567377),
 (23, 0.03567377),
 (24, 0.03567377),
 (25, 0.03567377),
 (27, 0.03567377),
 (21, 0.035673767),
 (3, 0.035650667),
 (0, 0.035650667),
 (1, 0.035650667),
 (29, 0.035644885),
 (30, 0.035644885),
 (32, 0.035644885),
 (33, 0.035644885),
 (31, 0.03564488),
 (18, 0.03556915)]

In [14]:
# Map each token id in topn back to its token, keeping the probability.
[(dictionary[token_id], prob) for token_id, prob in topn]

[('sugar', 0.08331017),
 ('sister', 0.059885785),
 ('father', 0.059885763),
 ('seem', 0.035673775),
 ('well', 0.03567377),
 ('feel', 0.03567377),
 ('never', 0.03567377),
 ('perform', 0.03567377),
 ('school', 0.03567377),
 ('sometim', 0.03567377),
 ('better', 0.035673767),
 ('like', 0.035650667),
 ('bad', 0.035650667),
 ('consum', 0.035650667),
 ('expert', 0.035644885),
 ('good', 0.035644885),
 ('lifestyl', 0.035644885),
 ('say', 0.035644885),
 ('health', 0.03564488),
 ('pressur', 0.03556915)]

In [15]:
# Topic distribution of a single document: here the fifth one (index 4).
ldamodel.get_document_topics(doc_term_matrix[4])

[(0, 0.9252513), (1, 0.07474871)]