In [7]:
# Topic modeling using gensim and LDA.

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father." 
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
doc_clean = [doc.split() for doc in doc_complete]


In [16]:
from gensim import corpora
from pprint import pprint

# Converting list of documents (corpus) into Document 
# Term Matrix using dictionary prepared above.

# Converting into dictionary
dictionary = corpora.Dictionary(doc_clean)
# pprint(dictionary.token2id)

# Converting the documents to bag of words representation.
corpus = [dictionary.doc2bow(text) for text in doc_clean]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2)], [(0, 1), (9, 1), (11, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(17, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]]


In [17]:
from gensim import models

lda = models.ldamodel.LdaModel


# Training LDA model on the corpus (Document Matrix)
ldamodel = lda(corpus, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics())

[(0, '0.053*"driving" + 0.053*"My" + 0.053*"sister" + 0.053*"my" + 0.053*"to" + 0.053*"around" + 0.053*"lot" + 0.053*"father" + 0.053*"time" + 0.053*"spends"'), (1, '0.029*"driving" + 0.029*"My" + 0.029*"my" + 0.029*"sister" + 0.029*"to" + 0.029*"that" + 0.029*"increased" + 0.029*"cause" + 0.029*"may" + 0.029*"stress"'), (2, '0.063*"to" + 0.036*"likes" + 0.036*"is" + 0.036*"bad" + 0.036*"but" + 0.036*"sugar," + 0.036*"not" + 0.036*"consume." + 0.036*"Sugar" + 0.036*"have"')]


In [18]:
# Term Frequecny - Inverse Document Frequency (TF-IDF)
# This is a statistical feature.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

corpus =  ['This is sample document', 
           'another random document', 
           'third sample document text']

X = tfidf.fit_transform(corpus)
print(X)

  (0, 7)	0.5844829010200651
  (0, 2)	0.5844829010200651
  (0, 4)	0.444514311537431
  (0, 1)	0.34520501686496574
  (1, 1)	0.3853716274664007
  (1, 0)	0.652490884512534
  (1, 3)	0.652490884512534
  (2, 4)	0.444514311537431
  (2, 1)	0.34520501686496574
  (2, 6)	0.5844829010200651
  (2, 5)	0.5844829010200651


In [None]:
# Other statistical features include trivial features such as word count,
# sentence count, punctuation count etc. these features may seem trivial
# but can have a significant impact on the model. Other measures include
# readibility measures such as syllable counts, smog index and flesch
# reading ease. TextStat library is usefull for such features.