**Topic modeling** generally refers to a collection of unsupervised statistical learning methods to discover latent topics in a large collection of text documents.

Some of the popular topic modeling algorithms are latent Dirichlet allocation (LDA), latent semantic analysis (LSA), and probabilistic latent semantic analysis (PLSA). In practice, the technique that’s most commonly used is LDA.

> TIP: Removing words with low frequency or keeping only those words that are nouns and verbs are some ways of improving a topic model. If the corpus is big, divide it into batches of fixed sizes and run topic modeling for each batch. The best output comes from the intersection of topics from each batch.

Use case for topic models are:
- Summarizing documents, tweets, etc., in the form of keywords based on learned topic distributions
- Detecting social media trends over a period of time
- Designing recommender systems for text

> NOTE: LDA typically work only with long documents and perform poorly on short documents, such as a corpus of tweets.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint

[nltk_data] Downloading package stopwords to C:\Users\Yasir Abdur
[nltk_data]     Rohman\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
#tokenize, remove stopwords, non-alphabetic words, lowercase
def preprocess(textstring):
    stops =  set(stopwords.words('english'))
    tokens = word_tokenize(textstring)
    return [token.lower() for token in tokens if token.isalpha() and token not in stops]

data_path = "Data/booksummaries/booksummaries.txt"
summaries = []
for line in open(data_path, encoding="utf-8"):
    temp = line.split("\t")
    summaries.append(preprocess(temp[6]))

# Create a dictionary representation of the documents.
dictionary = Dictionary(summaries)
# Filter infrequent or too frequent words.
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(summary) for summary in summaries]
# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
#Train the topic model
model = LdaModel(corpus=corpus, id2word=id2word,iterations=400, num_topics=10)
top_topics = list(model.top_topics(corpus))
pprint(top_topics)

[([(0.007099087, 'he'),
   (0.006641345, 'she'),
   (0.0049030785, 'back'),
   (0.004803687, 'one'),
   (0.004622856, 'tells'),
   (0.0046049664, 'they'),
   (0.004538162, 'house'),
   (0.004194234, 'mother'),
   (0.0041599944, 'go'),
   (0.0039790394, 'find'),
   (0.0038036027, 'when'),
   (0.0037479524, 'get'),
   (0.0037207666, 'father'),
   (0.0036563275, 'two'),
   (0.003543434, 'home'),
   (0.0034573951, 'after'),
   (0.0033719945, 'time'),
   (0.003272797, 'day'),
   (0.0032547042, 'finds'),
   (0.003218834, 'family')],
  -0.8686722811252202),
 ([(0.007681313, 'life'),
   (0.0068563065, 'he'),
   (0.006769424, 'novel'),
   (0.0052444045, 'book'),
   (0.005243887, 'family'),
   (0.0048457133, 'story'),
   (0.004737223, 'one'),
   (0.0046171625, 'love'),
   (0.0043488615, 'in'),
   (0.0042025405, 'also'),
   (0.004072716, 'new'),
   (0.0038654734, 'young'),
   (0.0037804728, 'first'),
   (0.0037744278, 'father'),
   (0.0035221083, 'two'),
   (0.0032744429, 'becomes'),
   (0.003078

In [3]:
for idx in range(10):
    print("Topic #%s:" % idx, model.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.007*"paul" + 0.007*"richard" + 0.007*"he" + 0.005*"father" + 0.005*"one" + 0.004*"new" + 0.004*"book" + 0.004*"mother" + 0.004*"story" + 0.004*"matt"
Topic #1: 0.007*"he" + 0.007*"she" + 0.005*"back" + 0.005*"one" + 0.005*"tells" + 0.005*"they" + 0.005*"house" + 0.004*"mother" + 0.004*"go" + 0.004*"find"
Topic #2: 0.006*"love" + 0.005*"he" + 0.005*"arthur" + 0.005*"vlad" + 0.005*"chris" + 0.005*"one" + 0.004*"hugh" + 0.004*"she" + 0.004*"in" + 0.004*"peter"
Topic #3: 0.007*"war" + 0.005*"world" + 0.004*"new" + 0.004*"ship" + 0.004*"one" + 0.004*"in" + 0.003*"also" + 0.003*"time" + 0.003*"novel" + 0.003*"book"
Topic #4: 0.007*"jacky" + 0.006*"one" + 0.006*"book" + 0.004*"max" + 0.004*"he" + 0.004*"in" + 0.004*"also" + 0.003*"new" + 0.003*"people" + 0.003*"time"
Topic #5: 0.007*"he" + 0.006*"one" + 0.005*"back" + 0.004*"world" + 0.004*"she" + 0.004*"find" + 0.003*"they" + 0.003*"also" + 0.003*"after" + 0.003*"in"
Topic #6: 0.007*"earth" + 0.006*"one" + 0.006*"he" + 0.005*"pla

In [5]:
from gensim.models import LsiModel
lsamodel = LsiModel(corpus, num_topics=10, id2word = id2word)  # train model

pprint(lsamodel.print_topics(num_topics=10, num_words=10))

[(0,
  '0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + '
  '0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"'),
 (1,
  '-0.493*"tom" + -0.226*"sophia" + -0.182*"mrs" + -0.178*"house" + '
  '-0.161*"she" + -0.154*"father" + -0.147*"mr" + -0.146*"he" + -0.138*"tells" '
  '+ 0.126*"one"'),
 (2,
  '-0.558*"tom" + -0.252*"sophia" + 0.213*"she" + 0.191*"he" + -0.185*"mrs" + '
  '0.163*"tells" + 0.143*"mother" + -0.136*"mr" + -0.130*"western" + '
  '-0.103*"however"'),
 (3,
  '-0.233*"they" + -0.203*"ship" + 0.186*"he" + -0.183*"david" + -0.182*"back" '
  '+ -0.163*"tells" + 0.161*"life" + 0.161*"family" + -0.155*"find" + '
  '0.154*"narrator"'),
 (4,
  '0.663*"he" + -0.257*"mother" + -0.213*"she" + -0.195*"father" + '
  '-0.181*"family" + 0.121*"narrator" + 0.120*"monk" + -0.098*"novel" + '
  '-0.097*"school" + -0.095*"children"'),
 (5,
  '0.486*"david" + -0.244*"king" + 0.169*"rosa" + 0.163*"book" + '
  '0.125*"harlan" + -0.121*"he" + 0.113*"she

In [6]:
for idx in range(10):
    print("Topic #%s:" % idx, lsamodel.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + 0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"
Topic #1: -0.493*"tom" + -0.226*"sophia" + -0.182*"mrs" + -0.178*"house" + -0.161*"she" + -0.154*"father" + -0.147*"mr" + -0.146*"he" + -0.138*"tells" + 0.126*"one"
Topic #2: -0.558*"tom" + -0.252*"sophia" + 0.213*"she" + 0.191*"he" + -0.185*"mrs" + 0.163*"tells" + 0.143*"mother" + -0.136*"mr" + -0.130*"western" + -0.103*"however"
Topic #3: -0.233*"they" + -0.203*"ship" + 0.186*"he" + -0.183*"david" + -0.182*"back" + -0.163*"tells" + 0.161*"life" + 0.161*"family" + -0.155*"find" + 0.154*"narrator"
Topic #4: 0.663*"he" + -0.257*"mother" + -0.213*"she" + -0.195*"father" + -0.181*"family" + 0.121*"narrator" + 0.120*"monk" + -0.098*"novel" + -0.097*"school" + -0.095*"children"
Topic #5: 0.486*"david" + -0.244*"king" + 0.169*"rosa" + 0.163*"book" + 0.125*"harlan" + -0.121*"he" + 0.113*"she" + -0.111*"anita" + 0.107*"gould" + 0.104*"would"
To