In [1]:
# !pip install gensim

or

In [2]:
# !conda install -c anaconda gensim

In [3]:
# !pip install wikipedia

In [4]:
# !pip install pyLDAvis

# Topic Modeling with LDA

## Scraping Wikipedia Articles

In [5]:
import wikipedia
import nltk

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/cst/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
donald_trump = wikipedia.page("Donald Trump")
artificial_intelligence = wikipedia.page("Artificial Intelligence")
mona_lisa = wikipedia.page("Mona Lisa")
eiffel_tower = wikipedia.page("Eiffel Tower")

corpus = [donald_trump.content, artificial_intelligence.content, mona_lisa.content, eiffel_tower.content]

## Data Preprocessing

In [7]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens

## Modeling Topics

In [8]:
processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

In [9]:
processed_data

[['donald',
  'current',
  'president',
  'united',
  'entering',
  'politics',
  'businessman',
  'television',
  'personality',
  'raised',
  'borough',
  'attended',
  'fordham',
  'university',
  'received',
  'bachelor',
  'degree',
  'economics',
  'wharton',
  'school',
  'university',
  'pennsylvania',
  'became',
  'president',
  'father',
  'estate',
  'business',
  'renamed',
  'organization',
  'expanded',
  'operation',
  'building',
  'renovating',
  'skyscraper',
  'casino',
  'course',
  'started',
  'various',
  'venture',
  'mostly',
  'licensing',
  'business',
  'involved',
  'federal',
  'action',
  'including',
  'bankruptcy',
  'universe',
  'beauty',
  'pageant',
  'produced',
  'hosted',
  'apprentice',
  'reality',
  'television',
  'series',
  'forbes',
  'estimated',
  'billion',
  'political',
  'position',
  'described',
  'populist',
  'protectionist',
  'nationalist',
  'entered',
  'presidential',
  'republican',
  'elected',
  'surprise',
  'victory',


In [10]:
from gensim import corpora

gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]

In [11]:
gensim_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 4),
  (13, 4),
  (14, 2),
  (15, 4),
  (16, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 20),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 8),
  (30, 5),
  (31, 1),
  (32, 5),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 4),
  (37, 3),
  (38, 24),
  (39, 2),
  (40, 1),
  (41, 4),
  (42, 2),
  (43, 1),
  (44, 3),
  (45, 1),
  (46, 2),
  (47, 1),
  (48, 3),
  (49, 1),
  (50, 2),
  (51, 2),
  (52, 2),
  (53, 2),
  (54, 1),
  (55, 61),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 3),
  (63, 1),
  (64, 1),
  (65, 4),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 2),
  (72, 2),
  (73, 2),
  (74, 6),
  (75, 1),
  (76, 1),
  (77, 4),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 3),
  (85, 3),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 9),
  (91,

In [12]:
import pickle

pickle.dump(gensim_corpus, open('data/gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('data/gensim_dictionary.gensim')

In [13]:
import gensim

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('data/gensim_model.gensim')

In [14]:
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.014*"intelligence" + 0.014*"painting" + 0.011*"machine" + 0.010*"artificial" + 0.008*"problem" + 0.008*"system" + 0.007*"leonardo" + 0.006*"research" + 0.006*"knowledge" + 0.005*"approach"')
(1, '0.009*"president" + 0.008*"campaign" + 0.007*"administration" + 0.005*"presidential" + 0.005*"government" + 0.005*"election" + 0.005*"million" + 0.005*"united" + 0.004*"political" + 0.004*"policy"')
(2, '0.000*"intelligence" + 0.000*"machine" + 0.000*"president" + 0.000*"campaign" + 0.000*"system" + 0.000*"artificial" + 0.000*"painting" + 0.000*"eiffel" + 0.000*"problem" + 0.000*"administration"')
(3, '0.024*"eiffel" + 0.007*"second" + 0.005*"french" + 0.005*"exposition" + 0.005*"structure" + 0.005*"tallest" + 0.004*"engineer" + 0.004*"design" + 0.004*"france" + 0.004*"restaurant"')


In [15]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=8, id2word=gensim_dictionary, passes=15)
lda_model.save('data/gensim_model.gensim')
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.000*"painting" + 0.000*"intelligence" + 0.000*"eiffel" + 0.000*"system" + 0.000*"president"')
(1, '0.000*"intelligence" + 0.000*"painting" + 0.000*"million" + 0.000*"president" + 0.000*"campaign"')
(2, '0.000*"intelligence" + 0.000*"machine" + 0.000*"system" + 0.000*"president" + 0.000*"artificial"')
(3, '0.042*"painting" + 0.020*"leonardo" + 0.011*"portrait" + 0.011*"louvre" + 0.008*"century"')
(4, '0.009*"intelligence" + 0.006*"president" + 0.006*"machine" + 0.006*"campaign" + 0.006*"artificial"')
(5, '0.000*"eiffel" + 0.000*"painting" + 0.000*"intelligence" + 0.000*"million" + 0.000*"second"')
(6, '0.030*"eiffel" + 0.009*"second" + 0.007*"french" + 0.007*"exposition" + 0.007*"structure"')
(7, '0.000*"intelligence" + 0.000*"president" + 0.000*"eiffel" + 0.000*"campaign" + 0.000*"administration"')


In [16]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('data/gensim_model.gensim')
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.034*"painting" + 0.016*"leonardo" + 0.009*"louvre" + 0.009*"portrait" + 0.006*"century" + 0.006*"french" + 0.006*"museum" + 0.005*"giocondo" + 0.005*"italian" + 0.004*"subject"')
(1, '0.013*"intelligence" + 0.011*"eiffel" + 0.010*"machine" + 0.009*"artificial" + 0.008*"system" + 0.008*"problem" + 0.006*"research" + 0.006*"knowledge" + 0.005*"computer" + 0.005*"approach"')
(2, '0.009*"president" + 0.008*"campaign" + 0.007*"administration" + 0.005*"presidential" + 0.005*"government" + 0.005*"election" + 0.005*"million" + 0.005*"united" + 0.004*"policy" + 0.004*"political"')
(3, '0.000*"campaign" + 0.000*"president" + 0.000*"administration" + 0.000*"election" + 0.000*"million" + 0.000*"intelligence" + 0.000*"machine" + 0.000*"painting" + 0.000*"united" + 0.000*"general"')


## Evaluating the LDA Model

In [17]:
test_doc = 'Great structures are build to remember an event happened in the history.'
test_doc = preprocess_text(test_doc)
bow_test_doc = gensim_dictionary.doc2bow(test_doc)

print(lda_model.get_document_topics(bow_test_doc))

[(0, 0.05193801), (1, 0.2912223), (2, 0.6064503), (3, 0.050389335)]


In [18]:
print('\nPerplexity:', lda_model.log_perplexity(gensim_corpus))

from gensim.models import CoherenceModel

coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nCoherence Score:', coherence_score)


Perplexity: -7.897696658405584

Coherence Score: 0.5864403540360461


## Visualizing the LDA

In [19]:
gensim_dictionary = gensim.corpora.Dictionary.load('data/gensim_dictionary.gensim')
gensim_corpus = pickle.load(open('data/gensim_corpus_corpus.pkl', 'rb'))
lda_model = gensim.models.ldamodel.LdaModel.load('data/gensim_model.gensim')

import pyLDAvis.gensim

lda_visualization = pyLDAvis.gensim.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

# Topic Modeling via LSI

In [20]:
from gensim.models import LsiModel

lsi_model = LsiModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary)
topics = lsi_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.255*"president" + 0.232*"campaign" + 0.201*"administration" + 0.150*"government" + 0.149*"presidential" + 0.149*"million" + 0.146*"election" + 0.134*"united" + 0.126*"policy" + 0.123*"political"')
(1, '0.420*"intelligence" + 0.332*"machine" + 0.293*"artificial" + 0.243*"problem" + 0.227*"system" + 0.178*"knowledge" + 0.172*"research" + 0.140*"computer" + 0.139*"approach" + 0.138*"learning"')
(2, '-0.703*"painting" + -0.332*"leonardo" + -0.183*"louvre" + -0.181*"portrait" + -0.158*"eiffel" + -0.150*"french" + -0.139*"century" + -0.122*"museum" + -0.094*"italian" + -0.094*"giocondo"')
(3, '-0.670*"eiffel" + 0.236*"painting" + -0.179*"second" + -0.148*"exposition" + -0.147*"structure" + -0.130*"tallest" + -0.119*"engineer" + 0.116*"leonardo" + -0.109*"design" + -0.107*"french"')
