In [1]:
import wikipedia
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/lenovo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
en_stop = set(nltk.corpus.stopwords.words('english'))

global_warming = wikipedia.page("New York City")
artificial_intelligence = wikipedia.page("Artificial Intelligence")
mona_lisa = wikipedia.page("Manchester United")
eiffel_tower = wikipedia.page("Eiffel Tower")

corpus = [global_warming.content, artificial_intelligence.content, mona_lisa.content, eiffel_tower.content]

In [7]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens


In [8]:
processed_data = []
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

In [9]:
from gensim import corpora

gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]



In [10]:
import pickle

pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

In [11]:
import gensim

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')

In [15]:

topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.000*"united" + 0.000*"eiffel" + 0.000*"intelligence" + 0.000*"manhattan" + 0.000*"machine" + 0.000*"island" + 0.000*"largest" + 0.000*"system" + 0.000*"american" + 0.000*"million"')
(1, '0.000*"million" + 0.000*"manhattan" + 0.000*"largest" + 0.000*"united" + 0.000*"island" + 0.000*"intelligence" + 0.000*"machine" + 0.000*"american" + 0.000*"artificial" + 0.000*"manchester"')
(2, '0.000*"united" + 0.000*"intelligence" + 0.000*"machine" + 0.000*"eiffel" + 0.000*"artificial" + 0.000*"manhattan" + 0.000*"research" + 0.000*"system" + 0.000*"million" + 0.000*"league"')
(3, '0.035*"united" + 0.032*"manchester" + 0.031*"league" + 0.018*"season" + 0.015*"football" + 0.013*"million" + 0.009*"second" + 0.009*"became" + 0.008*"following" + 0.008*"european"')
(4, '0.024*"intelligence" + 0.019*"machine" + 0.017*"artificial" + 0.011*"problem" + 0.010*"research" + 0.008*"knowledge" + 0.007*"learning" + 0.006*"system" + 0.006*"approach" + 0.006*"researcher"')
(5, '0.031*"eiffel" + 0.009*"second

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=8, id2word=gensim_dictionary, passes=15)
lda_model.save('gensim_model.gensim')
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.000*"united" + 0.000*"eiffel" + 0.000*"intelligence" + 0.000*"manhattan" + 0.000*"machine"')
(1, '0.000*"million" + 0.000*"manhattan" + 0.000*"largest" + 0.000*"united" + 0.000*"island"')
(2, '0.000*"united" + 0.000*"intelligence" + 0.000*"machine" + 0.000*"eiffel" + 0.000*"artificial"')
(3, '0.035*"united" + 0.032*"manchester" + 0.031*"league" + 0.018*"season" + 0.015*"football"')
(4, '0.024*"intelligence" + 0.019*"machine" + 0.017*"artificial" + 0.011*"problem" + 0.010*"research"')
(5, '0.031*"eiffel" + 0.009*"second" + 0.007*"french" + 0.007*"structure" + 0.007*"exposition"')
(6, '0.015*"manhattan" + 0.013*"largest" + 0.011*"united" + 0.011*"island" + 0.011*"american"')
(7, '0.001*"united" + 0.000*"manhattan" + 0.000*"largest" + 0.000*"american" + 0.000*"million"')


In [16]:

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.026*"eiffel" + 0.007*"second" + 0.006*"french" + 0.005*"structure" + 0.005*"exposition" + 0.005*"tallest" + 0.005*"engineer" + 0.004*"design" + 0.004*"million" + 0.004*"restaurant"')
(1, '0.021*"intelligence" + 0.017*"machine" + 0.015*"artificial" + 0.010*"problem" + 0.009*"research" + 0.007*"knowledge" + 0.006*"learning" + 0.005*"system" + 0.005*"approach" + 0.005*"computer"')
(2, '0.000*"united" + 0.000*"league" + 0.000*"largest" + 0.000*"million" + 0.000*"intelligence" + 0.000*"island" + 0.000*"machine" + 0.000*"manchester" + 0.000*"artificial" + 0.000*"manhattan"')
(3, '0.018*"united" + 0.011*"league" + 0.010*"manhattan" + 0.010*"manchester" + 0.009*"largest" + 0.008*"million" + 0.008*"american" + 0.007*"island" + 0.006*"season" + 0.006*"population"')


In [17]:
test_doc = 'Great structures are build to remember an event happened in the history.'
test_doc = preprocess_text(test_doc)
bow_test_doc = gensim_dictionary.doc2bow(test_doc)

print(lda_model.get_document_topics(bow_test_doc))

[(0, 0.73770416), (1, 0.0856213), (2, 0.083377935), (3, 0.093296565)]


In [18]:
print('\nPerplexity:', lda_model.log_perplexity(gensim_corpus))

from gensim.models import CoherenceModel

coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nCoherence Score:', coherence_score)



Perplexity: -7.703129290953457

Coherence Score: 0.5038681161446206


In [20]:

gensim_dictionary = gensim.corpora.Dictionary.load('gensim_dictionary.gensim')
gensim_corpus = pickle.load(open('gensim_corpus_corpus.pkl', 'rb'))
lda_model = gensim.models.ldamodel.LdaModel.load('gensim_model.gensim')

import pyLDAvis.gensim_models

lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

  default_term_info = default_term_info.sort_values(
