Source : https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

This notebook carries out topic modelling on a dataset of news articles using the Gensim implementation of LAtent Dirichlet Allocation (LDA). 

Topic Modeling is a technique to extract the hidden topics from large volumes of text. Latent Dirichlet Allocation(LDA) is a popular algorithm for topic modeling with excellent implementations in the Python’s Gensim package. 

In [13]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/albertstaszak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [15]:
# NLTK Stop words -> We will filter these words out of our dataset so that they do not impact the results.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [16]:
# Import Dataset
import csv
import pandas as pd
#We are using the following dataset -> https://www.kaggle.com/snapcrack/all-the-news
csv_file = open('./csv/articles1.csv', 'r')
df = pd.read_csv(csv_file)
articles = df.content

In [17]:
# Convert to list
data = articles.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['WASHINGTON — Congressional Republicans have a new fear when it comes to '
 'their health care lawsuit against the Obama administration: They might win. '
 'The incoming Trump administration could choose to no longer defend the '
 'executive branch against the suit, which challenges the administration’s '
 'authority to spend billions of dollars on health insurance subsidies for and '
 'Americans, handing House Republicans a big victory on issues. But a sudden '
 'loss of the disputed subsidies could conceivably cause the health care '
 'program to implode, leaving millions of people without access to health '
 'insurance before Republicans have prepared a replacement. That could lead to '
 'chaos in the insurance market and spur a political backlash just as '
 'Republicans gain full control of the government. To stave off that outcome, '
 'Republicans could find themselves in the awkward position of appropriating '
 'huge sums to temporarily prop up the Obama health care law, angerin

In [18]:
# Convert each document into list of individual words, remove punctuation.
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['washington', 'congressional', 'republicans', 'have', 'new', 'fear', 'when', 'it', 'comes', 'to', 'their', 'health', 'care', 'lawsuit', 'against', 'the', 'obama', 'administration', 'they', 'might', 'win', 'the', 'incoming', 'trump', 'administration', 'could', 'choose', 'to', 'no', 'longer', 'defend', 'the', 'executive', 'branch', 'against', 'the', 'suit', 'which', 'challenges', 'the', 'administration', 'authority', 'to', 'spend', 'billions', 'of', 'dollars', 'on', 'health', 'insurance', 'subsidies', 'for', 'and', 'americans', 'handing', 'house', 'republicans', 'big', 'victory', 'on', 'issues', 'but', 'sudden', 'loss', 'of', 'the', 'disputed', 'subsidies', 'could', 'conceivably', 'cause', 'the', 'health', 'care', 'program', 'to', 'implode', 'leaving', 'millions', 'of', 'people', 'without', 'access', 'to', 'health', 'insurance', 'before', 'republicans', 'have', 'prepared', 'replacement', 'that', 'could', 'lead', 'to', 'chaos', 'in', 'the', 'insurance', 'market', 'and', 'spur', 'politic

In [19]:
# We will build bigram and trigrams in order to combine words which often occur together into a single word 
# eg.(White House becomes white-house).

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['washington', 'congressional', 'republicans', 'have', 'new', 'fear', 'when', 'it', 'comes', 'to', 'their', 'health_care', 'lawsuit', 'against', 'the', 'obama', 'administration', 'they', 'might', 'win', 'the', 'incoming', 'trump', 'administration', 'could', 'choose', 'to', 'no', 'longer', 'defend', 'the', 'executive_branch', 'against', 'the', 'suit', 'which', 'challenges', 'the', 'administration', 'authority', 'to', 'spend', 'billions', 'of', 'dollars', 'on', 'health_insurance', 'subsidies', 'for', 'and', 'americans', 'handing', 'house', 'republicans', 'big', 'victory', 'on', 'issues', 'but', 'sudden', 'loss', 'of', 'the', 'disputed', 'subsidies', 'could', 'conceivably', 'cause', 'the', 'health_care', 'program', 'to', 'implode', 'leaving', 'millions', 'of', 'people', 'without', 'access', 'to', 'health_insurance', 'before', 'republicans', 'have', 'prepared', 'replacement', 'that', 'could', 'lead', 'to', 'chaos', 'in', 'the', 'insurance', 'market', 'and', 'spur', 'political', 'backlash',

In [20]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [21]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['washington', 'congressional', 'republican', 'new', 'fear', 'come', 'health_care', 'lawsuit', 'obama', 'administration', 'may', 'win', 'incoming', 'trump', 'administration', 'could', 'choose', 'longer', 'defend', 'executive_branch', 'suit', 'challenge', 'administration', 'authority', 'spend', 'billion', 'dollar', 'health_insurance', 'subsidy', 'american', 'hand', 'house', 'republican', 'big', 'victory', 'issue', 'sudden', 'loss', 'dispute', 'subsidy', 'could', 'conceivably', 'cause', 'health_care', 'program', 'implode', 'leave', 'million', 'people', 'access', 'health_insurance', 'republican', 'prepared', 'replacement', 'could', 'lead', 'chaos', 'insurance', 'market', 'spur', 'political', 'backlash', 'republican', 'gain', 'full', 'control', 'government', 'stave', 'outcome', 'republican', 'could', 'find', 'awkward', 'position', 'appropriate', 'huge_sum', 'temporarily', 'prop', 'obama', 'health_care', 'law', 'anger', 'conservative', 'voter', 'demand', 'end', 'law', 'year', 'twist', 'don

In [22]:
# We will compute the frequency with which each word occurs in a document.

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('access', 1), ('acknowledge', 1), ('act', 1), ('administration', 13), ('advocate', 1), ('affordable_care', 1), ('ally', 1), ('american', 1), ('anger', 1), ('annual', 1), ('anticipate', 2), ('appeal', 4), ('appropriate', 2), ('appropriation', 1), ('approval', 1), ('aspect', 1), ('assert', 1), ('authority', 2), ('avoid', 1), ('awkward', 1), ('backlash', 1), ('behalf', 1), ('big', 1), ('billion', 1), ('blando', 1), ('broad', 1), ('capitol_hill', 1), ('cascade', 1), ('case', 4), ('cause', 2), ('central', 1), ('challenge', 1), ('champion', 1), ('chaos', 1), ('choose', 2), ('collyer', 3), ('columbia_circuit', 1), ('come', 3), ('comment', 1), ('committee', 1), ('complicate', 1), ('conceivably', 1), ('concept', 1), ('confidence', 1), ('confident', 1), ('congress', 5), ('congressional', 3), ('consequence', 1), ('conservative', 1), ('consider', 1), ('constitution', 3), ('consumer', 1), ('contend', 1), ('continue', 1), ('control', 1), ('cost', 2), ('could', 8), ('court', 2), ('coverage', 1), (

In [23]:
# TODO: WE MUST FIND MODEL WHICH MAXIMISES CONERENCE SCORE - configurable params: https://radimrehurek.com/gensim/models/wrappers/ldamallet.html
# CONFIGURE: 
# - num_topics (MAIN CONFIGURABLE PARAM)
# - iterations
# - topic threshold
def createModelAndComputeCoherence(minTopics, maxTopics, passes, chunkSize):
    # Build LDA model
    for topics in range(minTopics, maxTopics):
        print("Model with", topics, "topics,", passes, "passes &", chunkSize, "chunksize.")
        mallet_path = './mallet-2.0.8/bin/mallet' # update this path
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=topics, id2word=id2word)
        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score
        coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
        print('\nCoherence Score: ', coherence_ldamallet)
        # Visualize the topics
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(ldamallet, corpus, id2word)
        vis
#         lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                id2word=id2word,
#                                                num_topics=topics, 
#                                                random_state=100,
#                                                update_every=1,
#                                                chunksize=chunkSize,
#                                                passes=passes,
#                                                alpha='auto',
#                                                per_word_topics=True)
#         # Print the Keyword in the 10 topics
#         pprint(lda_model.print_topics())
#         doc_lda = lda_model[corpus]
#         # Compute Perplexity
#         print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
#         # Compute Coherence Score
#         coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
#         coherence_lda = coherence_model_lda.get_coherence()
#         print('\nCoherence Score: ', coherence_lda)


In [24]:
#createModelAndComputeCoherence(22,23,10,100)

In [25]:
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=24, id2word=id2word)
# Show Topics
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=optimal_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)



[(7,
  '0.023*"case" + 0.020*"charge" + 0.015*"court" + 0.010*"prison" + '
  '0.009*"judge" + 0.009*"year" + 0.009*"attorney" + 0.009*"lawyer" + '
  '0.009*"prosecutor" + 0.008*"crime"'),
 (9,
  '0.025*"city" + 0.013*"york" + 0.012*"day" + 0.012*"home" + 0.010*"people" + '
  '0.007*"street" + 0.007*"building" + 0.007*"park" + 0.007*"place" + '
  '0.007*"time"'),
 (23,
  '0.023*"game" + 0.020*"team" + 0.014*"play" + 0.011*"player" + 0.010*"win" + '
  '0.010*"year" + 0.010*"sport" + 0.008*"time" + 0.008*"world" + '
  '0.007*"season"'),
 (22,
  '0.025*"attack" + 0.015*"isis" + 0.014*"group" + 0.014*"terrorist" + '
  '0.014*"islamic" + 0.013*"syria" + 0.012*"state" + 0.012*"military" + '
  '0.012*"kill" + 0.011*"force"'),
 (5,
  '0.010*"book" + 0.008*"work" + 0.008*"time" + 0.007*"write" + 0.007*"world" '
  '+ 0.006*"make" + 0.006*"year" + 0.005*"art" + 0.005*"read" + 0.005*"image"'),
 (15,
  '0.044*"woman" + 0.031*"family" + 0.026*"child" + 0.018*"man" + 0.017*"life" '
  '+ 0.013*"young" 

In [30]:
# One of the practical application of topic modeling is to determine what topic a given document is about.
# To find that, we find the topic number that has the highest percentage contribution in that document.

def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(100)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,21.0,0.3813,"republican, bill, house, senate, democrat, pre...",WASHINGTON — Congressional Republicans have a ...
1,1,4.0,0.3132,"police, officer, man, gun, kill, shoot, report...","After the bullet shells get counted, the blood..."
2,2,5.0,0.3845,"book, work, time, write, world, make, year, ar...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,14.0,0.2284,"show, film, star, play, year, good, movie, ser...","Death may be the great equalizer, but it isn’t..."
4,4,12.0,0.5800,"china, country, president, russia, iran, israe...","SEOUL, South Korea — North Korea’s leader, Kim..."
5,5,17.0,0.1584,"party, country, europe, leave, migrant, britai...","LONDON — Queen Elizabeth II, who has been batt..."
6,6,12.0,0.5265,"china, country, president, russia, iran, israe...",BEIJING — President Tsai of Taiwan sharply cri...
7,7,13.0,0.5339,"study, health, drug, medical, find, people, ca...","Danny Cahill stood, slightly dazed, in a blizz..."
8,8,5.0,0.2403,"book, work, time, write, world, make, year, ar...","Just how is Hillary Kerr, the founder of a dig..."
9,9,15.0,0.4035,"woman, family, child, man, life, young, father...",Angels are everywhere in the Muñiz family’s ap...


In [27]:
# Sometimes just the topic keywords may not be enough to make sense of what a topic is about. 
# So, to help with understanding the topic, you can find the documents a given topic has contributed
# to the most and infer the topic by reading that document. 

# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.7716,"immigration, country, texas, border, report, m...","MATAMOROS, Tamaulipas — Los líderes de dos de ..."
1,1.0,0.5558,"news, breitbart, twitter, medium, post, follow...",Most of the mainstream media and the tech jour...
2,2.0,0.7655,"water, cnn, year, area, plane, fire, climate_c...",(CNN) Here is a look at the 2016 Atlantic hur...
3,3.0,0.7388,"clinton, trump, campaign, republican, candidat...","On Tuesday, Republicans in Idaho, Hawaii, Mich..."
4,4.0,0.8178,"police, officer, man, gun, kill, shoot, report...","’’ ’Police violence against civilians, particu..."


In [28]:
# Finally, we want to understand the volume and distribution 
# of topics in order to judge how widely it was discussed. 
# The below table exposes that information.

# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,21.0,"republican, bill, house, senate, democrat, pre...",1887.0,0.0377
1,4.0,"police, officer, man, gun, kill, shoot, report...",2595.0,0.0519
2,5.0,"book, work, time, write, world, make, year, ar...",2019.0,0.0404
3,14.0,"show, film, star, play, year, good, movie, ser...",4234.0,0.0847
4,12.0,"china, country, president, russia, iran, israe...",3074.0,0.0615
5,17.0,"party, country, europe, leave, migrant, britai...",1350.0,0.0270
6,12.0,"china, country, president, russia, iran, israe...",1231.0,0.0246
7,13.0,"study, health, drug, medical, find, people, ca...",1503.0,0.0301
8,5.0,"book, work, time, write, world, make, year, ar...",2074.0,0.0415
9,15.0,"woman, family, child, man, life, young, father...",1178.0,0.0236


In [31]:
# Visualize the topics
pyLDAvis.enable_notebook()
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(optimal_model)
vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
vis

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
