In [3]:
# Reference
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#4whatdoesldado
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import pandas as pd

df = pd.read_csv("Sample 2001_2011/2001-2011-Table 1.csv")

In [4]:
df[['Company','Overview Text']]

Unnamed: 0,Company,Overview Text
0,Riverstone Networks,We are a leading provider of Internet infrastr...
1,Instinet Group LLC,We are the world's largest electronic agency s...
2,Alliance Data Systems,We are a leading provider of transaction servi...
3,Simplex Solutions,We provide software and services for integrate...
4,BAM! Entertainment,We are a rapidly emerging developer and publis...
...,...,...
117,Imperva,We provide home improvement information and se...
118,Angie's List,We operate a consumer-driven solution for our ...
119,Jive Software,Jive’s mission is to change the way that work ...
120,Boingo Wireless Inc,Boingo makes it simple to connect to the mobil...


In [5]:
import gensim
from gensim.models import CoherenceModel
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bohaocao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    """
    Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
    Words are stemmed — words are reduced to their root form.
    """
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    """
    Words that have fewer than 3 characters are removed.
    All stopwords are removed.
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [7]:
processed_docs = df['Overview Text'].map(preprocess)

In [8]:
"""
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.
Filter:
less than 15 documents (absolute number) or
more than 0.5 documents (fraction of total corpus size, not absolute number).
after the above two steps, keep only the first 100000 most frequent tokens.
"""
dictionary = gensim.corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
"""
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
"""
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [10]:
def create_model_and_report_metrics(corpus, is_tfidf, num_topics, texts, id2word):
    model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=id2word, passes=2, workers=4)

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"model artifacts: num_topics:{num_topics}, is_tfidf:{is_tfidf}")
    print(f'Coherence Score for: ', coherence_lda)

    # a measure of how good the model is. lower the better.
    print(f'Perplexity: {model.log_perplexity(corpus)}')  

    return model


In [12]:
num_topics_choices = [5,10,15,20,25,30]
for num_topics in num_topics_choices:
    create_model_and_report_metrics(bow_corpus, False, num_topics, processed_docs, dictionary)

model artifacts: num_topics:5, is_tfidf:False
Coherence Score for:  0.2720587551107571
Perplexity: -5.7503308697809254
model artifacts: num_topics:10, is_tfidf:False
Coherence Score for:  0.2978269505678698
Perplexity: -5.827677283437225
model artifacts: num_topics:15, is_tfidf:False
Coherence Score for:  0.2972552215639094
Perplexity: -5.942355759226626
model artifacts: num_topics:20, is_tfidf:False
Coherence Score for:  0.28326549617851404
Perplexity: -6.08299227299289
model artifacts: num_topics:25, is_tfidf:False
Coherence Score for:  0.2987650974396573
Perplexity: -6.197751448790713
model artifacts: num_topics:30, is_tfidf:False
Coherence Score for:  0.28256270775351816
Perplexity: -6.3431941273718655


In [12]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [15]:
num_topics_choices = [5,10,15,20,25, 30]
for num_topics in num_topics_choices:
    create_model_and_report_metrics(corpus_tfidf, True, num_topics, processed_docs, dictionary)

model artifacts: num_topics:5, is_tfidf:True
Coherence Score for:  0.22482466034977894
Perplexity: -7.621217156608712
model artifacts: num_topics:10, is_tfidf:True
Coherence Score for:  0.24591223680551427
Perplexity: -9.033099534341979
model artifacts: num_topics:15, is_tfidf:True
Coherence Score for:  0.24636828108698008
Perplexity: -10.669660227886535
model artifacts: num_topics:20, is_tfidf:True
Coherence Score for:  0.2208224812445641
Perplexity: -10.61681605558313
model artifacts: num_topics:25, is_tfidf:True
Coherence Score for:  0.2439456411198218
Perplexity: -12.040754125760753
model artifacts: num_topics:30, is_tfidf:True
Coherence Score for:  0.2583301841544974
Perplexity: -12.314428811655548


In [15]:
corpus_tfidf_8 = create_model_and_report_metrics(corpus_tfidf, True, 8, processed_docs, dictionary)

model artifacts: num_topics:8, is_tfidf:True
Coherence Score for:  0.24033243088083076
Perplexity: -8.291428334698


In [27]:
corpus_tfidf_15 = create_model_and_report_metrics(corpus_tfidf, True, 15, processed_docs, dictionary)

model artifacts: num_topics:15, is_tfidf:True
Coherence Score for:  0.22033667877168436
Perplexity: -10.50494640902069


In [17]:
corpus_tfidf_5 = create_model_and_report_metrics(corpus_tfidf, True, 5, processed_docs, dictionary)

model artifacts: num_topics:5, is_tfidf:True
Coherence Score for:  0.2525193179978911
Perplexity: -7.468032459587686


In [24]:
model_5 = create_model_and_report_metrics(bow_corpus, False, 5, processed_docs, dictionary)

model artifacts: num_topics:5, is_tfidf:False
Coherence Score for:  0.24266101376693414
Perplexity: -5.731501961453066


In [20]:
# Visualize the topics
# A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(corpus_tfidf_8, corpus_tfidf, dictionary)
vis

In [30]:
vis = pyLDAvis.gensim.prepare(corpus_tfidf_5, corpus_tfidf, dictionary)
vis

In [26]:
vis = pyLDAvis.gensim.prepare(model_5, bow_corpus, dictionary)
vis

In [31]:
model_10 = create_model_and_report_metrics(bow_corpus, False, 10, processed_docs, dictionary)
vis = pyLDAvis.gensim.prepare(model_10, bow_corpus, dictionary)
vis

model artifacts: num_topics:10, is_tfidf:False
Coherence Score for:  0.3301357369722567
Perplexity: -5.815200315092786


In [34]:
model_10.print_topics()

[(0,
  '0.030*"user" + 0.028*"advertis" + 0.028*"mobil" + 0.017*"connect" + 0.014*"devic" + 0.012*"consum" + 0.012*"media" + 0.012*"comput" + 0.011*"design" + 0.011*"platform"'),
 (1,
  '0.021*"content" + 0.019*"purchas" + 0.017*"user" + 0.017*"process" + 0.016*"inform" + 0.014*"mobil" + 0.012*"search" + 0.011*"consum" + 0.011*"advertis" + 0.011*"interact"'),
 (2,
  '0.026*"payment" + 0.025*"process" + 0.017*"account" + 0.014*"transact" + 0.012*"secur" + 0.012*"host" + 0.012*"system" + 0.012*"total" + 0.011*"design" + 0.010*"quarter"'),
 (3,
  '0.035*"client" + 0.026*"invest" + 0.022*"financi" + 0.021*"incom" + 0.021*"platform" + 0.019*"total" + 0.016*"fiscal" + 0.016*"secur" + 0.015*"loss" + 0.014*"account"'),
 (4,
  '0.029*"septemb" + 0.026*"retail" + 0.020*"capit" + 0.017*"gain" + 0.016*"repres" + 0.016*"decemb" + 0.015*"platform" + 0.013*"respect" + 0.013*"volum" + 0.012*"total"'),
 (5,
  '0.027*"project" + 0.025*"improv" + 0.023*"advertis" + 0.021*"inform" + 0.020*"home" + 0.013*"

In [50]:
lda_model = corpus_tfidf_5
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8113599419593811	 
Topic: 0.008*"process" + 0.008*"payment" + 0.007*"price" + 0.007*"enterpris" + 0.007*"order" + 0.006*"transact" + 0.006*"retail" + 0.006*"data" + 0.006*"design" + 0.006*"manufactur"

Score: 0.17300157248973846	 
Topic: 0.020*"client" + 0.010*"marketplac" + 0.009*"home" + 0.009*"member" + 0.008*"mobil" + 0.008*"digit" + 0.008*"quarter" + 0.008*"septemb" + 0.007*"email" + 0.007*"local"


In [61]:
s = """0.008*"process" + 0.008*"payment" + 0.007*"price" + 0.007*"enterpris" + 0.007*"order" + 0.006*"transact" + 0.006*"retail" + 0.006*"data" + 0.006*"design" + 0.006*"manufactur"""
[*s.split("+")][0].split("*")[1].strip()

'"process"'

In [117]:
# bow_corpus
# corpus_tfidf

def output_topics(model, is_tfidf):
    corpus = corpus_tfidf if is_tfidf else bow_corpus
    d = {"index": [], "score":[], "topic":[], "prob": []}
    for item_index, item in enumerate(corpus):
        for index, score in sorted(model[item], key=lambda tup: -1*tup[1]):
            topic_str = model.print_topic(index, 10)
            #print(topic_str)
            topic_scores_str = [*topic_str.split("+")]
            #print(topic_scores_str)
            for topic_score_str in topic_scores_str:
                s =[*topic_score_str.split("*")]
                #print(s)
                prob = float(s[0].strip())
                topic = s[1].strip()
                d['index'].append(item_index)
                d['score'].append(score)
                d['topic'].append(topic)
                d['prob'].append(prob)
            break
    return d

In [118]:
d = output_topics(corpus_tfidf_8, True)
corpus_tfidf_8_result = pd.DataFrame(d)
corpus_tfidf_8_result

Unnamed: 0,index,score,topic,prob
0,0,0.675611,"""client""",0.013
1,0,0.675611,"""advertis""",0.011
2,0,0.675611,"""member""",0.009
3,0,0.675611,"""communic""",0.009
4,0,0.675611,"""interact""",0.008
...,...,...,...,...
1215,121,0.871492,"""email""",0.008
1216,121,0.871492,"""channel""",0.008
1217,121,0.871492,"""digit""",0.008
1218,121,0.871492,"""profession""",0.008


In [119]:
d = output_topics(model_10, True)
model_10_result = pd.DataFrame(d)
model_10_result

Unnamed: 0,index,score,topic,prob
0,0,0.861899,"""client""",0.018
1,0,0.861899,"""channel""",0.017
2,0,0.861899,"""sell""",0.014
3,0,0.861899,"""tradit""",0.013
4,0,0.861899,"""interact""",0.013
...,...,...,...,...
1215,121,0.868080,"""perform""",0.015
1216,121,0.868080,"""platform""",0.013
1217,121,0.868080,"""design""",0.012
1218,121,0.868080,"""demand""",0.011


In [121]:
model_10_result[model_10_result['index']==1]

Unnamed: 0,index,score,topic,prob
10,1,0.901234,"""payment""",0.026
11,1,0.901234,"""process""",0.025
12,1,0.901234,"""account""",0.017
13,1,0.901234,"""transact""",0.014
14,1,0.901234,"""secur""",0.012
15,1,0.901234,"""host""",0.012
16,1,0.901234,"""system""",0.012
17,1,0.901234,"""total""",0.012
18,1,0.901234,"""design""",0.011
19,1,0.901234,"""quarter""",0.01
