# Topic modeling first part
This notebook contains the implementation for the following:
* Load company data into DataFrame
* Do feature preparation that includes the following:
    - Remove words with 3 or fewer characters
    - Remove stop words
    - Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
    - Words are stemmed — words are reduced to their root form.
* Use LDA to compute the most likely topic for each company overview


# Reference
[topic model machinelearningplus](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#4whatdoesldado)

[topic model towardsdatascience](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

## Step 1: Load data and necessary libraries and corpus

In [2]:

import pandas as pd

# Location of the sample table, relative to the notebook's working directory.
csv_path = "Sample 2001_2011/2001-2011-Table 1.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the company name and overview text columns.
preview_columns = ['Company', 'Overview Text']
df[preview_columns].head()

Unnamed: 0,Company,Overview Text
0,Riverstone Networks,We are a leading provider of Internet infrastr...
1,Instinet Group LLC,We are the world's largest electronic agency s...
2,Alliance Data Systems,We are a leading provider of transaction servi...
3,Simplex Solutions,We provide software and services for integrate...
4,BAM! Entertainment,We are a rapidly emerging developer and publis...


In [4]:
import gensim
from gensim.models import CoherenceModel
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; no name from nltk.stem.porter is referenced in the cells shown in this notebook.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so steps that draw from it are repeatable.
np.random.seed(2018)
import nltk
# Download the WordNet data needed by WordNetLemmatizer (reports "already up-to-date" when present).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bohaocao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Step 2: data preprocessing

In [5]:
stemmer = SnowballStemmer('english')
# Built once and reused; previously a new WordNetLemmatizer was constructed for every token.
lemmatizer = WordNetLemmatizer()
def lemmatize_stemming(text):
    """
    Normalize a single token.

    The token is first lemmatized with WordNet, treating it as a verb (pos='v'),
    and the resulting lemma is then reduced to its stem with the English
    Snowball stemmer.

    Parameters
    ----------
    text : str
        A single token (one word), not a whole document.

    Returns
    -------
    str
        The stemmed lemma of the token.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))
def preprocess(text):
    """
    Tokenize a document and return its filtered, normalized tokens.

    Tokens are produced by gensim's simple_preprocess. A token is dropped when
    it is in gensim's STOPWORDS set or when it has 3 or fewer characters (the
    check is ``len(token) > 3``, so only tokens of 4+ characters are kept).
    Every kept token is passed through lemmatize_stemming.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [6]:
# Run the preprocessing pipeline over every company overview.
overview_text = df['Overview Text']
processed_docs = overview_text.map(preprocess)

In [7]:
# processed_docs holds the preprocessed "features": each company's overview text is now a list of processed words.
processed_docs

0      [lead, provid, internet, infrastructur, equip,...
1      [world, largest, electron, agenc, secur, broke...
2      [lead, provid, transact, servic, credit, servi...
3      [provid, softwar, servic, integr, circuit, des...
4      [rapid, emerg, develop, publish, interact, ent...
                             ...                        
117    [provid, home, improv, inform, servic, interne...
118    [oper, consum, drive, solut, member, research,...
119    [jive, mission, chang, work, get, believ, soci...
120    [boingo, make, simpl, connect, mobil, internet...
121    [lead, provid, carrier, neutral, coloc, data, ...
Name: Overview Text, Length: 122, dtype: object

In [8]:
"""
Create a dictionary from ‘processed_docs’ containing the preprocessed words and their ids.
"""
# Map every distinct processed token to an integer id.
dictionary = corpora.Dictionary(documents=processed_docs)
len(dictionary)

3215

In [9]:
"""
Filter out tokens that appear in:
fewer than 5 documents (absolute number) or
more than 50% of the documents (fraction of total corpus size, not absolute number).
After the above two steps, keep only the 100000 most frequent tokens.
"""

dictionary.filter_extremes(
    no_below=5,     # minimum number of documents a token must appear in
    no_above=0.5,   # maximum fraction of documents a token may appear in
    keep_n=100000,  # cap on the number of tokens kept afterwards
)
len(dictionary)

888

In [10]:
"""
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
"""
# One bag-of-words vector per processed document, in document order.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [11]:
# Fit a tf-idf model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

In [12]:
def create_model_and_report_metrics(corpus, is_tfidf, num_topics, texts, id2word):
    """
    Train an LDA model and print its coherence score and log-perplexity bound.

    Parameters
    ----------
    corpus : iterable
        The corpus to train on: either the bag-of-words corpus or its
        tf-idf transformed version.
    is_tfidf : bool
        True when ``corpus`` is tf-idf weighted, False for plain bag of words.
        Only used to label the printed report.
    num_topics : int
        Number of topics to fit.
    texts : iterable of list of str
        The tokenized documents, used for the 'c_v' coherence measure.
    id2word : gensim.corpora.Dictionary
        Token-id mapping used both to train the model and to score coherence.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=id2word, passes=2, workers=4)

    # Compute Coherence Score.
    # Score against the dictionary that was passed in (id2word) rather than a
    # notebook-level global, so the function is correct for any caller.
    coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"model artifacts: num_topics:{num_topics}, is_tfidf:{is_tfidf}")
    print('Coherence Score for: ', coherence_lda)

    # NOTE(review): gensim documents log_perplexity as returning the per-word
    # likelihood bound, with perplexity = 2^(-bound). Under that definition a
    # value closer to zero means LOWER perplexity; confirm before comparing runs.
    print(f'Perplexity: {model.log_perplexity(corpus)}')

    return model

In [13]:
# Sweep the topic count from 5 to 30 in steps of 5 on the bag-of-words corpus.
num_topics_choices = list(range(5, 31, 5))
for num_topics in num_topics_choices:
    create_model_and_report_metrics(
        corpus=bow_corpus,
        is_tfidf=False,
        num_topics=num_topics,
        texts=processed_docs,
        id2word=dictionary,
    )

model artifacts: num_topics:5, is_tfidf:False
Coherence Score for:  0.26322951992943405
Perplexity: -6.489503261554325
model artifacts: num_topics:10, is_tfidf:False
Coherence Score for:  0.314644582351184
Perplexity: -6.618414945326977
model artifacts: num_topics:15, is_tfidf:False
Coherence Score for:  0.28617206875863677
Perplexity: -6.761746520052021
model artifacts: num_topics:20, is_tfidf:False
Coherence Score for:  0.28637884310156664
Perplexity: -6.93826133993981
model artifacts: num_topics:25, is_tfidf:False
Coherence Score for:  0.27679669011219155
Perplexity: -7.0510043628415096
model artifacts: num_topics:30, is_tfidf:False
Coherence Score for:  0.3001195510235722
Perplexity: -7.184997439144871


## When the number of topics is 10 or 30, the coherence score is highest.
We have reservations about 30 topics, as the result is too fragmented.
10 topics could be a good candidate for the number of topics.

In [14]:
# Same sweep of topic counts (5 to 30 in steps of 5), this time on the tf-idf corpus.
num_topics_choices = list(range(5, 31, 5))
for num_topics in num_topics_choices:
    create_model_and_report_metrics(
        corpus=tfidf_corpus,
        is_tfidf=True,
        num_topics=num_topics,
        texts=processed_docs,
        id2word=dictionary,
    )

model artifacts: num_topics:5, is_tfidf:True
Coherence Score for:  0.31184845526771676
Perplexity: -9.020624400098527
model artifacts: num_topics:10, is_tfidf:True
Coherence Score for:  0.3198289966666304
Perplexity: -10.891994555185837
model artifacts: num_topics:15, is_tfidf:True
Coherence Score for:  0.3162272119929373
Perplexity: -12.493137176564023
model artifacts: num_topics:20, is_tfidf:True
Coherence Score for:  0.3016396412496134
Perplexity: -13.764095644214285
model artifacts: num_topics:25, is_tfidf:True
Coherence Score for:  0.29040513651075045
Perplexity: -14.52806967155888
model artifacts: num_topics:30, is_tfidf:True
Coherence Score for:  0.287488957952191
Perplexity: -15.260933775997827


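With tfidf, the coherence score is pretty stable, while the reported perplexity value keeps getting lower and lower. Caveat: gensim's `log_perplexity` returns the per-word likelihood bound (perplexity = 2^(-bound)), so a more negative value corresponds to a higher perplexity; the bag-of-words and tfidf values are also computed on differently weighted corpora, so they should not be compared directly.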

Pick num of topic *10* for both models.

In [15]:
# Final 10-topic model trained on the bag-of-words corpus (despite the name, this holds a model).
corpus_10 = create_model_and_report_metrics(
    corpus=bow_corpus,
    is_tfidf=False,
    num_topics=10,
    texts=processed_docs,
    id2word=dictionary,
)

model artifacts: num_topics:10, is_tfidf:False
Coherence Score for:  0.29702969503446597
Perplexity: -6.639037666593046


In [16]:
# Final 10-topic model trained on the tf-idf corpus (despite the name, this holds a model).
corpus_tfidf_10 = create_model_and_report_metrics(
    corpus=tfidf_corpus,
    is_tfidf=True,
    num_topics=10,
    texts=processed_docs,
    id2word=dictionary,
)

model artifacts: num_topics:10, is_tfidf:True
Coherence Score for:  0.2885590586054497
Perplexity: -10.699626170300846


## Step 3 :Visualize the topics

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.


In [18]:
import pyLDAvis
# The submodule is imported explicitly because pyLDAvis.gensim.prepare is called below.
import pyLDAvis.gensim  # don't skip this
# NOTE(review): plt is not used in any of the cells shown in this notebook.
import matplotlib.pyplot as plt
%matplotlib inline

# Presumably switches pyLDAvis to inline notebook rendering - confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()
# Build the visualization for the 10-topic bag-of-words model, using the same corpus and dictionary it was trained with.
vis = pyLDAvis.gensim.prepare(corpus_10, bow_corpus, dictionary)
vis

In [20]:
# Build the same visualization for the 10-topic tf-idf model, paired with the tf-idf corpus it was trained on.
# Note that this rebinds `vis`, replacing the bag-of-words visualization object.
vis = pyLDAvis.gensim.prepare(corpus_tfidf_10, tfidf_corpus, dictionary)
vis

The screenshots folder shows visualizations of the intertopic distance map for the 10-topic model and the 10-topic model with tfidf.

You can see that the topics of the plain (bag-of-words) 10-topic model are more evenly spread, so this is the model chosen for the following steps.
## Print topics for model 10


In [21]:
# Show the topics of the bag-of-words model as weighted word lists (weight*"word" + ...).
corpus_10.print_topics()

[(0,
  '0.021*"data" + 0.016*"inform" + 0.015*"health" + 0.014*"consum" + 0.009*"financ" + 0.008*"user" + 0.008*"content" + 0.008*"person" + 0.008*"healthcar" + 0.007*"activ"'),
 (1,
  '0.012*"email" + 0.011*"user" + 0.011*"inform" + 0.010*"profession" + 0.009*"payment" + 0.008*"advertis" + 0.008*"small" + 0.008*"game" + 0.008*"search" + 0.007*"merchant"'),
 (2,
  '0.025*"mobil" + 0.013*"user" + 0.010*"connect" + 0.009*"enterpris" + 0.009*"advertis" + 0.009*"data" + 0.007*"global" + 0.007*"deliv" + 0.007*"march" + 0.006*"secur"'),
 (3,
  '0.012*"advertis" + 0.011*"email" + 0.010*"mobil" + 0.009*"billion" + 0.009*"secur" + 0.009*"consum" + 0.008*"data" + 0.008*"fiscal" + 0.007*"music" + 0.007*"digit"'),
 (4,
  '0.020*"improv" + 0.019*"home" + 0.016*"process" + 0.014*"project" + 0.009*"advertis" + 0.008*"supplier" + 0.008*"design" + 0.008*"inform" + 0.007*"platform" + 0.006*"billion"'),
 (5,
  '0.020*"purchas" + 0.019*"payment" + 0.015*"supplier" + 0.015*"hotel" + 0.012*"order" + 0.011*"

In [50]:
# Print the topics assigned to the document at position 2, highest score first.
lda_model = corpus_10
doc_topics = lda_model[bow_corpus[2]]
ranked_topics = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    topic_text = lda_model.print_topic(index, 10)
    print("\nScore: {}\t \nTopic: {}".format(score, topic_text))


Score: 0.8113599419593811	 
Topic: 0.008*"process" + 0.008*"payment" + 0.007*"price" + 0.007*"enterpris" + 0.007*"order" + 0.006*"transact" + 0.006*"retail" + 0.006*"data" + 0.006*"design" + 0.006*"manufactur"

Score: 0.17300157248973846	 
Topic: 0.020*"client" + 0.010*"marketplac" + 0.009*"home" + 0.009*"member" + 0.008*"mobil" + 0.008*"digit" + 0.008*"quarter" + 0.008*"septemb" + 0.007*"email" + 0.007*"local"


In [61]:
# Scratch check of the parsing used later: split a print_topic string into terms, then a term into weight and word.
s = """0.008*"process" + 0.008*"payment" + 0.007*"price" + 0.007*"enterpris" + 0.007*"order" + 0.006*"transact" + 0.006*"retail" + 0.006*"data" + 0.006*"design" + 0.006*"manufactur"""
first_term = s.split("+")[0]
first_term.split("*")[1].strip()

'"process"'

## Output topic

In [22]:
# Corpora available at notebook level: bow_corpus (raw counts) and tfidf_corpus (tf-idf weighted).

def output_topics(model, is_tfidf, corpus=None):
    """
    Collect, for every document, the top words of its highest-scoring topic.

    For each document the topic with the highest score under ``model`` is
    selected, and one row is emitted per word of that topic (10 words, as
    returned by ``model.print_topic(index, 10)``).

    Parameters
    ----------
    model : trained topic model
        Must support ``model[document]`` returning (topic_index, score) pairs
        and ``model.print_topic(topic_index, n)``.
    is_tfidf : bool
        When ``corpus`` is not given, selects the notebook-level corpus:
        ``tfidf_corpus`` if True, ``bow_corpus`` if False.
    corpus : iterable, optional
        Documents to score. Overrides the ``is_tfidf`` selection when given.

    Returns
    -------
    dict of lists
        Column-oriented data with keys "index" (document position), "score"
        (score of the dominant topic), "topic" (a topic word, still wrapped in
        the double quotes produced by print_topic) and "prob" (that word's
        weight as printed, i.e. rounded to three decimals).
    """
    if corpus is None:
        # The tf-idf corpus is named `tfidf_corpus` in this notebook; the
        # previous name `corpus_tfidf` was never defined and raised NameError.
        corpus = tfidf_corpus if is_tfidf else bow_corpus
    d = {"index": [], "score": [], "topic": [], "prob": []}
    for item_index, item in enumerate(corpus):
        # Only the dominant topic is reported; on ties the first one listed wins.
        best = max(model[item], key=lambda tup: tup[1], default=None)
        if best is None:
            # No topic was assigned to this document, so it contributes no rows.
            continue
        index, score = best
        # print_topic returns a string such as: 0.012*"content" + 0.011*"design" + ...
        for topic_score_str in model.print_topic(index, 10).split("+"):
            prob_str, word = topic_score_str.split("*", 1)
            d['index'].append(item_index)
            d['score'].append(score)
            d['topic'].append(word.strip())
            d['prob'].append(float(prob_str.strip()))
    return d

In [24]:
# Tabulate the dominant-topic words for every document under the bag-of-words model.
d = output_topics(model=corpus_10, is_tfidf=False)
corpus_10_result = pd.DataFrame(data=d)
corpus_10_result

Unnamed: 0,index,score,topic,prob
0,0,0.986761,"""content""",0.012
1,0,0.986761,"""design""",0.011
2,0,0.986761,"""integr""",0.011
3,0,0.986761,"""data""",0.011
4,0,0.986761,"""advertis""",0.011
...,...,...,...,...
1215,121,0.616209,"""data""",0.009
1216,121,0.616209,"""global""",0.007
1217,121,0.616209,"""deliv""",0.007
1218,121,0.616209,"""march""",0.007


In [119]:
# Tabulate the dominant-topic words for every document under the tf-idf model.
# `model_10` is not defined anywhere in this notebook; the 10-topic tf-idf
# model created above is `corpus_tfidf_10`, so that is what is passed here.
d = output_topics(corpus_tfidf_10, True)
model_10_result = pd.DataFrame(d)
model_10_result

Unnamed: 0,index,score,topic,prob
0,0,0.861899,"""client""",0.018
1,0,0.861899,"""channel""",0.017
2,0,0.861899,"""sell""",0.014
3,0,0.861899,"""tradit""",0.013
4,0,0.861899,"""interact""",0.013
...,...,...,...,...
1215,121,0.868080,"""perform""",0.015
1216,121,0.868080,"""platform""",0.013
1217,121,0.868080,"""design""",0.012
1218,121,0.868080,"""demand""",0.011
