In [94]:
# Source: https://www.oneoffcoder.com/2019/10/02/latent-dirichlet-allocation/#topic=0&lambda=0.01&term=

import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go



In [63]:
def clean(text):
    """Turn a raw title into a list of normalised tokens.

    The text is lower-cased, split on whitespace, stripped of stop words,
    and every remaining token is lemmatised and then stemmed.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    tokens = text.lower().strip().split()
    kept = remove_stop_words(tokens)
    # Lemmatise first, then stem the lemma.
    return [get_stem(get_lemma(token)) for token in kept]

def get_stem(w):
    """Return the Porter stem of a single token.

    The stemmer is created on first use and cached on the function object,
    so processing many tokens does not rebuild a ``PorterStemmer`` each time.

    Parameters
    ----------
    w : str
        Token to stem.

    Returns
    -------
    str
        The stemmed token.
    """
    stemmer = getattr(get_stem, '_stemmer', None)
    if stemmer is None:
        # Lazy construction keeps defining this function free of side effects.
        stemmer = get_stem._stemmer = PorterStemmer()
    return stemmer.stem(w)
    
def get_lemma(w):
    """Return the WordNet base form of a token.

    Falls back to the token itself when ``wn.morphy`` yields ``None``.

    Parameters
    ----------
    w : str
        Token to lemmatise.

    Returns
    -------
    str
        The lemma, or ``w`` unchanged if no lemma was found.
    """
    base_form = wn.morphy(w)
    if base_form is None:
        return w
    return base_form
    
def remove_stop_words(tokens):
    """Drop English stop words from a sequence of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. They are compared as-is against NLTK's English
        stop-word list and must be hashable.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    # A frozenset gives constant-time membership tests; checking each token
    # against the raw list would rescan the whole list for every token.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]



def learn_lda_model(corpus, dictionary, k):
    """Fit an LDA model with ``k`` topics and report its u_mass coherence.

    The model is trained with a fixed random seed (37), 100 iterations and
    5 passes. The coherence is printed as ``"<k>: <coherence>"``.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus passed to ``LdaModel`` and ``CoherenceModel``.
    dictionary
        Id-to-word mapping handed to ``LdaModel`` as ``id2word``.
    k : int
        Number of topics.

    Returns
    -------
    tuple
        ``(k, coherence)``.
    """
    topic_model = LdaModel(
        corpus=corpus,
        num_topics=k,
        id2word=dictionary,
        passes=5,
        iterations=100,
        random_state=37,
        per_word_topics=False,
    )
    score = CoherenceModel(
        model=topic_model, corpus=corpus, coherence='u_mass'
    ).get_coherence()
    print('{}: {}'.format(k, score))
    return k, score

def get_kmc(X, k):
    """Cluster ``X`` into ``k`` groups with k-means and score the result.

    A ``KMeans`` model (random seed 37) is fitted on ``X``, the same data is
    then assigned to clusters, and the silhouette score of that assignment
    is printed as ``"<k>: score=<score>"``.

    Parameters
    ----------
    X
        Feature matrix passed to ``KMeans`` and ``silhouette_score``.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(k, silhouette score)``.
    """
    clusterer = KMeans(n_clusters=k, random_state=37)
    assignments = clusterer.fit(X).predict(X)
    silhouette = silhouette_score(X, assignments)
    print('{}: score={}'.format(k, silhouette))
    return k, silhouette


In [95]:
texts = [
    'The Art of Computer Programming',
    'Computer Programming Learn Any Programming Language In 2 Hours',
    'The Self-Taught Programmer The Definitive Guide to Programming Professionally',
    'The Complete Software Developers Career Guide How to Learn Your Next Programming Language',
    'Cracking the Coding Interview 189 Programming Questions and Solutions',
    'The Economics Book Big Ideas Simply Explained',
    'Economics in One Lesson The Shortest and Surest Way to Understand Basic Economics',
    'Basic Economics',
    'Aftermath Seven Secrets of Wealth Preservation in the Coming Chaos',
    'Economics 101 From Consumer Behavior to Competitive Markets Everything You Need to Know About Economics'
]

# From here on `texts` holds token lists (one per title), not raw strings.
texts = list(map(clean, texts))

# Build the vocabulary, prune it with no_below=3, then encode every
# tokenised title as a bag of words over the pruned vocabulary.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3)
corpus = list(map(dictionary.doc2bow, texts))

# One LDA fit per topic count from 1 to 9; each entry is (k, coherence).
lda_scores = [
    learn_lda_model(corpus, dictionary, num_topics)
    for num_topics in range(1, 10)
]

1: -26.714730384054395
2: -26.8263021597115
3: -26.863492751597203
4: -26.88208804754005
5: -26.848616514842924
6: -26.9006833434829
7: -26.874118634993117
8: -26.88208804754005
9: -26.863492751597203


In [96]:
# Tabulate both score series and align them on the number of topics/clusters.
# NOTE(review): in the visible part of the notebook `kmc_scores` is only
# assigned in the "Toy Example" cell further down, so on a fresh kernel that
# cell has to run before this one -- confirm the intended cell order.
lda_scores_df = pd.DataFrame(data=lda_scores, columns=['Number of Topics', 'LDA Score'])
kmc_scores_df = pd.DataFrame(data=kmc_scores, columns=['Number of Topics', 'KMC Score'])
# Default (inner) join: only topic counts scored by both methods are kept.
df = pd.merge(lda_scores_df, kmc_scores_df, on='Number of Topics')
df

Unnamed: 0,Number of Topics,LDA Score,KMC Score
0,2,-26.826302,0.072128
1,3,-26.863493,0.056378
2,4,-26.882088,0.053284
3,5,-26.848617,0.055351
4,6,-26.900683,0.055793
5,7,-26.874119,0.060051
6,8,-26.882088,0.047514


In [104]:
# Side-by-side comparison of the LDA coherence (left) and the k-means
# silhouette score (right) against the number of topics/clusters. The two
# scores are on very different scales, so the panels do not share a y-axis.
first_line = go.Scatter(x=df["Number of Topics"], y=df["LDA Score"], name="LDA")
second_line = go.Scatter(x=df["Number of Topics"], y=df["KMC Score"], name="KMC")
# Create the figure once, with one column per trace; a third column would
# only render an empty panel.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False)
fig.add_trace(first_line, row=1, col=1)
fig.add_trace(second_line, row=1, col=2)
# Label the axes so each panel can be read on its own.
fig.update_xaxes(title_text="Number of Topics")
fig.update_yaxes(title_text="LDA Score", row=1, col=1)
fig.update_yaxes(title_text="KMC Score", row=1, col=2)
fig.show()

In [97]:
# Line chart of the k-means score for each number of topics/clusters.
fig = px.line(
    data_frame=df,
    x="Number of Topics",
    y="KMC Score",
    title='Number of Topics',
    template='simple_white',
)
fig.show()

In [98]:
# Line chart of the LDA coherence score for each number of topics.
fig = px.line(
    data_frame=df,
    x="Number of Topics",
    y="LDA Score",
    title='Number of Topics',
    template='simple_white',
)
fig.show()

# Toy Example

In [48]:
texts = [
    'The Art of Computer Programming',
    'Computer Programming Learn Any Programming Language In 2 Hours',
    'The Self-Taught Programmer The Definitive Guide to Programming Professionally',
    'The Complete Software Developers Career Guide How to Learn Your Next Programming Language',
    'Cracking the Coding Interview 189 Programming Questions and Solutions',
    'The Economics Book Big Ideas Simply Explained',
    'Economics in One Lesson The Shortest and Surest Way to Understand Basic Economics',
    'Basic Economics',
    'Aftermath Seven Secrets of Wealth Preservation in the Coming Chaos',
    'Economics 101 From Consumer Behavior to Competitive Markets Everything You Need to Know About Economics'
]

# TF-IDF features of the raw titles, wrapped in a sparse-backed DataFrame.
vectorizer = TfidfVectorizer()
X = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(texts))

# One k-means fit per cluster count from 2 to 6; each entry is (k, score).
# NOTE(review): the score-table cell earlier in the notebook reads
# `kmc_scores`, so this cell currently has to run before it. The saved table
# also lists KMC scores up to 8 clusters while this range stops at 6, which
# suggests the saved outputs come from an earlier run -- confirm.
kmc_scores = [get_kmc(X, n_clusters) for n_clusters in range(2, 7)]


2: score=0.07212813769982696
3: score=0.05637816655670859
4: score=0.053284212839352085
5: score=0.05535141570965124
6: score=0.05579317332890603
