- https://nlpforhackers.io/topic-modeling/

In [46]:
from nltk.corpus import brown
import re
from gensim import models, corpora, similarities
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
# Token filter, compiled once from a raw string. In a normal string literal
# `\-` is an invalid escape sequence (Python warns about it); the raw string
# yields exactly the same regex without the warning.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize ``text``, keeping only word-like, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` on the lower-cased text, minus
        any token found in the module-level ``STOPWORDS`` and any token that
        does not begin with at least three letters/hyphens.

    Notes
    -----
    ``STOPWORDS`` is looked up when the function is called, so it must be
    defined before the first call (not necessarily before this definition).
    The pattern is only anchored at the start of the token, so a token such
    as ``abc1`` still passes the filter.
    """
    tokenized_text = word_tokenize(text.lower())
    return [t for t in tokenized_text
            if t not in STOPWORDS and _TOKEN_PATTERN.match(t)]
 

  cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]


In [48]:
# One whitespace-joined string per file of the Brown corpus
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
# NOTE(review): this prints five complete documents, which is a lot of output
print(data[:5])
 

500


In [49]:
# Number of topics every model in this notebook is asked to extract
NUM_TOPICS = 10
# clean_text() tests every token with `t not in STOPWORDS`; a frozenset makes
# that check a hash lookup instead of a scan over the whole stopword list.
STOPWORDS = frozenset(stopwords.words('english'))


In [50]:
# gensim expects pre-tokenized input: run every document through clean_text(),
# which tokenizes it and drops the stopwords
tokenized_data = [clean_text(text) for text in data]

In [51]:
# Map every distinct token to an integer id
dictionary = corpora.Dictionary(tokenized_data)

# Convert each token list into its bag-of-words form
corpus = list(map(dictionary.doc2bow, tokenized_data))

# Inspect the document at index 20 (i.e. the 21st one): [(word_id, count), ...]
print(corpus[20])

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [52]:
# Build the LDA model.
# LDA training starts from a random initialisation, so the seed is pinned to
# make the printed topics reproducible across kernel restarts.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                            random_state=42)

# Build the LSI model on the same corpus and vocabulary
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
 

In [53]:
# For each gensim model, print every topic as a weighted sum of its
# 10 strongest words, followed by a divider line.
for heading, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)
    for idx in range(NUM_TOPICS):
        print("Topic #%s:" % idx, topic_model.print_topic(idx, 10))
    print("=" * 20)

LDA Model:
Topic #0: 0.007*"one" + 0.006*"would" + 0.004*"said" + 0.003*"new" + 0.003*"could" + 0.003*"time" + 0.003*"first" + 0.002*"may" + 0.002*"man" + 0.002*"like"
Topic #1: 0.006*"one" + 0.004*"would" + 0.003*"new" + 0.003*"said" + 0.002*"could" + 0.002*"time" + 0.002*"two" + 0.002*"must" + 0.002*"like" + 0.002*"states"
Topic #2: 0.007*"one" + 0.005*"would" + 0.004*"said" + 0.004*"new" + 0.004*"could" + 0.003*"two" + 0.003*"time" + 0.002*"may" + 0.002*"man" + 0.002*"made"
Topic #3: 0.006*"one" + 0.004*"time" + 0.003*"would" + 0.003*"said" + 0.003*"new" + 0.003*"could" + 0.003*"like" + 0.002*"first" + 0.002*"two" + 0.002*"may"
Topic #4: 0.006*"one" + 0.006*"would" + 0.004*"said" + 0.003*"time" + 0.003*"could" + 0.003*"may" + 0.003*"new" + 0.003*"like" + 0.003*"man" + 0.003*"first"
Topic #5: 0.006*"one" + 0.004*"would" + 0.004*"said" + 0.004*"two" + 0.003*"new" + 0.003*"could" + 0.003*"also" + 0.002*"time" + 0.002*"first" + 0.002*"man"
Topic #6: 0.008*"would" + 0.007*"one" + 0.004*"

In [55]:
# Project an unseen sentence into each model's topic space
text = "The economy is working better than ever"
bow = dictionary.doc2bow(clean_text(text))

# LSI representation, a divider, then the LDA representation — one per line
print(lsi_model[bow], '====================================', lda_model[bow], sep='\n')
 

[(0, 0.09161428054263644), (1, -0.00871731999597386), (2, 0.015810148195797076), (3, -0.04066099736678477), (4, -0.01665715646684495), (5, 0.009101659642573902), (6, -0.02779513112992049), (7, 0.01762557521170914), (8, -0.05778956752315291), (9, 0.023516308712881115)]
[(0, 0.020016195), (1, 0.020014942), (2, 0.020014906), (3, 0.020015206), (4, 0.020015078), (5, 0.02001569), (6, 0.020015618), (7, 0.020014612), (8, 0.8198629), (9, 0.020014845)]


In [56]:
# Index every corpus document by its LDA representation
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Score the query (`bow`) against every indexed document
query_scores = lda_index[lda_model[bow]]
# Rank as (document_id, score) pairs, highest score first
similarities_output = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Top most similar documents:
print(similarities_output[:10])

print('--------------------------------------------------------------')

# Show the beginning of the best match. The score is unpacked into its own
# name so the ranked list held in `similarities_output` is not overwritten
# by a single float.
document_id, best_score = similarities_output[0]
print(data[document_id][:1000])

[(52, 0.99833953), (142, 0.9979115), (56, 0.99762577), (466, 0.9976206), (255, 0.99758095), (164, 0.9975707), (340, 0.9974807), (216, 0.99734056), (51, 0.9973291), (157, 0.9973291)]
--------------------------------------------------------------
Mr. Podger always particularly enjoyed the last night of each summer at Loon Lake . The narrow fringe of sadness that ran around it only emphasized the pleasure . The evening was not always spent in the same way . This year , on a night cool with the front of September moving in , but with plenty of summer still about , the Podgers were holding a neighborhood gathering in the Pod . The little cottage was bursting with people of all ages . In the midst of it all , Mr. Podger came out on the Pod porch , alone . He had that day attended a country auction , and he had come back with a prize . The prize was an old-fashioned , woven cloth hammock , complete with cross-top pillow , fringed side pieces , and hooks for hanging . Mrs. Podger had obligingl

In [57]:
NUM_TOPICS = 10

# Bag-of-words counts for the scikit-learn models. Terms must appear in at
# least 5 documents (min_df) and in at most 90% of them (max_df).
# The token pattern is a raw string: `\-` is an invalid escape sequence in a
# normal string literal and triggers a warning; the regex itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)


  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [58]:
# NOTE: `lda_model` and `lsi_model` are rebound here; earlier cells used the
# same names for the gensim models.

# Latent Dirichlet Allocation
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # expected (NO_DOCUMENTS, NUM_TOPICS)

print('--------------------------------------')

# Non-Negative Matrix Factorization
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # expected (NO_DOCUMENTS, NUM_TOPICS)

print('--------------------------------------')

# Latent Semantic Indexing (truncated SVD of the count matrix)
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # expected (NO_DOCUMENTS, NUM_TOPICS)

# Topic-space representation of the first document under each model
for doc_topic_matrix in (lda_Z, nmf_Z, lsi_Z):
    print(doc_topic_matrix[0])

(500, 10)
--------------------------------------
(500, 10)
--------------------------------------
(500, 10)
[1.05611010e-04 7.87644856e-01 2.97657508e-02 1.05615506e-04
 1.05596678e-04 1.05596779e-04 1.81850163e-01 1.05600432e-04
 1.05596867e-04 1.05613133e-04]
[0.         0.         2.11731425 0.07701484 0.         0.54281141
 1.07211648 0.         0.         0.24516135]
[ 2.33068429e+01  1.59544150e+00  2.17925338e+01  1.17139929e-02
  8.59557339e-01  1.13716347e+01  4.07174629e+00 -2.11182487e+00
  1.69629637e+00 -1.41412893e+01]


In [59]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted features of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_``; each row is indexed by
        feature position and must support ``argsort()``.
    vectorizer : object
        Fitted vectorizer providing the vocabulary through either
        ``get_feature_names_out()`` or the older ``get_feature_names()``.
    top_n : int, default 10
        Number of features to show per topic.

    For each topic a ``"Topic <idx>:"`` line is printed, followed by a list
    of ``(feature_name, weight)`` tuples ordered from highest to lowest weight.
    """
    # Fetch the vocabulary once: the accessor rebuilds the full feature list
    # on every call, so calling it per printed word is wasted work. Prefer
    # the newer accessor when the vectorizer offers it, because the older
    # one is absent from recent scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending; the reversed slice walks the top_n largest
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed names plain even when the vocabulary
        # comes back as an array rather than a list
        print([(str(feature_names[i]), topic[i]) for i in top_indices])

# Print the top features of each scikit-learn model in turn; a dashed line
# separates one model's section from the next.
for position, (heading, fitted_model) in enumerate(
        (("LDA Model:", lda_model), ("NMF Model:", nmf_model), ("LSI Model:", lsi_model))):
    if position:
        print('--------------------------')
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)
 

LDA Model:
Topic 0:
[('used', 107.43705162406972), ('feed', 96.44013859555227), ('water', 81.00054148573169), ('inches', 61.75633286883845), ('surface', 56.05016991805031), ('use', 45.497287195906395), ('cattle', 45.48211047391978), ('temperature', 43.722285795604726), ('head', 41.15538116915068), ('site', 37.76295436232687)]
Topic 1:
[('state', 623.4884364615333), ('new', 552.6193801741464), ('states', 445.08232553818283), ('year', 419.61196443117393), ('united', 362.76308689494726), ('years', 338.5524427671788), ('government', 335.6546664508169), ('program', 293.76103603469534), ('general', 293.0822034847252), ('time', 276.17526904153317)]
Topic 2:
[('mrs', 236.00377709342897), ('college', 104.86010981084831), ('home', 95.503774326025), ('new', 95.38289061908718), ('club', 89.46740612138046), ('game', 69.041667807647), ('ball', 64.3857491564888), ('miss', 60.51486590300925), ('year', 59.49125427991514), ('week', 54.7818209867559)]
Topic 3:
[('world', 325.4733015683179), ('man', 287.5

[('united', 0.2810556665874585), ('states', 0.23486975830819795), ('shall', 0.19996624745390312), ('mrs', 0.19290674776587768), ('government', 0.17597771232806955), ('school', 0.15544382805790577), ('section', 0.12799350367440762), ('act', 0.11563508844763437), ('agreement', 0.11491363678684202), ('india', 0.09910773087064638)]
Topic 8:
[('form', 0.31513231898856475), ('dictionary', 0.2926494589078216), ('information', 0.29094492695247376), ('text', 0.22132434068266732), ('year', 0.19421066604569828), ('cell', 0.1863797112427536), ('forms', 0.18410924743751939), ('tax', 0.1631409038590926), ('fiscal', 0.1545963918609125), ('said', 0.14329214493395673)]
Topic 9:
[('year', 0.2600619152995876), ('fiscal', 0.2591326774839305), ('tax', 0.1964212892534661), ('school', 0.182884038683935), ('states', 0.12170035166098062), ('like', 0.11801943739228535), ('time', 0.11209435528058151), ('years', 0.09233381187622143), ('children', 0.08253052284938324), ('child', 0.08116260022817105)]


In [60]:
# Vectorize an unseen sentence, then project it into the NMF topic space
text = "The economy is working better than ever"
query_counts = vectorizer.transform([text])
x = nmf_model.transform(query_counts)[0]
print(x)

[0.00290063 0.         0.         0.         0.         0.0043872
 0.         0.         0.         0.00464711]


In [61]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` nearest to ``x`` by Euclidean distance.

    ``x`` is reshaped to a single row before the comparison. The result is a
    list of ``(row_index, distance)`` pairs sorted by increasing distance.
    """
    query_row = x.reshape(1, -1)
    distances = euclidean_distances(query_row, Z)[0]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    return ranked[:top_n]
 
# Stored as `nearest_documents` rather than `similarities`: the latter is the
# gensim module imported at the top of the notebook, and rebinding it to a
# list breaks any later use of `similarities.MatrixSimilarity`.
nearest_documents = most_similar(x, nmf_Z)
# most_similar() returns (row_index, distance) pairs, closest first
document_id, distance = nearest_documents[0]
print(data[document_id][:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

In [40]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Configure Bokeh for notebook output (presumably needed before the show()
# calls in the following cells can render inline — confirm)
output_notebook()
 

In [41]:
# Reduce every document to two SVD components so it can be drawn as a point
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2-D coordinates plus its position in `data`
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(data)),
})

column_data = ColumnDataSource.from_df(df)
source = ColumnDataSource(column_data)
# Text label (the document index) drawn slightly above each marker
labels = LabelSet(
    x="x", y="y", text="document", y_offset=8,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [42]:
# Same projection as for documents, applied to the transposed matrix so that
# each row (and therefore each plotted point) is a vocabulary term
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)

# One row per term: its 2-D coordinates plus the term itself
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': vectorizer.get_feature_names(),
})

column_data = ColumnDataSource.from_df(df)
source = ColumnDataSource(column_data)
# Text label (the term) drawn slightly above each marker
labels = LabelSet(
    x="x", y="y", text="word", y_offset=8,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)


In [44]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10

# Rebuild the count matrix. The token pattern is a raw string: `\-` is an
# invalid escape sequence in a normal string literal and triggers a warning;
# the regex itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

# Topic distribution of an unseen sentence; the printed sum shows that the
# topic weights add up to (approximately) 1
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[0.02500003 0.02500495 0.02500616 0.0250012  0.02500036 0.77496858
 0.02500113 0.02500946 0.02500368 0.02500444] 0.9999999999999999


In [45]:
import pyLDAvis.sklearn
 
# Enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook()
# Prepare the visualisation from the fitted scikit-learn LDA model, the count
# matrix it was trained on and the vectorizer that produced that matrix;
# `mds='tsne'` selects t-SNE for the 2-D topic layout.
# NOTE(review): newer pyLDAvis releases may expose this module under a
# different name — confirm against the installed version.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Bare last expression so the notebook displays the panel as the cell output
panel
 

- https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [69]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [66]:
# Load the 20 Newsgroups posts, shuffled with a fixed seed, with headers,
# footers and quoted replies removed from each post
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

In [68]:
no_features = 1000


def _feature_names(fitted_vectorizer):
    """Return the fitted vocabulary as a list, in feature-index order.

    Uses ``get_feature_names_out()`` when the vectorizer provides it and
    falls back to the older ``get_feature_names()`` otherwise, so the cell
    runs on scikit-learn releases that only ship one of the two accessors.
    """
    if hasattr(fitted_vectorizer, "get_feature_names_out"):
        return list(fitted_vectorizer.get_feature_names_out())
    return fitted_vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA needs raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names(tf_vectorizer)

In [71]:
no_topics = 20

# NMF on the tf-idf matrix.
# NOTE(review): the `alpha` argument may not be accepted by newer
# scikit-learn releases — confirm against the installed version.
nmf = NMF(
    n_components=no_topics,
    random_state=42,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf.fit(tfidf)

# LDA on the raw term-count matrix
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda.fit(tf)


In [73]:
# Vocabulary size of the tf-idf features, then of the raw-count features
for feature_names in (tfidf_feature_names, tf_feature_names):
    print(len(feature_names))

1000
1000


In [76]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of ``model`` as a line of its strongest feature names.

    For every row of ``model.components_`` a ``"Topic <idx>:"`` header is
    printed, followed by the ``no_top_words`` names from ``feature_names``
    with the largest weights in that row, space-separated and ordered from
    highest to lowest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending; the reversed slice picks the largest weights
        strongest = weights.argsort()[:-no_top_words - 1:-1]
        header = "Topic %d:" % (topic_idx)
        words = " ".join(feature_names[position] for position in strongest)
        print(header)
        print(words)

no_top_words = 10

# NMF topics (tf-idf vocabulary), a divider, then LDA topics (count vocabulary)
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=no_top_words)
print('---------------------------------------------')
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
people time good right did ve make say way said
Topic 1:
card video monitor cards drivers bus vga driver color memory
Topic 2:
god jesus bible christ faith believe christian christians church sin
Topic 3:
game team year games season players play hockey win league
Topic 4:
space nasa shuttle launch orbit station earth gov sci program
Topic 5:
thanks mail advance hi looking info information address help email
Topic 6:
windows file files dos window program problem using running version
Topic 7:
edu soon cs university ftp internet email article pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just thought ll oh tell fine got wanted mean little
Topic 11:
does know anybody mean work say doesn help exist info
Topic 12:
new 00 sale 10 price offer shipping condition 20 50
Topic 13:
like sounds looks look sound lot things bike thing really
Topic 14:
don know want let nee