In [4]:
from nltk.corpus import brown
 
# Collapse each Brown corpus file into one whitespace-joined string, so that
# `data` holds exactly one string per document.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

500


In [12]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
# Fixed seed: the solvers below are randomized, so without it every run
# yields different topics and different document coordinates.
RANDOM_STATE = 42

# Bag-of-words counts. Terms appearing in fewer than 5 documents or in more
# than 90% of them are dropped, English stop words are removed, and a token is
# any run of three or more letters/hyphens. The pattern is a raw string so the
# `\-` reaches the regex engine without an invalid-escape warning.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# `n_components` is the supported keyword; the older `n_topics` spelling was
# deprecated and later removed from scikit-learn.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online',
                                      random_state=RANDOM_STATE)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)


# Let's see what the first document in the corpus looks like in each topic space
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])



(500, 10)
(500, 10)
(500, 10)
[  1.05602040e-04   1.05611148e-04   1.05613841e-04   1.05623638e-04
   1.05597169e-04   9.50659895e-01   1.05603435e-04   1.05618740e-04
   4.84952384e-02   1.05596741e-04]
[ 0.          0.          2.11789723  0.07698229  0.          0.5432858
  1.06968036  0.          0.          0.24611252]
[ 23.30684384   1.59476557  21.83191516  -0.07619026   0.83259805
  10.90495994   3.72561712  -2.06031853   1.95402218 -13.4204474 ]


In [19]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <idx>:`` header is printed,
    followed by a list of ``(term, weight)`` tuples ordered from the largest
    weight downwards.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``, iterated row by row; each
        row must support ``argsort()`` and integer indexing.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms to show per topic.
    """
    # Resolve the vocabulary once up front: the lookup rebuilds the whole
    # feature list, so calling it per printed term is needlessly expensive.
    # Prefer the current API and fall back to the legacy accessor.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last `top_n`
        # positions, i.e. the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
 
# Same report for each fitted model: a header, its topics, then a separator.
for title, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(title)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('dallas', 22.446841108219242), ('stock', 15.85003715849847), ('gin', 15.464746696232998), ('cotton', 11.159940897341217), ('shares', 10.268573770036749), ('morton', 9.5504064660194281), ('equipment', 9.2355646727871221), ('foods', 7.7063597768000918), ('sales', 7.4410145065858808), ('machinery', 7.2141071757856716)]
Topic 1:
[('life', 296.73985117175454), ('man', 288.50507871531761), ('world', 246.12794635508544), ('new', 214.51672789015274), ('god', 204.1642624435186), ('human', 158.77382288105389), ('great', 158.20718648829242), ('time', 150.13542557225725), ('work', 140.5331008795788), ('music', 133.76757326038776)]
Topic 2:
[('clay', 60.087619061658692), ('game', 58.052673118696433), ('ball', 44.408778629918743), ('marine', 42.359452907784977), ('baseball', 42.185548490183066), ('palmer', 39.015636006004414), ('seeds', 36.111953917426824), ('player', 35.464370475326895), ('league', 35.279886877871149), ('mold', 32.399720426645615)]
Topic 3:
[('church', 116.756

[('state', 0.43910609904843556), ('mrs', 0.25944250358686272), ('form', 0.2173987055832606), ('dictionary', 0.19083121168334471), ('information', 0.16642077303620226), ('text', 0.14264545835423334), ('forms', 0.12700144698355526), ('federal', 0.12658002754001038), ('cell', 0.12572750042141226), ('man', 0.10526883492583305)]
Topic 7:
[('united', 0.2858064458895298), ('states', 0.23536727842087798), ('shall', 0.20585632666150985), ('mrs', 0.18784318731507232), ('government', 0.17422357370555425), ('school', 0.15192624945366442), ('section', 0.13050134455323287), ('act', 0.11612660697691987), ('agreement', 0.11512085110803097), ('information', 0.10592045213085732)]
Topic 8:
[('form', 0.31589523838168915), ('dictionary', 0.2986806622750437), ('information', 0.29565453513715784), ('text', 0.22601874886552542), ('cell', 0.19027635444136454), ('forms', 0.18675876106442021), ('year', 0.17531042843841382), ('tax', 0.14521371690777404), ('said', 0.13783905821017398), ('list', 0.13428212902462092

In [18]:
# `text` is defined in this cell so that it does not rely on a later cell
# having been executed first (a top-to-bottom run would otherwise hit a
# NameError here).
text = "The working better than ever"
vectorizer.transform([text])

<1x10625 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [17]:
# Query sentence to place in the fitted NMF topic space.
text = "The working better than ever"
query_counts = vectorizer.transform([text])
# One input document was passed, so keep the single resulting row.
x = nmf_model.transform(query_counts)[0]
print(x)

[ 0.00310816  0.          0.          0.          0.          0.00499434
  0.          0.          0.          0.00172405]


In [9]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` nearest to ``x`` by Euclidean distance.

    ``x`` is reshaped to a single row before the distances to every row of
    ``Z`` are computed. The result is a list of ``(row_index, distance)``
    tuples ordered from the smallest distance upwards.
    """
    query_row = x.reshape(1, -1)
    distances = euclidean_distances(query_row, Z)[0]
    ranking = list(enumerate(distances))
    ranking.sort(key=lambda pair: pair[1])
    return ranking[:top_n]
 
# Rank the corpus documents against the query vector in NMF topic space.
similarities = most_similar(x, nmf_Z)

# The first entry is the closest document. The second tuple element is a
# Euclidean distance (smaller means closer), despite the variable's name.
document_id, similarity = similarities[0]

# Show only the first 1000 characters of the matched document.
closest_document = data[document_id]
print(closest_document[:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

**SVD**

In [21]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Configure Bokeh to render plots inline in the notebook; this has to run
# before the show() call in the plotting cell.
output_notebook()

In [22]:
# Reduce the document-term counts to two components so each document can be
# drawn as a point.
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its two coordinates plus its position in `data`,
# which doubles as the label drawn next to the marker.
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'] = documents_2d[:, 0]
df['y'] = documents_2d[:, 1]
df['document'] = range(len(data))

column_data = ColumnDataSource.from_df(df)
source = ColumnDataSource(column_data)

# Small grey document-index labels, centred and offset 8 units above each point.
labels = LabelSet(
    x="x",
    y="y",
    text="document",
    y_offset=8,
    text_font_size="8pt",
    text_color="#555555",
    source=source,
    text_align='center',
)

plot = figure(plot_width=600, plot_height=600)
plot.circle(
    "x",
    "y",
    size=12,
    source=source,
    line_color="black",
    fill_alpha=0.8,
)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [23]:
# Display the sparse document-term matrix summary (shape, dtype and number of
# stored entries) as the cell output.
data_vectorized

<500x10625 sparse matrix of type '<class 'numpy.int64'>'
	with 241921 stored elements in Compressed Sparse Row format>

In [30]:
# Fit `svd` on the full matrix again. At this point `svd` is still the
# 2-component model that fit_transform() already fitted when the scatter-plot
# coordinates were computed.
svd.fit(data_vectorized)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)

In [33]:
# Rebind `svd` to a new, unfitted 100-component model. The 2-component
# instance used for the scatter plot is no longer reachable through this name.
svd = TruncatedSVD(n_components=100)

In [34]:
# Fit the 100-component model on the document-term matrix so that its fitted
# attributes (singular_values_ is plotted next) are populated.
svd.fit(data_vectorized)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0)

In [None]:
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
# Plot the fitted model's singular values in stored order, one blue star per
# component. The explicit Axes lets the figure carry its own title and axis
# labels, and the trailing semicolon keeps the call's repr out of the output.
fig, ax = plt.subplots()
ax.plot(svd.singular_values_, '*b')
ax.set(title="TruncatedSVD singular values",
       xlabel="Component index",
       ylabel="Singular value");