In [7]:
from nltk.corpus import brown
 
# One entry per Brown corpus file: the file's words joined back into a single
# space-separated string, in the order returned by brown.fileids().
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

In [36]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

# Frozen copy of STOPWORDS used for membership tests: hashing is constant-time,
# whereas `in` on the list above scans it once per token.
_STOPWORD_SET = frozenset(STOPWORDS)

# Raw string so the `\-` escape is handed to the regex engine verbatim instead
# of being an invalid escape sequence in a normal string literal.
# Compiled once rather than on every token.
_TOKEN_RE = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize `text`, dropping stopwords and non-word tokens.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of tokens, in document order, that are not English stopwords
        and that start with at least three ASCII letters or hyphens.

    Note:
        The pattern is applied with ``match``, which anchors only at the start
        of the token, so a token is kept as long as its first three characters
        are letters/hyphens, regardless of what follows.
    """
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in _STOPWORD_SET and _TOKEN_RE.match(token)
    ]

# Seed for the stochastic LDA training, so the learned topics are the same on
# every Restart & Run All.
RANDOM_SEED = 42

# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = [clean_text(text) for text in data]

# Build a Dictionary - the association from each word to a numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical (bag-of-words) form.
# Each document becomes a list of (word_id, count) pairs, for example:
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
# Inspect a single document with e.g. corpus[20].
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Build the LDA model (seeded, see RANDOM_SEED above)
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS,
                            id2word=dictionary, random_state=RANDOM_SEED)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)



In [12]:
def print_model_topics(title, model, num_topics, num_words=10):
    """Print a heading, one line per topic, and a separator rule.

    Args:
        title: Heading printed before the topics.
        model: Any object exposing ``print_topic(topic_id, num_words)``.
        num_topics: Number of topics to print, starting from topic 0.
        num_words: How many of the most representative words to show per topic.
    """
    print(title)
    for idx in range(num_topics):
        # A single pre-formatted string prints identically whether `print` is
        # the Python 3 function or the Python 2 statement (where a
        # two-argument call would be rendered as a tuple).
        print("Topic #%s: %s" % (idx, model.print_topic(idx, num_words)))
    print("=" * 20)


print_model_topics("LDA Model:", lda_model, NUM_TOPICS)
print_model_topics("LSI Model:", lsi_model, NUM_TOPICS)

LDA Model:
('Topic #0:', u'0.006*"one" + 0.005*"would" + 0.004*"could" + 0.003*"new" + 0.003*"said" + 0.003*"man" + 0.003*"time" + 0.003*"like" + 0.003*"may" + 0.002*"back"')
('Topic #1:', u'0.006*"one" + 0.005*"would" + 0.003*"new" + 0.003*"said" + 0.003*"first" + 0.002*"man" + 0.002*"time" + 0.002*"may" + 0.002*"even" + 0.002*"two"')
('Topic #2:', u'0.006*"would" + 0.005*"one" + 0.003*"new" + 0.003*"said" + 0.003*"time" + 0.003*"two" + 0.002*"first" + 0.002*"made" + 0.002*"could" + 0.002*"state"')
('Topic #3:', u'0.009*"one" + 0.005*"would" + 0.005*"could" + 0.005*"said" + 0.003*"time" + 0.003*"two" + 0.003*"like" + 0.003*"first" + 0.002*"man" + 0.002*"made"')
('Topic #4:', u'0.006*"one" + 0.004*"would" + 0.004*"new" + 0.003*"first" + 0.003*"said" + 0.003*"two" + 0.003*"may" + 0.003*"man" + 0.003*"could" + 0.002*"time"')
('Topic #5:', u'0.005*"would" + 0.005*"one" + 0.005*"said" + 0.004*"new" + 0.003*"time" + 0.003*"even" + 0.003*"may" + 0.003*"like" + 0.003*"could" + 0.002*"first"')

In [39]:
# Run an unseen sentence through the same cleaning and dictionary as the
# training corpus, then ask the LDA model for its topic distribution.
text = "The economy is working better than ever"
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)
lda_model.get_document_topics(
    bow,
    minimum_probability=None,
    minimum_phi_value=None,
    per_word_topics=False,
)

[(0, 0.020004706),
 (1, 0.020004356),
 (2, 0.020005045),
 (3, 0.020005176),
 (4, 0.020004956),
 (5, 0.81995255),
 (6, 0.020005338),
 (7, 0.020005317),
 (8, 0.0200053),
 (9, 0.02000726)]

In [19]:
from gensim import similarities
 
# Index every corpus document by its LDA topic vector.
# num_features is given explicitly because the LDA output is sparse: inferring
# the width by scanning could come out narrower than the model's topic count
# if the highest-numbered topic never appears in any document.
lda_index = similarities.MatrixSimilarity(lda_model[corpus],
                                          num_features=lda_model.num_topics)

# Let's perform some queries.
# The result is deliberately NOT stored in a variable called `similarities`:
# that would overwrite the gensim module imported above.
query_similarities = lda_index[lda_model[bow]]

# Sort as (document_id, similarity) pairs, highest similarity first
ranked_similarities = sorted(enumerate(query_similarities), key=lambda item: -item[1])

# Top most similar documents
print(ranked_similarities[:10])

# Let's see what's the most similar document
document_id, similarity = ranked_similarities[0]
print(data[document_id][:1000])


[(112, 0.99762845), (43, 0.9976125), (152, 0.9974847), (16, 0.99733174), (66, 0.99733174), (138, 0.99733174), (227, 0.99733174), (283, 0.99733174), (313, 0.99733174), (328, 0.99733174)]
Scientists say that the world and everything in it are based on mathematics . Without math the men who are continually seeking the causes of and the reasons for the many things that make the world go 'round would not have any means of analyzing , standardizing , and communicating the things they discover and learn . Math and the formulas that allow it to be applied to different problems are , therefore , essential to any scientific endeavor . Hot rodding is a science . It's not a science as involved as determining what makes the earth rotate on its axis or building a rocket or putting a satellite into orbit but it is , nevertheless , a science . But because science is based on mathematics doesn't mean that a hot rodder must necessarily be a mathematician . A guy can be an active and successful hot rodde

In [22]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Configure Bokeh to render the plots shown below inline in the notebook.
output_notebook()

In [31]:
from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
# Seed for TruncatedSVD's randomized solver, so the projection (and therefore
# the plot) is identical on every run.
RANDOM_SEED = 42

# Raw string for token_pattern so `\-` reaches the regex engine verbatim
# rather than being an invalid escape sequence in a normal string literal.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Project the document-term count matrix down to two dimensions for plotting.
svd = decomposition.TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2-D coordinates plus its position in `data`,
# which is used as the point label.
df = pd.DataFrame(
    {
        'x': documents_2d[:, 0],
        'y': documents_2d[:, 1],
        'document': range(len(data)),
    },
    columns=['x', 'y', 'document'],
)

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [32]:
# Seed for TruncatedSVD's randomized solver, so the projection (and therefore
# the plot) is identical on every run.
RANDOM_SEED = 42

# `TruncatedSVD` is not imported by name anywhere in this notebook; only the
# `decomposition` module is, so it must be reached through that module.
svd = decomposition.TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
# Transposing makes each vocabulary word a row, so the words (rather than the
# documents) are projected down to two dimensions.
words_2d = svd.fit_transform(data_vectorized.T)

# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), only exists from 1.0 onwards. Support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary word: its 2-D coordinates plus the word itself,
# which is used as the point label.
df = pd.DataFrame(
    {
        'x': words_2d[:, 0],
        'y': words_2d[:, 1],
        'word': feature_names,
    },
    columns=['x', 'y', 'word'],
)

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)