In [1]:
from nltk.corpus import brown
 
# Flatten every Brown corpus file into a single space-joined string,
# producing one entry per file id.
data = [' '.join(brown.words(file_id)) for file_id in brown.fileids()]

# Report the corpus size as a quick sanity check.
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)

500


In [2]:
# Preview only the start of the first document: printing the whole text
# floods the cell output.  The 1000-character cut matches the preview used
# for the most similar document later in the notebook.
print(data[0][:1000])



In [3]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
# Number of latent topics requested from the topic models built below.
NUM_TOPICS = 10
# English stop words, stored as a frozenset so that the per-token
# `t not in STOPWORDS` filter is a hash lookup instead of a list scan.
STOPWORDS = frozenset(stopwords.words('english'))

In [4]:
def clean_text(text):
    """Lower-case and tokenize ``text``, keeping word-like, non-stopword tokens.

    A token is kept when it is not in ``STOPWORDS`` and *starts with* at
    least three characters drawn from ASCII letters and ``-``.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Raw string: '\-' inside a normal literal is an invalid escape sequence,
    # which Python warns about and may reject in future versions.
    # NOTE(review): re.match only anchors at the start of the token, so
    # tokens with trailing non-letters (e.g. "mrs.") still pass; this
    # behaviour is deliberately left unchanged.
    word_like = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    # Build the lookup set once per call so every membership test is O(1)
    # even when STOPWORDS is a plain list.
    stop_words = set(STOPWORDS)
    tokenized_text = word_tokenize(text.lower())
    return [t for t in tokenized_text if t not in stop_words and word_like.match(t)]

In [5]:
# gensim works on lists of tokens, so pass every raw document through
# clean_text (tokenization plus stopword filtering).
tokenized_data = list(map(clean_text, data))

In [6]:
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(tokenized_data)

# Encode each document as a sparse bag of words: [(word_id, count), ...]
corpus = list(map(dictionary.doc2bow, tokenized_data))

# Report the vocabulary size and the number of documents.
print(len(dictionary), len(tokenized_data))

44940 500


In [7]:
# Fixed seed for the LDA model; without it each run produces different
# topics and different document/topic weights.
LDA_RANDOM_STATE = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [8]:
print("LDA Model:")

# One line per LDA topic, listing the 5 terms that print_topic reports for it.
for topic_id in range(NUM_TOPICS):
    top_terms = lda_model.print_topic(topic_id, 5)
    print("Topic #%s:" % topic_id, top_terms)

print("=" * 20)

LDA Model:
Topic #0: 0.005*"said" + 0.005*"one" + 0.004*"would" + 0.004*"could" + 0.003*"two"
Topic #1: 0.006*"would" + 0.006*"one" + 0.004*"said" + 0.004*"could" + 0.003*"man"
Topic #2: 0.006*"would" + 0.005*"one" + 0.004*"said" + 0.004*"time" + 0.003*"could"
Topic #3: 0.006*"one" + 0.004*"new" + 0.003*"would" + 0.003*"said" + 0.002*"could"
Topic #4: 0.006*"one" + 0.005*"would" + 0.003*"new" + 0.003*"time" + 0.003*"said"
Topic #5: 0.006*"one" + 0.005*"would" + 0.004*"new" + 0.003*"two" + 0.003*"could"
Topic #6: 0.006*"would" + 0.005*"one" + 0.004*"may" + 0.004*"could" + 0.003*"said"
Topic #7: 0.007*"would" + 0.006*"one" + 0.003*"like" + 0.003*"could" + 0.003*"said"
Topic #8: 0.006*"one" + 0.005*"would" + 0.004*"man" + 0.004*"could" + 0.003*"said"
Topic #9: 0.008*"one" + 0.004*"said" + 0.004*"would" + 0.003*"new" + 0.003*"also"


In [9]:
# One line per LSI topic, listing the 5 terms that print_topic reports for it.
for topic_id in range(NUM_TOPICS):
    top_terms = lsi_model.print_topic(topic_id, 5)
    print("Topic #%s:" % topic_id, top_terms)

Topic #0: 0.308*"one" + 0.280*"would" + 0.202*"said" + 0.175*"could" + 0.146*"time"
Topic #1: -0.294*"said" + 0.219*"may" + 0.179*"state" + -0.176*"could" + -0.153*"would"
Topic #2: 0.340*"said" + 0.338*"state" + -0.228*"one" + 0.190*"states" + 0.161*"year"
Topic #3: -0.264*"new" + -0.256*"mrs." + 0.155*"feed" + 0.151*"per" + -0.149*"world"
Topic #4: 0.510*"mrs." + -0.235*"would" + -0.191*"states" + -0.153*"united" + -0.130*"could"
Topic #5: -0.377*"feed" + 0.370*"would" + -0.273*"per" + -0.241*"state" + -0.128*"daily"
Topic #6: -0.271*"feed" + 0.258*"mrs." + -0.222*"per" + -0.177*"school" + -0.172*"would"
Topic #7: 0.393*"mrs." + 0.291*"would" + -0.249*"state" + -0.226*"said" + 0.225*"feed"
Topic #8: -0.382*"state" + -0.270*"mrs." + -0.264*"would" + 0.177*"new" + 0.165*"business"
Topic #9: -0.202*"may" + 0.190*"new" + -0.189*"mrs." + -0.184*"shall" + -0.175*"said"


In [10]:
# Project an unseen sentence into both topic spaces.
text = "The economy is working better than ever"
query_tokens = clean_text(text)
bow = dictionary.doc2bow(query_tokens)

# LSI: one signed weight per topic, as (topic_id, weight) pairs.
lsi_vector = lsi_model[bow]
print(lsi_vector)

print('#'*20)

# LDA: (topic_id, weight) pairs; the exact values, and which topic
# dominates, differ between training runs.
lda_vector = lda_model[bow]
print(lda_vector)

[(0, 0.09161086451903701), (1, -0.00870685428943889), (2, -0.015628236372563335), (3, -0.04142604253218479), (4, 0.014671632822258154), (5, -0.011914324110200266), (6, -0.030632569110583374), (7, -0.019370000836451674), (8, 0.055942333546748366), (9, 0.02260175003945495)]
####################
[(0, 0.020006154), (1, 0.020006299), (2, 0.020005714), (3, 0.020005222), (4, 0.020005837), (5, 0.020006591), (6, 0.020006604), (7, 0.81994134), (8, 0.02000961), (9, 0.020006612)]


In [11]:
from gensim import similarities
 
# Index every corpus document by its LDA topic vector.
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Score the query against all indexed documents.  The result gets its own
# name: rebinding `similarities` would shadow the gensim module imported
# at the top of this cell.
query_scores = lda_index[lda_model[bow]]
# Rank as (document_id, score) pairs, highest score first.
ranked_matches = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Top most similar documents:
print(ranked_matches[:10])

# Show the beginning of the best-matching document.
document_id, best_score = ranked_matches[0]
print(data[document_id][:1000])

  if np.issubdtype(vec.dtype, np.int):


[(443, 0.9979272), (403, 0.99777824), (169, 0.9976353), (482, 0.9976283), (437, 0.99761105), (455, 0.99755853), (354, 0.9975563), (270, 0.99753875), (20, 0.99738806), (123, 0.997332)]


In [12]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Send Bokeh output to the notebook.
output_notebook()

# Term-count matrix over the raw documents.  min_df / max_df drop terms
# that are very rare or present in almost every document.
# token_pattern is a raw string: '\-' in a normal literal is an invalid
# escape sequence that Python warns about.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)
data_vectorized = vectorizer.fit_transform(data)

In [13]:
# Project every vocabulary term onto 2 SVD components (after the transpose
# the terms are the rows).  Seeded so the layout is identical on every run.
svd = TruncatedSVD(n_components=2, random_state=0)
words_2d = svd.fit_transform(data_vectorized.T)

# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': list(feature_names()),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
# text_color was "#66666" (5 hex digits), which is not a valid hex colour;
# the 6-digit grey "#666666" is used instead.
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#666666",
                  source=source, text_align='center')
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [14]:
# scikit-learn renamed `n_topics` to `n_components`; current releases
# reject the old keyword.  random_state makes the fit repeatable.
# NOTE: this rebinds `lda_model`, which held the gensim LDA model in the
# earlier cells, so those cells cannot be re-run after this one.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=0,
)
lda_Z = lda_model.fit_transform(data_vectorized)

# Topic distribution of an unseen sentence; the printed sum shows that the
# weights are normalised.
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())



[0.02501512 0.02500784 0.02500059 0.02500952 0.77495335 0.02500007
 0.02500768 0.02500002 0.02500003 0.02500578] 1.0


In [16]:
import pyLDAvis.sklearn

In [17]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Build the interactive panel from the fitted model, the document-term
# matrix and the vectorizer; mds='tsne' selects t-SNE for the topic layout.
panel = pyLDAvis.sklearn.prepare(
    lda_model,
    data_vectorized,
    vectorizer,
    mds='tsne',
)
# The bare expression makes the notebook render the panel.
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
