In [29]:
from __future__ import print_function
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [30]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [31]:
topic_words_dict = {}

In [32]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [33]:
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 2.215s.


In [34]:
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 0.602s.


In [35]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.492s.



In [8]:
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.363s.


In [9]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp mail cc pub article information hope program mac email home contact blood
Topic #6: file problem files format win sound ftp pub read save site help image available create copy running memory self version
Topic #7: game team games year win play season playe

In [10]:
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...
done in 2.270s.


In [11]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: people just like time don say really know way things make think right said did want ve probably work years
Topic #1: windows thanks using help need hi work know use looking mail software does used pc video available running info advance
Topic #2: god does true read know say believe subject says religion mean question point jesus people book christian mind understand matter
Topic #3: thanks know like interested mail just want new send edu list does bike thing email reply post wondering hear heard
Topic #4: time new 10 year sale old offer 20 16 15 great 30 weeks good test model condition 11 14 power
Topic #5: use number com government new university data states information talk phone right including security provide control following long used research
Topic #6: edu try file soon remember problem com program hope mike space article wrong library short include win little couldn sun
Topic #7: year world team game pla

In [36]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 5.020s.


In [13]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_probs = tf_vectorizer.max_features
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil insurance better tires 000 thing speed model brake driving performance
Topic #8: people said

In [14]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 6.977s.


In [24]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil insurance better tires 000 thing speed model brake driving performance
Topic #8: people said

In [29]:
topic_words_prob_dict = {}

In [34]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(10):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:20] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:20])

Topic  0 :  [0.05201295519414298, 0.023301467982438023, 0.018757677200607382, 0.017853034313902293, 0.01595098191416809, 0.01421039583681441, 0.013452740553974264, 0.011952686203526814, 0.011686121720430662, 0.010583275527159389, 0.009855197508471327, 0.009576367273422747, 0.009509668275976298, 0.009114667086822948, 0.008586300586097065, 0.007287622462962518, 0.007127524628490003, 0.007015770896936109, 0.006898763186196211, 0.006855242806574529]
Topic  1 :  [0.022526318730000276, 0.019200694844516055, 0.016172949248556064, 0.014221316124639125, 0.013672203283395343, 0.012894625646772535, 0.012293395990878996, 0.010477578387284935, 0.010473732540938498, 0.010437073090386372, 0.010000710949059643, 0.009012636990721816, 0.008548999170659056, 0.008421857190456418, 0.007740498647604117, 0.007739467458600744, 0.007157696679956578, 0.007051879181552583, 0.006808575881926617, 0.006753129655481801]
Topic  2 :  [0.03246905623669139, 0.030531335701191153, 0.023140364283488854, 0.0193858904471216,

In [36]:
from elasticsearch import Elasticsearch
client = Elasticsearch('localhost')
mappings_duc = {
    'mappings':{
        'words':{
            'properties':{
                'topic_id': {'type': 'text', 'index': 'false'},
                'top_words': {'type': 'text', 'analyzer': 'english'},
                'word_probs':{'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
client.indices.create(index="newsgroup_topic_modelling", body=mappings_duc)

{'acknowledged': True,
 'index': 'newsgroup_topic_modelling',
 'shards_acknowledged': True}

In [42]:
topics = []
for i in range(len(topic_words_dict)):
    topics.append({
        'topic_id' : str(i),
        'top_words': topic_words_dict[i],
        'word_probs': topic_words_prob_dict[i]
    })

In [44]:
for topic in topics:
    client.index(index='newsgroup_topic_modelling', doc_type='words', body=topic)

In [12]:
nmf.inverse_transform

1000

In [185]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://localhost:9200/'])
doc = {
        'size': 10000,
        'query': {
            'match_all' : {}
       }
   }
res = es.search(index='newsgroup', doc_type='document', body=doc)
docs = []
for item in res['hits']['hits']:
    docs.append(item['_source']['doc_text'])

In [186]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(docs)

Extracting tf features for LDA...


In [187]:
transformed_docs = tf_vectorizer.inverse_transform(tf)

In [188]:
documents = []
for word_list in transformed_docs:
    text = ""
    for word in word_list:
        text += word + " "
    documents.append(text)

In [189]:
len(documents)

10000

In [190]:
documents[0]

'information material reference manual user ths couldn colours height width stored data mean like look format bitmap windows does exactly wrote tcd unix2 robertsa roberts andrew '

In [191]:
from gensim import corpora, models, similarities
from itertools import chain
from nltk.corpus import stopwords
# remove common words and tokenize
# stoplist = set('for a of the and to in is that i have this not you are was it not be with have'.split())
# stoplist = set(stopwords.words('english'))
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in documents]
texts = [[word for word in document.lower().split()]
         for document in documents]
# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

# Create Dictionary.
id2word = corpora.Dictionary(texts)
# Creates the Bag of Word corpus.
mm = [id2word.doc2bow(text) for text in texts]

In [192]:
# Trains the LDA models.
lda = models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=10)

In [110]:
lda_corpus = lda[mm]

In [172]:
len(lda_corpus)

301

In [77]:
lda_corpus[0]

[(6, 0.014916044), (17, 0.97508794)]

In [180]:
lda.print_topics(num_topics=10, num_words=60)

[(0,
  '0.003*"said" + 0.003*"year" + 0.002*"time" + 0.002*"people" + 0.002*"years" + 0.002*"000" + 0.002*"government" + 0.002*"state" + 0.002*"like" + 0.001*"just" + 0.001*"according" + 0.001*"say" + 0.001*"states" + 0.001*"20" + 0.001*"called" + 0.001*"did" + 0.001*"officials" + 0.001*"house" + 0.001*"new" + 0.001*"today" + 0.001*"times" + 0.001*"10" + 0.001*"way" + 0.001*"long" + 0.001*"50" + 0.001*"use" + 0.001*"home" + 0.001*"president" + 0.001*"going" + 0.001*"american" + 0.001*"number" + 0.001*"day" + 0.001*"world" + 0.001*"mr" + 0.001*"100" + 0.001*"total" + 0.001*"expected" + 0.001*"far" + 0.001*"million" + 0.001*"based" + 0.001*"taken" + 0.001*"week" + 0.001*"record" + 0.001*"away" + 0.001*"near" + 0.001*"general" + 0.001*"old" + 0.001*"nation" + 0.001*"members" + 0.001*"national" + 0.001*"high" + 0.001*"second" + 0.001*"miles" + 0.001*"public" + 0.001*"don" + 0.001*"right" + 0.001*"come" + 0.001*"30" + 0.001*"took" + 0.001*"department"'),
 (1,
  '0.003*"said" + 0.003*"new" +

In [181]:
lda.show_topics(num_words=20)

[(0,
  '0.003*"said" + 0.003*"year" + 0.002*"time" + 0.002*"people" + 0.002*"years" + 0.002*"000" + 0.002*"government" + 0.002*"state" + 0.002*"like" + 0.001*"just" + 0.001*"according" + 0.001*"say" + 0.001*"states" + 0.001*"20" + 0.001*"called" + 0.001*"did" + 0.001*"officials" + 0.001*"house" + 0.001*"new" + 0.001*"today"'),
 (1,
  '0.003*"said" + 0.003*"new" + 0.002*"people" + 0.002*"year" + 0.002*"government" + 0.002*"way" + 0.002*"time" + 0.002*"officials" + 0.002*"state" + 0.002*"000" + 0.002*"years" + 0.002*"second" + 0.002*"called" + 0.002*"going" + 0.001*"like" + 0.001*"10" + 0.001*"week" + 0.001*"just" + 0.001*"including" + 0.001*"say"'),
 (2,
  '0.005*"said" + 0.003*"year" + 0.002*"time" + 0.002*"new" + 0.002*"years" + 0.002*"people" + 0.002*"make" + 0.002*"national" + 0.002*"000" + 0.001*"ago" + 0.001*"just" + 0.001*"world" + 0.001*"according" + 0.001*"officials" + 0.001*"states" + 0.001*"like" + 0.001*"near" + 0.001*"government" + 0.001*"expected" + 0.001*"20"'),
 (3,
  '0