In [4]:
from __future__ import print_function
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [5]:
# n_samples = 2000
# n_features = 1000
n_components = 20
n_top_words = 20

In [7]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://localhost:9200/'])
doc = {
        'size' : 10000,
        'query': {
            'match_all' : {}
       }
   }
res = es.search(index='newsgroup', doc_type='document', body=doc,scroll='1m')

In [8]:
docs = []
for item in res['hits']['hits']:
    docs.append(item['_source']['doc_text'])

In [9]:
topic_words_dict = {}

In [10]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [11]:
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 4.503s.


In [12]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 3.985s.



In [13]:
print("Fitting the NMF model with tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model with tf-idf features
done in 11.082s.


In [14]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: don people just com like think know time good writes ve want right make article way does new really use
Topic #1: file files program ftp image format graphics gif images data software use available server color pc directory pub display package
Topic #2: god jesus bible faith believe christ christian christians sin heaven church belief religion man say life truth rutgers christianity lord
Topic #3: drive disk hard drives floppy ide boot bios hd controller cd problem meg slave seagate rom internal pin disks mb
Topic #4: chip clipper encryption government phone law enforcement wiretap privacy algorithm security escrow crypto secure nsa phones technology encrypted administration use
Topic #5: edu article writes cs uiuc cso cc news com andrew cmu au rutgers ohio cwru david 1993 state umd acs
Topic #6: windows dos os microsoft nt ms mouse run version running apps using driver problem pc use mode memory drivers font
Topic #7: game games team ho

In [15]:
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features
done in 74.009s.


In [16]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: just writes think like time way don know want ve really right people sure thing things make say good article
Topic #1: windows use thanks using need know software like version does work program want drive help used pc ve run dos
Topic #2: god rutgers jesus say believe bible christian christians 1993 religion people true christ word church faith man christianity truth does
Topic #3: year game writes team play edu games season time players night think hockey good teams win player series red years
Topic #4: government use key law people used security state using number public keys clipper secure order make private phone chip encryption
Topic #5: writes edu article com wrote david state robert cs news use uucp says michael cc netcom steve pitt usenet mean
Topic #6: war people jews israel state world policy rights years jewish killing killed israeli fact peace political states population government muslim
Topic #7: wr

In [18]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features...
done in 61.442s.


In [19]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_probs = tf_vectorizer.max_features
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: god jesus jews church christ armenian bible jewish armenians christian lord turkish muslim father muslims war faith jehovah history greek
Topic #1: printer laser print hp homeopaths printers page homeopathy bernadette paper lady deskjet toner ink canon cartridges lsd laserjet fyi promo
Topic #2: edu fbi writes article koresh batf compound gas pitt davidians cult bd tear purdue inside agents banks started tank audio
Topic #3: windows use file dos program edu software available data window using information ftp files mail server version graphics image com
Topic #4: sex homosexual men homosexuality gay com cramer writes sexual article optilink homosexuals clayton marriage michael number key male paul christians
Topic #5: edu writes article drive game like just good year know ca don team card think does new games space time
Topic #6: 25 10 00 12 11 15 20 16 17 14 1993 18 13 30 28 40 24 21 27 50
Topic #7: modem port serial battery card apple radar lc ports ad

In [21]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features...
done in 62.839s.


In [22]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: god jesus jews church christ armenian bible jewish armenians christian lord turkish muslim father muslims war faith jehovah history greek
Topic #1: printer laser print hp homeopaths printers page homeopathy bernadette paper lady deskjet toner ink canon cartridges lsd laserjet fyi promo
Topic #2: edu fbi writes article koresh batf compound gas pitt davidians cult bd tear purdue inside agents banks started tank audio
Topic #3: windows use file dos program edu software available data window using information ftp files mail server version graphics image com
Topic #4: sex homosexual men homosexuality gay com cramer writes sexual article optilink homosexuals clayton marriage michael number key male paul christians
Topic #5: edu writes article drive game like just good year know ca don team card think does new games space time
Topic #6: 25 10 00 12 11 15 20 16 17 14 1993 18 13 30 28 40 24 21 27 50
Topic #7: modem port serial battery card apple radar lc ports ad

In [23]:
topic_words_prob_dict = {}

In [24]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(20):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:20] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:20])

Topic  0 :  [0.022353863512304704, 0.01857223688734366, 0.013142050477154306, 0.010164428399121406, 0.009649175237921189, 0.009481590608859402, 0.009182733881668197, 0.008810619273694325, 0.008252406188337906, 0.008071670724199356, 0.007747367529330197, 0.006530142785831521, 0.00598616983843073, 0.005964128079852212, 0.0057266400698403885, 0.005665463767801227, 0.005485822640806479, 0.005339510626071642, 0.004961413131466832, 0.004599881602261312]
Topic  1 :  [0.03236852947575337, 0.021288242805157506, 0.01807804880086793, 0.012451241723073578, 0.012067933764167522, 0.008971211430372158, 0.008264992461792662, 0.008079728702301845, 0.007961202671014796, 0.006943065801416202, 0.006798851762284424, 0.005711288147324003, 0.005547253377406439, 0.005503620984488836, 0.005326777834433815, 0.004360142646090509, 0.0040528598678197724, 0.0036398809155446066, 0.0035332036871165593, 0.0034419750690575864]
Topic  2 :  [0.03839751234427181, 0.02602336441151551, 0.01921959019789326, 0.018947718982277

In [25]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
    for word in text:
        if not d.check(word):
            if len(word) < 4:
                text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
    for word in temp_dist:
        if word_dist[word] > 10:
            word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [28]:
import operator
docs = []
for item in res['hits']['hits']:
    id_ = item['_source']['doc_id']
    doc = item['_source']['doc_text']
    kl_summary = item['_source']['kl_summary']
    doc_topics = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    sorted_dict = sorted(doc_topics.items(), key=operator.itemgetter(1))
    doc_topics = sorted_dict[::-1][:5]
    docs.append({
        'doc_id' : id_,
        'doc_text': doc,
        'kl_summary' : kl_summary,
        'doc_topics' : doc_topics
    })

In [29]:
docs[1]

{'doc_id': '17283',
 'doc_text': "Peter Garfiel Freeman writes:>>them. (By the way, I do not applaud the killing of _any_ human being,>>including prisoners sentenced to death by our illustrious justice department)>>>>Peace.>>-marc>Boy, you really are a stupid person.  Our justice department does>not sentence people to death.  That's up to state courts.  Again,>get a brain.Peter, I think you are ridiculous here. Stupidity is not a measure of howwell someone knows our judicial system. I guess Marc meant that he is against death penalty. But no matter what he meant, your statement not justified.Regards, ",
 'doc_topics': [(10, 0.02670317006560966),
  (2, 0.01921959019789326),
  (5, 0.014685307597847946),
  (4, 0.00932246214555893),
  (14, 0.007307856551774964)],
 'kl_summary': "  Our justice department does>not sentence people to death. Peter, I think you are ridiculous here.  Stupidity is not a measure of howwell someone knows our judicial system.  But no matter what he meant, your state

In [30]:
docs[3]

{'doc_id': '17318',
 'doc_text': '>I am considering buying Borland\'s Paradox for Windows since I>would like to use a database with Windows (I don\'t have/use>one yet) for both work/home use.  I would like to advantage>of Borland\'s "$129.95 until April 30" offer if this package>is everything that Borland claims it to be.  So, I was>wondering ... has anybody used this and/or have any opinions?>>-- Tom BelmonteIf you are interested in a program which is very easy to use, I strongly suggest Approach 2.0.  It is extremely easy to use, make reports, etc.  Iown both it and Paradox, and I almost never use Paradox.  If you need to build up a complicated application, then Paradox is the way to go.  I haveheard horror stories about the Access programming being extremely cryptic.Since you seem like you will probably be doing fairly small stuff (work/home use and you have not used a database before), I recommend Approach.  I have found only one small thing which I would like it to do more easily:

In [32]:
from elasticsearch import Elasticsearch
es = Elasticsearch('localhost')
es.indices.delete(index='newsgroup_topic_modelling', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'words':{
            'properties':{
                'topic_id': {'type': 'text', 'index': 'false'},
                'top_words': {'type': 'text', 'analyzer': 'english'},
                'word_probs':{'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="newsgroup_topic_modelling", body=mappings_duc)

{'acknowledged': True,
 'index': 'newsgroup_topic_modelling',
 'shards_acknowledged': True}

In [33]:
topics = []
for i in range(len(topic_words_dict)):
    topics.append({
        'topic_id' : str(i),
        'top_words': topic_words_dict[i],
        'word_probs': topic_words_prob_dict[i]
    })

In [35]:
for topic in topics:
    es.index(index='newsgroup_topic_modelling', doc_type='words', body=topic)

In [38]:
es.indices.delete(index='newsgroup', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'document':{
            'properties':{
                'doc_id': {'type': 'text', 'index': 'false'},
                'doc_text': {'type': 'text', 'analyzer': 'english'},
                'kl_summary':{'type': 'text', 'analyzer': 'english'},
                'doc_topics' : {'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="newsgroup", body=mappings_duc)

{'acknowledged': True, 'index': 'newsgroup', 'shards_acknowledged': True}

In [39]:
for doc in docs:
    es.index(index='newsgroup', doc_type='document', body=doc)

In [42]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://localhost:9200/'])
doc = {
        'size' : 10301,
        'query': {
            'match_all' : {}
       }
   }
res = es.search(index='duc', doc_type='document', body=doc,scroll='1m')

In [68]:
docs = []
count = 0
for item in res['hits']['hits']:
    if 'gold_summary' in item['_source']:
        docs.append(item)

KeyError: '_source'

In [67]:
docs[0]

{'_id': 'P4TLp2IBFcz1SPPjAKSs', '_type': 'document'}

In [65]:
for doc in docs:
    doc.pop('_index')
    doc.pop('_score')
    doc.pop('_source')

In [66]:
docs[0]

{'_id': 'P4TLp2IBFcz1SPPjAKSs', '_type': 'document'}

In [63]:
es.indices.delete(index='duc', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'document':{
            'properties':{
                'doc_id': {'type': 'text', 'index': 'false'},
                'doc_text': {'type': 'text', 'analyzer': 'english'},
                'gold_summary':{'type': 'text', 'analyzer': 'english'},
                'doc_topics':{'type': 'text', 'analyzer': 'english'},
                'lda_summary':{'type': 'text', 'analyzer': 'english'},
                'kl_summary':{'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="duc", body=mappings_duc)

{'acknowledged': True, 'index': 'duc', 'shards_acknowledged': True}

In [64]:
for doc in docs:
    es.index(index='duc', doc_type='document', body=doc)

POST http://localhost:9200/duc/document [status:400 request:0.148s]


RequestError: TransportError(400, 'mapper_parsing_exception', 'Field [_index] is a metadata field and cannot be added inside a document. Use the index API request parameters.')