In [186]:
from __future__ import print_function
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [187]:
# n_samples = 2000
# n_features = 1000
n_components = 20
n_top_words = 20

In [188]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://localhost:9200/'])
doc = {
        'size' : 301,
        'query': {
            'match_all' : {}
       }
   }
res = es.search(index='duc', doc_type='document', body=doc,scroll='1m')

In [189]:
docs = []
for item in res['hits']['hits']:
    docs.append(item['_source']['doc_text'])

In [190]:
topic_words_dict = {}

In [191]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [192]:
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 0.388s.


In [193]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.334s.



In [194]:
print("Fitting the NMF model with tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model with tf-idf features
done in 0.921s.


In [195]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: police gates officers said brutality commission department angeles los chief officer mr city report force mayor black abortion complaints incident
Topic #1: exxon oil spill valdez cleanup said tanker alaska ship sound guard coast reef prince miles developments million crude hazelwood vessel
Topic #2: hurricane hurricanes sheets storm mph storms atlantic winds said florida tropical gilbert gray season hugo center forecasters miami damage louisiana
Topic #3: jackson dickey beach police officer long car window nbc incident tape said hawthorne glass hannon attorney duty brutality investigation hill
Topic #4: diamond beers diamonds cso botswana market carats cartel rough sales mines south world dealers cent organization year africa production african
Topic #5: eclipse sun moon solar hawaii baja total mexico viewing eclipses shadow astronomers clouds watch partial said visible box earth 11
Topic #6: johnson lewis ben francis steroids said drug

In [196]:
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features
done in 6.607s.


In [197]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: said year years time people states new say used state national long just says house president federal government week make
Topic #1: exxon oil spill guard valdez environmental tex gallons gain happen 24 face disaster handling governor prince gov fishing hesaid headed
Topic #2: storm winds hurricane season recorded pressure tropical said people storms damage weather florida west june south destroyed early texas mph
Topic #3: medical really said say management problem monday dr person did disease hospital city physician months called according like researchers mobile
Topic #4: known sheep similar jakob disease year south revealed ministry scientists market organisation opening price production medical spongiform owned transmission soviet
Topic #5: sun eclipses dark event nature shadow watch electricity directly danger eclipse solar cut earth coast total time didn big moon
Topic #6: world olympics steroid rome run o

In [198]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features...
done in 1.942s.


In [199]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_probs = tf_vectorizer.max_features
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: said eclipse people slovenia thomas time sun hawaii cjd mexico dr mr country don look called years police disease moon
Topic #1: diabetes hispanics insulin disease hispanic exercise americans diabetics diet latinos said type stern diabetic obesity defronzo foods body latino antonio
Topic #2: diamond beers mr nafta diamonds trade says market year cent world slovenia south said countries cartel sales million clinton prices
Topic #3: tunnel french british link rail eurotunnel britain project london billion channel speed europe workers chunnel trains paris machines tunnels england
Topic #4: kelley mr says said thomas years year officers 000 time police don long group commission new eclipse run gandhi government
Topic #5: said police tuberculosis people year limits years state term tb infected smith oil states officials aids health cases new began
Topic #6: said gun thomas nra right disease people states census government bse arms amendment court senate milit

In [200]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 2.027s.


In [201]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: said eclipse people slovenia thomas time sun hawaii cjd mexico dr mr country don look called years police disease moon
Topic #1: diabetes hispanics insulin disease hispanic exercise americans diabetics diet latinos said type stern diabetic obesity defronzo foods body latino antonio
Topic #2: diamond beers mr nafta diamonds trade says market year cent world slovenia south said countries cartel sales million clinton prices
Topic #3: tunnel french british link rail eurotunnel britain project london billion channel speed europe workers chunnel trains paris machines tunnels england
Topic #4: kelley mr says said thomas years year officers 000 time police don long group commission new eclipse run gandhi government
Topic #5: said police tuberculosis people year limits years state term tb infected smith oil states officials aids health cases new began
Topic #6: said gun thomas nra right disease people states census government bse arms amendment court senate milit

In [202]:
topic_words_prob_dict = {}

In [203]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(20):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:20] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:20])

Topic  0 :  [0.000343046958355125, 0.0003254275529519686, 0.0002756204765663789, 0.00021754527289751937, 0.00020176833389127128, 0.0001998111456607119, 0.00019361192674071962, 0.00018929062853456893, 0.00018250627397268067, 0.00018209939240124064, 0.00018047652523178143, 0.00017999769031914217, 0.00017928945102783106, 0.00017795748008589255, 0.00017771191003832063, 0.00017627653191918608, 0.00017620247230822547, 0.000175381246216393, 0.00017346527776745054, 0.0001695101790274912]
Topic  1 :  [0.00836159629890997, 0.0015007988966807848, 0.0014950829975262888, 0.0013577947153469076, 0.0011287100714473045, 0.0010896431473966194, 0.001078518287404718, 0.001031721501528272, 0.0009306624458398775, 0.0009167341770571309, 0.0009139208337136654, 0.0008321894308103967, 0.0007651834748246153, 0.0007346065789631983, 0.0007157864136470873, 0.0006828943484439319, 0.0005986922034353947, 0.0005588433235855838, 0.0005494343105417956, 0.0005475969551040049]
Topic  2 :  [0.008866013177845605, 0.008677198

Topic  19 :  [0.0006408540111230835, 0.00033097140882778315, 0.0002736414863018299, 0.00024034606118580415, 0.00022151343807136743, 0.00021372145260415687, 0.00021008241551017157, 0.00020990480661902671, 0.00020355126814880563, 0.00020099053802078484, 0.0001986324406134467, 0.00019818021566487065, 0.00019742522412968455, 0.00019563907200897666, 0.00019432225656746302, 0.00019182538026660923, 0.00019127357522845148, 0.00018934307203361347, 0.00018753232519812962, 0.00018717352336281902]


In [204]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
    for word in text:
        if not d.check(word):
            if len(word) < 4:
                text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
    for word in temp_dist:
        if word_dist[word] > 10:
            word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [205]:
import operator
docs = []
for item in res['hits']['hits']:
    id_ = item['_source']['doc_id']
    doc = item['_source']['doc_text']
    gold_summary = item['_source']['gold_summary']
    kl_summary = item['_source']['kl_summary']
    doc_topics = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    sorted_dict = sorted(doc_topics.items(), key=operator.itemgetter(1))
    doc_topics = sorted_dict[::-1][:5]
    docs.append({
        'doc_id' : id_,
        'doc_text': doc,
        'gold_summary': gold_summary,
        'kl_summary' : kl_summary,
        'doc_topics' : doc_topics
    })

In [206]:
docs[1]

{'doc_id': 'LA120290-0163 ',
 'doc_text': 'Cheers erupted Saturday on both sides of the English Channel when British and French workers digging the Channel Tunnel finally met after knocking out a passage large enough to walk through and shake hands. </P><P>"Today, for the first time, men can cross the channel underground," French President Francois Mitterrand said. "What a brilliant sign of the vitality of our two countries." </P><P>The breakthrough came in a 6-foot-tall service tunnel that will be used to maintain two rail tunnels still being bored. It marked a symbolic milestone in Europe\'s biggest engineering project. </P><P>Using jackhammers, Graham Fagg, 42, of Dover, England, and Philippe Cozette, 37, of Calais, France, knocked out the last foot of chalk to link up the British and French sides of the tunnel -- which has been dubbed a "chunnel." </P><P>The smiling pair then clasped hands, embraced and exchanged their national flags. Workers in overalls looked on and applauded. </

In [208]:
docs[4]

{'doc_id': 'T923-5089',
 'doc_text': "RE are growing signs that Hurricane Andrew, unwelcome as it was for thedevastated inhabitants of Florida and Louisiana, may in the end do no harmto the re-election campaign of President George Bush.After a faltering and heavily criticised initial response to the disaster,both the president and his administration seem finally to be gettingassistance to those rendered homeless and to businesses and farms that havebeen destroyed.In the process, Mr Bush has been able to call on the power of incumbency,the one asset denied his presidential rival, Mr Bill Clinton, who is tovisit Florida today. This was brought home graphically by the president'sannouncement that Homestead Air Force base in Florida - a major localemployer virtually destroyed by Andrew  - would be rebuilt.His poignant and brief address to the nation on Tuesday night, committingthe government to pay the emergency relief costs and calling on Americans tocontribute to the American Red Cross, 

In [166]:
from elasticsearch import Elasticsearch
es = Elasticsearch('localhost')
es.indices.delete(index='duc_topic_modelling', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'words':{
            'properties':{
                'topic_id': {'type': 'text', 'index': 'false'},
                'top_words': {'type': 'text', 'analyzer': 'english'},
                'word_probs':{'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
client.indices.create(index="duc_topic_modelling", body=mappings_duc)

{'acknowledged': True,
 'index': 'duc_topic_modelling',
 'shards_acknowledged': True}

In [209]:
topics = []
for i in range(len(topic_words_dict)):
    topics.append({
        'topic_id' : str(i),
        'top_words': topic_words_dict[i],
        'word_probs': topic_words_prob_dict[i]
    })

In [210]:
for topic in topics:
    client.index(index='duc_topic_modelling', doc_type='words', body=topic)

In [211]:
es.indices.delete(index='duc', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'document':{
            'properties':{
                'doc_id': {'type': 'text', 'index': 'false'},
                'doc_text': {'type': 'text', 'analyzer': 'english'},
                'gold_summary':{'type': 'text', 'analyzer': 'english'},
                'kl_summary':{'type': 'text', 'analyzer': 'english'},
                'doc_topics' : {'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="duc", body=mappings_duc)

{'acknowledged': True, 'index': 'duc', 'shards_acknowledged': True}

In [212]:
for doc in docs:
    es.index(index='duc', doc_type='document', body=doc)