In [1]:
from __future__ import print_function
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Topic-model configuration shared by the NMF and LDA cells.
# Leftover, currently unused settings (kept commented out):
# n_samples = 2000
# n_features = 1000
n_components = 10  # number of topics each model is fitted with
n_top_words = 20   # number of highest-weighted words reported per topic

In [3]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node on localhost:9200.
es = Elasticsearch(['http://localhost:9200/'])
# Request body: match every document, return at most 301 hits in one page.
# NOTE(review): 301 looks like the expected corpus size; documents beyond
# that count would be silently left out - confirm against the index.
doc = {
        'size' : 301,
        'query': {
            'match_all' : {}
       }
   }
# NOTE(review): scroll='1m' opens a scroll context, but no visible cell reads
# the scroll id or fetches further pages - confirm whether it is needed.
res = es.search(index='duc', doc_type='document', body=doc,scroll='1m')

In [4]:
# Raw text of every returned hit, in result order; this is the corpus the
# vectorizers are fitted on.
docs = [hit['_source']['doc_text'] for hit in res['hits']['hits']]

In [5]:
# Topic index -> list of that topic's top words. Filled in (and overwritten on
# every call) by print_top_words.
topic_words_dict = {}

In [6]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic and record them.

    Args:
        model: Fitted topic model exposing ``components_``, an iterable with
            one weight vector per topic; each vector must support
            ``argsort()``.
        feature_names: Sequence mapping a column index of ``components_`` to
            its vocabulary term.
        n_top_words: Number of words to report per topic.

    Side effects:
        Prints one ``Topic #<idx>: <words>`` line per topic followed by a
        blank line, and stores each topic's word list in the module-level
        ``topic_words_dict`` under the topic index, replacing any earlier
        entry for that index.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk the last n_top_words indices
        # backwards to get the heaviest terms first.
        top_words = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        # Store the terms themselves rather than re-splitting the printed
        # message: splitting on whitespace would break multi-word features
        # (n-grams) apart and misalign them with per-word probabilities.
        topic_words_dict[topic_idx] = top_words
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()

In [7]:
# Build the tf-idf document-term matrix that the NMF cells factorise.
print("Extracting tf-idf features for NMF...")
# max_df / min_df prune terms by document frequency; English stop words are
# removed.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 0.353s.


In [8]:
# Build the raw term-count matrix that the LDA cell is fitted on, using the
# same pruning settings as the tf-idf vectorizer.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.313s.



In [9]:
# Fit NMF with its default loss on the tf-idf matrix; random_state is fixed.
# NOTE(review): newer scikit-learn releases replace the `alpha` argument with
# `alpha_W` / `alpha_H` - confirm against the installed version.
print("Fitting the NMF model with tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model with tf-idf features
done in 0.653s.


In [10]:
# Show the top words per NMF topic. print_top_words also overwrites
# topic_words_dict with these words as a side effect.
# NOTE(review): newer scikit-learn releases rename get_feature_names() to
# get_feature_names_out() - confirm against the installed version.
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: welfare reform governors benefits recipients work families programs clinton president plan poverty spending cent children training congress programmes democrats dependency
Topic #1: exxon oil spill valdez cleanup tanker said alaska ship sound guard coast reef prince developments miles million crude hazelwood vessel
Topic #2: hurricane hurricanes sheets storm mph storms atlantic winds florida tropical gilbert said gray hugo season center forecasters miami damage coast
Topic #3: taylor pneumonia miss hospital doctors actress said st hospitalized bacterial john sam condition chen elizabeth health addiction viral weeks publicist
Topic #4: diamond beers diamonds cso botswana market carats cartel rough sales mines south world dealers cent organization africa production year african
Topic #5: eclipse sun moon solar hawaii baja total mexico viewing eclipses shadow astronomers clouds watch partial visible box earth 11 said
Topic #6: johnson lewis

In [11]:
# Refit NMF on the same tf-idf matrix with the generalized Kullback-Leibler
# loss and the multiplicative-update solver. This rebinds `nmf`, so the model
# from the previous fit is no longer reachable under that name.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features
done in 3.124s.


In [12]:
# Show the top words per topic of the KL-divergence NMF model.
# print_top_words overwrites topic_words_dict again as a side effect.
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: year new state years people said time like just say states members president mr public government making department washington did
Topic #1: spokesman oil coast environmental valdez size guard spill fouled exxon related sea 24 tiny foot million spokeswoman expected accident sunday
Topic #2: south texas winds said storm center national weather average damage hurricane pressure storms 30 high year atlantic west miles hit
Topic #3: said hospital people health officer today cause 10 infection john flight noon old st treated crash 20 doctors near center
Topic #4: cattle year countries world cases causes central dollars market nation british years million likely months country new consolidated trade case
Topic #5: pacific people discrimination education supreme recorded time sun right view watch scientific rays mexico thousands years danforth howard peaked pressed
Topic #6: world won games women olympic marathon gold s

In [13]:
print("Fitting LDA models with tf features...")
# Online LDA is stochastic. Seed it (as the NMF cells already do) so that a
# Restart & Run All reproduces the same topics, and therefore the same
# per-document topic scores that are later written to Elasticsearch.
# learning_offset is left at the library default.
lda = LatentDirichletAllocation(n_components=n_components,
                                learning_method='online',
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features...
done in 2.661s.


In [14]:
# Show the top words per LDA topic. After this call topic_words_dict holds the
# LDA words, which is what the document-scoring cell pairs with the LDA
# probabilities.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
# NOTE(review): max_features is a vectorizer setting (not passed above), not a
# list of probabilities, and no visible cell reads tf_feature_probs - this
# assignment looks like a leftover.
tf_feature_probs = tf_vectorizer.max_features
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: cellrule said tablecell exxon bank world oil year million drought chj cvj valdez spill 87 30 exposure reserve billion new
Topic #1: earthquake earthquakes quake magnitude richter scale survey fault chile prediction measuring japan measured geological considerable significant area buildings seismic quakes
Topic #2: johnson said police pain lewis ben use protesters world rescue compliance angeles drugs operation demonstrators los francis abortion gates steroids
Topic #3: mr path shining said term people president war state party slovenia guzman nafta welfare government limits new congress committee central
Topic #4: said police people crash plane gun city force right air officers department state engine control nra states members federal chief
Topic #5: thomas court said box black rights mr civil abortion law supreme right action senate nomination clarence man years views eclipse
Topic #6: diabetes said hispanic beef disease hispanics british diabetics exe

In [15]:
# Topic index -> that topic's largest word probabilities in descending order;
# filled by the loop that normalises lda.components_.
topic_words_prob_dict = {}

In [17]:
# Turn every topic's word weights into a distribution that sums to 1.
# NOTE: this rescales the fitted model's components_ array in place.
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
# Take the topic count from the model and the list length from n_top_words
# (instead of literal 10 / 20) so the probabilities stay aligned, position by
# position, with the words print_top_words stored in topic_words_dict.
for i in range(lda.components_.shape[0]):
    top_probs = sorted(lda.components_[i], reverse=True)[:n_top_words]
    topic_words_prob_dict[i] = top_probs
    print("Topic ",i,": ", top_probs)

Topic  0 :  [0.01761922060995288, 0.015474508293340725, 0.013914765076423145, 0.012185872404435995, 0.011157381527114103, 0.010754785583599512, 0.010533267683335151, 0.007063857708885361, 0.006935000960160412, 0.006894463726483822, 0.006602707241231395, 0.006568830910630214, 0.006310118892230178, 0.006287907576096999, 0.005988541316979188, 0.005359424568953088, 0.00513579618091846, 0.004672788761448498, 0.004665339772060951, 0.004300038686292435]
Topic  1 :  [0.04904104860096852, 0.016624424164567532, 0.012041880949448545, 0.009821324424416512, 0.008073232096905861, 0.007989204932304095, 0.006125196069768168, 0.00557903907174001, 0.0055682998836173265, 0.005426937888197818, 0.005340723815686507, 0.005080638730979733, 0.004017592352717476, 0.003955385662883691, 0.003866514635808868, 0.0036616315598421334, 0.0033905050753380785, 0.003333274197094592, 0.002886457101164749, 0.0027678454548448854]
Topic  2 :  [0.02351993459753195, 0.020983314802152302, 0.011404503058177246, 0.00822736442034

In [23]:
import nltk
import enchant
# en_US spell-check dictionary; predict_doc_topic uses it to decide which
# short tokens to discard.
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict,
                      max_word_count=10):
    """Score a document against every topic using the topics' top words.

    The document is tokenised with ``nltk.tokenize.word_tokenize``. Tokens
    that the module-level spell-check dictionary ``d`` rejects and that are
    shorter than four characters are discarded, as are tokens that occur more
    than ``max_word_count`` times. A topic's score is the sum, over its top
    words present in the document, of ``occurrences * word probability``.

    Args:
        doc: Document text.
        topic_words_dict: Mapping of topic id to its list of top words.
        topic_words_prob_dict: Mapping of topic id to the probabilities of
            those words, in the same order as the word list.
        max_word_count: Tokens occurring more often than this are ignored.

    Returns:
        dict mapping each topic id in ``topic_words_dict`` to its score
        (``0`` when none of the topic's words occur in the document).

    Raises:
        KeyError: If a topic id is missing from ``topic_words_prob_dict``.
    """
    # NOTE(review): tokens are matched case-sensitively; if the topic words
    # come from a lower-casing vectorizer, capitalised tokens will not match -
    # confirm whether that is intended.
    # Build a new list rather than calling remove() while iterating: removing
    # during iteration shifts the remaining items, so the token following each
    # removed one was never examined.
    tokens = [token for token in nltk.tokenize.word_tokenize(doc)
              if d.check(token) or len(token) >= 4]
    word_dist = {word: count
                 for word, count in nltk.FreqDist(tokens).items()
                 if count <= max_word_count}
    doc_topics_prob = {}
    for topic, word_list in topic_words_dict.items():
        # zip() pairs each word with its probability and stops at the shorter
        # list, so a length mismatch cannot raise IndexError.
        doc_topics_prob[topic] = sum(
            word_dist[word] * prob
            for word, prob in zip(word_list, topic_words_prob_dict[topic])
            if word in word_dist)
    return doc_topics_prob

In [24]:
import operator
# Rebuild `docs` as one enriched record per search hit: the stored fields plus
# the five best-scoring (topic, score) pairs, highest score first.
# ('kl_summary' is intentionally not carried over.)
docs = []
for item in res['hits']['hits']:
    source = item['_source']
    doc_id = source['doc_id']
    text = source['doc_text']
    summary = source['gold_summary']
    topic_scores = predict_doc_topic(text, topic_words_dict,
                                     topic_words_prob_dict)
    ranked = sorted(topic_scores.items(), key=operator.itemgetter(1))
    docs.append({
        'doc_id' : doc_id,
        'doc_text': text,
        'gold_summary': summary,
        # Last five of the ascending ranking, read backwards.
        'doc_topics' : ranked[:-6:-1]
    })

In [28]:
# Display one enriched record to spot-check its fields.
docs[3]

{'doc_id': 'AP901013-0046 ',
 'doc_text': "Egypt honored its slain parliament speakerand four security men today with a state funeral led by agrim-looking President Hosni Mubarak.   The government said Iraqi agents or Egyptian Moslemfundamentalists were to blame for the assassination Friday of itssecond-highest official, Rifaat el-Mahgoub. He was the firstEgyptian politician assassinated since Islamic extremists shotPresident Anwar Sadat at a military parade nine years ago.   Four assassins riding two motorbikes killed el-Mahgoub in a cardriving by a luxury hotel by the Nile.   The death toll from the attack rose to six today with the deathof the speaker's chauffeur. Doctors in a Cairo hospital said thedriver suffered bullet wounds in the stomach, back and arm.   Hassan Abu-Basha, a former police minister, told the Caironewspaper Al-Ahram he believed el-Mahgoub's slaying was the work ofIraqi agents. He said the perpetrators possibly belonged to thePalestinian extremist faction led by A

In [26]:
# Display another enriched record to spot-check its fields.
docs[4]

{'doc_id': 'T941-1547',
 'doc_text': " German government yesterday announced the launch of a new researchproject to examine whether the cattle disease bovine spongiformencephalopathy (BSE) can be transmitted to human beings.The initiative comes as the country is pushing for a European Union ban onBritish beef imports, arguing that there is still no conclusive evidencethat the disease cannot affect humans.Seven German universities and research institutes will be sponsored by thecountry's research and technology ministry to examine possible connectionsbetween the origins of BSE and two other diseases, Creutzfeldt Jakob diseaseand Gerstmann Straussler syndrome, which very rarely affect humans.Several German scientists have expressed concern that BSE - popularly knownas 'mad cow disease' because of the way it debilitates the brains of cattle -may be transmissible to humans who eat contaminated beef or take medicinesmade with ingredients from contaminated animals.'The danger that BSE can be

In [19]:
from elasticsearch import Elasticsearch
es = Elasticsearch('localhost')
# Start from a clean index. 400/404 responses are ignored so the cell also
# succeeds when the index does not exist yet.
es.indices.delete(index='duc_topic_modelling', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'words':{
            'properties':{
                'topic_id': {'type': 'text', 'index': 'false'},
                'top_words': {'type': 'text', 'analyzer': 'english'},
                # The probabilities are numbers, so map them as a numeric
                # field instead of running them through the English text
                # analyzer. Arrays of values need no special mapping.
                'word_probs':{'type': 'double'}
            }
        }
    }
}
es.indices.create(index="duc_topic_modelling", body=mappings_duc)

{'acknowledged': True,
 'index': 'duc_topic_modelling',
 'shards_acknowledged': True}

In [20]:
# One record per topic, pairing its top words with their probabilities.
# Topic ids are assumed to be the consecutive integers 0..len-1.
topics = [
    {
        'topic_id' : str(topic_idx),
        'top_words': topic_words_dict[topic_idx],
        'word_probs': topic_words_prob_dict[topic_idx]
    }
    for topic_idx in range(len(topic_words_dict))
]

In [22]:
# Write every topic record to the 'duc_topic_modelling' index, one request
# per record.
for topic in topics:
    es.index(index='duc_topic_modelling', doc_type='words', body=topic)

In [29]:
# WARNING: this deletes the 'duc' index that the documents were originally
# read from. From this point the in-memory `docs` list is the only copy of
# the fetched documents until the indexing loop re-inserts them.
es.indices.delete(index='duc', ignore=[400, 404])
# Mapping for the re-created index: the original fields plus 'doc_topics'.
# This rebinds mappings_duc, which previously held the topic-index mapping.
# NOTE(review): 'doc_topics' receives lists of (topic id, score) pairs but is
# mapped as analysed text - confirm this is the intended field type.
mappings_duc = {
    'mappings':{
        'document':{
            'properties':{
                'doc_id': {'type': 'text', 'index': 'false'},
                'doc_text': {'type': 'text', 'analyzer': 'english'},
                'gold_summary':{'type': 'text', 'analyzer': 'english'},
#                 'kl_summary':{'type': 'text', 'analyzer': 'english'},
                'doc_topics' : {'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="duc", body=mappings_duc)

{'acknowledged': True, 'index': 'duc', 'shards_acknowledged': True}

In [30]:
# Write every enriched document record to the 'duc' index, one request per
# record.
for doc in docs:
    es.index(index='duc', doc_type='document', body=doc)