In [1]:
from __future__ import print_function
from time import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [78]:
n_samples = 2000
n_features = 1000
n_components = 20
n_top_words = 20

In [79]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['http://localhost:9200/'])
doc = {
        'size' : 301,
        'query': {
            'match_all' : {}
       }
   }
res = es.search(index='duc', doc_type='document', body=doc,scroll='1m')

In [80]:
docs = []
for item in res['hits']['hits']:
    docs.append(item['_source']['doc_text'])

In [81]:
topic_words_dict = {}

In [82]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [83]:
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 0.448s.


In [84]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.348s.



In [85]:
print("Fitting the NMF model with tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model with tf-idf features
done in 0.878s.


In [86]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: police gates officers said brutality commission department angeles los chief officer mr city report force mayor black abortion complaints incident
Topic #1: exxon oil spill valdez cleanup said tanker alaska ship sound guard coast reef prince miles developments million crude hazelwood vessel
Topic #2: hurricane hurricanes sheets storm mph storms atlantic winds said florida tropical gilbert gray season hugo center forecasters miami damage louisiana
Topic #3: jackson dickey beach police officer long car window nbc incident tape said hawthorne glass hannon attorney duty brutality investigation hill
Topic #4: diamond beers diamonds cso botswana market carats cartel rough sales mines south world dealers cent organization year africa production african
Topic #5: eclipse sun moon solar hawaii baja total mexico viewing eclipses shadow astronomers clouds watch partial said visible box earth 11
Topic #6: johnson lewis ben francis steroids said drug

In [87]:
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features")
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features
done in 4.956s.


In [88]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)



Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: said year state years say people time new states government president 000 going 10 million used officials just make like
Topic #1: million oil march sea valdez gallons spill mid lost exxon water related spokesman nation officials southwest manager north prince major
Topic #2: storm winds atlantic storms mph south national weather killed hurricane mexico hurricanes texas west season tropical 33 hit africa center
Topic #3: said dr national medical officer months los like times late ago michael saw start began hospital old chief turn according
Topic #4: open mines government marketing year london feed production food market germany imposed cattle group months world indicates mr sheep controls
Topic #5: sun ofthe saturday mexico solar punch pressed time plastic coast watch event total years observers moon oversee path surface recorded
Topic #6: world race 100 athletes 17 1984 79 run hours 83 coach runs seconds gold c

In [89]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 2.233s.


In [90]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_probs = tf_vectorizer.max_features
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: said eclipse slovenia lewis people sun slovene yugoslavia mexico drought time johnson don yugoslav city hawaii called world moon says
Topic #1: diabetes said tb tuberculosis aids percent health hispanics people disease risk diet 000 latinos americans cases insulin cdc inmates exercise
Topic #2: said police mr year nafta diamond officers says world slovenia beers diamonds bank new president welfare cent officer years department
Topic #3: said police year cellrule officer tunnel tablecell morgan right service cvj billion people public link black chj says london lewis
Topic #4: said mr says year officers park police gandhi commission eclipse new forest long fires chief government time trees years city
Topic #5: said hurricane tb year jackson drought city health police smith state officials years limits storms states 10 season land atlantic
Topic #6: said people thomas right mr path shining disease johnson state year gun court states new amendment time gover

In [91]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 2.291s.


In [92]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: said eclipse slovenia lewis people sun slovene yugoslavia mexico drought time johnson don yugoslav city hawaii called world moon says
Topic #1: diabetes said tb tuberculosis aids percent health hispanics people disease risk diet 000 latinos americans cases insulin cdc inmates exercise
Topic #2: said police mr year nafta diamond officers says world slovenia beers diamonds bank new president welfare cent officer years department
Topic #3: said police year cellrule officer tunnel tablecell morgan right service cvj billion people public link black chj says london lewis
Topic #4: said mr says year officers park police gandhi commission eclipse new forest long fires chief government time trees years city
Topic #5: said hurricane tb year jackson drought city health police smith state officials years limits storms states 10 season land atlantic
Topic #6: said people thomas right mr path shining disease johnson state year gun court states new amendment time gover

In [93]:
topic_words_prob_dict = {}

In [100]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(20):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:20] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:20])

Topic  0 :  [0.0006475863478037532, 0.0005120545781998292, 0.0003207464066457018, 0.0003185977576264271, 0.00026361234914940067, 0.00025030463927541046, 0.00024188866925490735, 0.00023908505958986531, 0.0002378833806302873, 0.0002271766298699516, 0.00022332439645910386, 0.0002167710517195366, 0.00021266269098806689, 0.00020521498352202964, 0.00020470545868026352, 0.00020304406622150779, 0.00020277878839400045, 0.00020249210994427108, 0.0002024590273189938, 0.00020228346563186888]
Topic  1 :  [0.00950405852104145, 0.005261590003500753, 0.005001010831047137, 0.004792511775928363, 0.003454352514969714, 0.002721761340314462, 0.0025738005536996866, 0.002234559802546817, 0.0018612062014215906, 0.001841464728568444, 0.0015130308432804572, 0.0014979846861810219, 0.0014853223355838644, 0.0014732841796249356, 0.0014539607014466731, 0.0014159939715439772, 0.0013864974912929603, 0.001340711348611618, 0.0012684713757583773, 0.0012280132433494324]
Topic  2 :  [0.012303508754028728, 0.011031471555396

Topic  18 :  [0.009671488244055929, 0.005775589952147708, 0.005395902627324988, 0.0033867223237599086, 0.003107356867412992, 0.0028239644494802247, 0.0027457964071937943, 0.0025972700299744895, 0.0023558922021905374, 0.0017849849045921083, 0.0017274617102486696, 0.0015051971379966664, 0.0015017127450514228, 0.0014709840749330654, 0.0014510890356371443, 0.0014459564347570892, 0.0014382875830974908, 0.001373599390223537, 0.0013528372424150268, 0.0013221795117407881]
Topic  19 :  [0.000613702489879855, 0.00028786734432122136, 0.00024116583160190835, 0.00022996161440118796, 0.0002211169420721375, 0.00020935604090389874, 0.0002068207590646675, 0.0002066807961548122, 0.00020490420732882048, 0.0002025008697428067, 0.00020078524448204958, 0.00020063746722681782, 0.0001969771964507766, 0.00019686514729066893, 0.0001936854869707564, 0.000193356267032927, 0.00019273717683856645, 0.00019062480623037667, 0.00019053587324393767, 0.0001879940060929289]


In [95]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
    for word in text:
        if not d.check(word):
            if len(word) < 4:
                text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
    for word in temp_dist:
        if word_dist[word] > 10:
            word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [102]:
import operator
docs = []
for item in res['hits']['hits']:
    id_ = item['_source']['doc_id']
    doc = item['_source']['doc_text']
    gold_summary = item['_source']['gold_summary']
    lda_summary = item['_source']['lda_summary']
    doc_topics = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    sorted_dict = sorted(doc_topics.items(), key=operator.itemgetter(1))
    doc_topics = sorted_dict[::-1][:5]
    docs.append({
        'doc_id' : id_,
        'doc_text': doc,
        'gold_summary': gold_summary,
        'lda_summary' : lda_summary,
        'doc_topics' : doc_topics
    })

In [105]:
docs[1]

{'doc_id': 'AP880409-0015 ',
 'doc_text': "A hurricane expert predicts a turbulent summer inthe Atlantic Ocean with more and fiercer storms swirling the seas,but says it's impossible to know if any of the storms will threatenpopulated areas.   William Gray, a professor of atmospheric science at ColoradoState University, said Friday he expects about six Atlantichurricanes this year, the average for the last 40 years, but aboveaverage for the decade.   The Atlantic has formed relatively few hurricanes in five of thelast six hurricane seasons. Those years brought just two to fivehurricanes each, except for the seven hurricanes spotted in 1985.   The hurricane season officially begins June 1, and its mostactive period usually begins Aug. 1.   Gray, who has used wind and air pressure patterns to make annualhurricane forecasts each year since 1984, will issue his firstformal 1988 forecast in late May. He issued an early ``outlook''Friday for the 10th annual National Hurricane Conference.   `

In [107]:
docs[3]

{'doc_id': 'WSJ870123-0101 ',
 'doc_text': 'In his State of the Union address last January, President Reagan announced that welfare reform would be a priority of his administration in its final three years. He instructed his charges to draw up plans for "immediate action" that would enable poor families to achieve "real and lasting emancipation" from welfare dependency. "The success of welfare," he said, "should be judged by how many of its recipients become independent of welfare."    The president\'s convincing message raised hopes that something would finally be done to break the cycle of poverty and dependency that afflicts several million welfare families. Americans have become increasingly exasperated by the enormous amount of money the federal government devotes each year to anti-poverty programs, with little apparent abatement in poverty. The president correctly sensed that the public wants reform.    If the president lets up on welfare reform, he will disappoint those whose ho

In [108]:
from elasticsearch import Elasticsearch
es = Elasticsearch('localhost')
es.indices.delete(index='duc_topic_modelling', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'words':{
            'properties':{
                'topic_id': {'type': 'text', 'index': 'false'},
                'top_words': {'type': 'text', 'analyzer': 'english'},
                'word_probs':{'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
client.indices.create(index="duc_topic_modelling", body=mappings_duc)

{'acknowledged': True,
 'index': 'duc_topic_modelling',
 'shards_acknowledged': True}

In [109]:
topics = []
for i in range(len(topic_words_dict)):
    topics.append({
        'topic_id' : str(i),
        'top_words': topic_words_dict[i],
        'word_probs': topic_words_prob_dict[i]
    })

In [110]:
for topic in topics:
    client.index(index='duc_topic_modelling', doc_type='words', body=topic)

In [111]:
es.indices.delete(index='duc', ignore=[400, 404])
mappings_duc = {
    'mappings':{
        'document':{
            'properties':{
                'doc_id': {'type': 'text', 'index': 'false'},
                'doc_text': {'type': 'text', 'analyzer': 'english'},
                'gold_summary':{'type': 'text', 'analyzer': 'english'},
                'kl_summary':{'type': 'text', 'analyzer': 'english'},
                'doc_topics' : {'type': 'text', 'analyzer': 'english'}
            }
        }
    }
}
es.indices.create(index="duc", body=mappings_duc)

{'acknowledged': True, 'index': 'duc', 'shards_acknowledged': True}

In [112]:
for doc in docs:
    es.index(index='duc', doc_type='document', body=doc)