In [1]:
from elasticsearch import Elasticsearch
from os import listdir
from os.path import isfile, join
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk import sent_tokenize
from collections import Counter
from math import log
import numpy as np
import requests
import time
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    """Build a path rooted at the parent of the current working directory.

    The result is ``os.getcwd()``, then ``os.pardir``, then ``path`` and every
    extra component in ``paths``, joined in that order. The parent reference
    is kept literally in the returned string; nothing is normalised.
    """
    return os.path.join(os.getcwd(), os.pardir, path, *paths)

In [4]:
# make sure ES is up and running
# The timeout stops this cell from blocking forever when the node accepts the
# connection but never answers; without it requests waits indefinitely.
res = requests.get('http://localhost:9200', timeout=10)
print((res.content).decode('UTF-8'))  # .decode('UTF-8') is to decode a bytes object into a str

{
  "name" : "m9haKfx",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "HBzbQIPNQfqcLDGtTuR9oQ",
  "version" : {
    "number" : "6.2.3",
    "build_hash" : "c59ff00",
    "build_date" : "2018-03-13T10:06:29.741383Z",
    "build_snapshot" : false,
    "lucene_version" : "7.2.1",
    "minimum_wire_compatibility_version" : "5.6.0",
    "minimum_index_compatibility_version" : "5.0.0"
  },
  "tagline" : "You Know, for Search"
}



In [5]:
# Create the Elasticsearch client for the local node on port 9200.
es_hosts = [{'host': 'localhost', 'port': 9200}]
es = Elasticsearch(es_hosts)

In [6]:
# List the DUC2001 source files, skipping the two bookkeeping files. In this
# cell the list is only used for its length, which says how many documents to
# fetch from the 'duc2001' index (ids 0 .. n-1).
train_directory = abspath('datasets', 'DUC2001')
excluded_files = {'annotations.txt', 'notes.txt'}
train_file_locations = [
    # join() picks the right separator for the host OS; the previous
    # hard-coded '\\' only produced valid paths on Windows.
    join(train_directory, f)
    for f in listdir(train_directory)
    if isfile(join(train_directory, f)) and f not in excluded_files
]

# Pull the raw text of every DUC2001 document out of Elasticsearch.
duc_text = [
    es.get(index='duc2001', id=i, doc_type='summaries')['_source']['doc_content']
    for i in range(len(train_file_locations))
]
pprint(duc_text[0])

# The 20 Newsgroups corpus is loaded here only to learn how many documents
# there are; the text itself is read back from the '20ng' index by id.
ng_document_count = len(fetch_20newsgroups(subset='all').data)
ng_text = [
    es.get(index='20ng', id=i, doc_type='articles')['_source']['doc_content']
    for i in range(ng_document_count)
]
pprint(ng_text[0])

('Millions of gallons of crude oil that spilled when a tanker ran aground '
 "spread across a wildlife-rich stretch of ocean Saturday, and Alaska's chief "
 'environmental officer criticized cleanup efforts as too slow. The biggest '
 'oil spill in U.S. history created a slick about seven miles long and seven '
 'miles wide in Prince William Sound. The Coast Guard said only Reef Island '
 'and the western edge of Bligh Island had been touched by the slick. ``This '
 "situation, I think, was everyone's secret nightmare about what could happen "
 "with oil traffic in the sound,'' said Dennis Kelso, commissioner of the "
 'Alaska Department of Environmental Conservation. Some 240,000 barrels _ '
 "about 10,080,000 gallons _ of crude oil from Alaska's North Slope spilled "
 'early Friday when the 987-foot tanker Exxon Valdez ran hard aground on Bligh '
 'Reef, about 25 miles outside Valdez, where it had taken on a total cargo of '
 '1.2 million barrels. Initial reports indicated 270,000 ba

In [None]:
# Cleaning and preprocessing
def split_into_sentences(text):
    """Lower-case ``text``, collapse whitespace runs and split it into sentences.

    ``str.split()`` with no argument already splits on every whitespace
    character (newlines and tabs included), so joining its pieces with single
    spaces performs the whole normalisation in one step.

    Values that are not ``str`` are returned unchanged. This makes the cell
    safe to re-run: documents that were already turned into sentence lists by
    a previous run are left alone instead of failing on ``.lower()``.
    """
    if not isinstance(text, str):
        return text
    return sent_tokenize(' '.join(text.lower().split()))


# Same variable names as before, so the cells further down keep working.
duc_text = [split_into_sentences(document) for document in duc_text]
ng_text = [split_into_sentences(document) for document in ng_text]

In [None]:
def kl_divergence(a, b):
    """Kullback-Leibler divergence KL(P_a || P_b) between two bags of words.

    Parameters
    ----------
    a, b : iterable of collections.Counter
        Each iterable is merged into a single word-count table, which is then
        normalised to a probability distribution (counts divided by the total
        count). Working on probabilities rather than raw counts is what makes
        the divergence of two identically distributed texts zero regardless
        of their lengths.

    Returns
    -------
    float
        Sum over the words of ``a`` that also occur in ``b`` of
        ``p(w) * log(p(w) / q(w))``. Words missing from ``b`` are skipped, as
        they were before, rather than contributing an infinite term; the
        result is 0 when the two sides share no word.
    """
    p_counts = sum(a, Counter())
    q_counts = sum(b, Counter())
    p_total = sum(p_counts.values())
    q_total = sum(q_counts.values())

    divergence = 0.0
    for word, p_count in p_counts.items():
        q_count = q_counts.get(word, 0)
        if q_count > 0:
            # Both totals are positive here: each side holds at least this word.
            p = p_count / p_total
            q = q_count / q_total
            divergence += p * log(p / q)
    return divergence

In [None]:
def kl_summary(original_sentences, summary_length):
    """Greedily pick sentences whose combined word counts best match the document.

    On every round each not-yet-selected sentence is tentatively added to the
    current selection, the result is scored with ``kl_divergence(document,
    candidate)``, and the sentence with the lowest score is kept. Ties go to
    the earliest sentence.

    Parameters
    ----------
    original_sentences : sequence of str
        The document, one sentence per element. Words are obtained with
        ``str.split()``.
    summary_length : int
        Number of sentences wanted; capped at ``len(original_sentences)``.

    Returns
    -------
    str
        The selected sentences in document order, joined by single spaces.
        An empty string when there is nothing to select.
    """
    summary_length = min(summary_length, len(original_sentences))
    if summary_length <= 0:
        # Always hand back a str, including for empty input.
        return ''

    sentences = [Counter(sentence.split()) for sentence in original_sentences]

    # Aggregate the whole document once instead of once per candidate.
    document_counter = Counter()
    for sentence_counter in sentences:
        document_counter.update(sentence_counter)
    document = [document_counter]

    indices = []
    # Word counts of the sentences selected so far. Kept as a Counter of words:
    # re-deriving it from the joined summary string counted characters instead.
    summary_counter = Counter()

    for _ in range(summary_length):
        candidates = [j for j in range(len(sentences)) if j not in indices]
        # min() has no upper bound on the score, unlike a fixed sentinel that
        # would discard every candidate scoring above it. The cap on
        # summary_length guarantees at least one candidate remains.
        best = min(
            candidates,
            key=lambda j: kl_divergence(document, [summary_counter + sentences[j]]),
        )
        indices.append(best)
        summary_counter = summary_counter + sentences[best]

    indices.sort()
    return ' '.join(original_sentences[i] for i in indices)

In [2]:
# Summarise every DUC2001 document (5 sentences each) and report timings:
# one line per document (index, seconds, sentence count) plus a subtotal
# after every 100 documents and a final subtotal for the remainder.
duc_kl_summary = []
total_time = 0
for i, sentences in enumerate(duc_text):
    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    duc_kl_summary.append(kl_summary(sentences, 5))
    elapsed = time.perf_counter() - start_time
    total_time += elapsed
    print(i, round(elapsed, 4), len(sentences))
    # (i + 1) closes a batch after 100 documents; testing i itself fired on
    # the very first document and shifted every later batch by one.
    if (i + 1) % 100 == 0:
        print('-------')
        print(total_time)
        print('-------')
        total_time = 0
print('-------')
print(total_time)
print('-------')

NameError: name 'duc_text' is not defined

In [None]:
# Write each DUC2001 summary back to its document as the 'KL_summary' field.
for doc_id in range(len(duc_text)):
    partial_doc = {'KL_summary': duc_kl_summary[doc_id]}
    es.update(index='duc2001', id=doc_id, doc_type='summaries', body={'doc': partial_doc})

In [None]:
# Summarise every 20 Newsgroups document (5 sentences each), printing the
# document index as a progress marker.
ng_kl_summary = []
for doc_index, sentences in enumerate(ng_text):
    print(doc_index)
    ng_kl_summary.append(kl_summary(sentences, 5))

In [None]:
# Write each 20 Newsgroups summary back to its document as the 'KL_summary' field.
for doc_id in range(len(ng_text)):
    partial_doc = {'KL_summary': ng_kl_summary[doc_id]}
    es.update(index='20ng', id=doc_id, doc_type='articles', body={'doc': partial_doc})