In [4]:
from elasticsearch import Elasticsearch
from os import listdir
from os.path import isfile, join
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import requests
import re
import os

In [5]:
# Build a path rooted at the parent of the current working directory
def abspath(path, *paths):
    """Join ``path`` and any further ``paths`` onto ``<cwd>/..``.

    The result starts with ``os.getcwd()`` followed by ``os.pardir``; the
    ``..`` component is kept verbatim (no normalisation is applied).

    Args:
        path: First component appended after the parent-directory marker.
        *paths: Optional extra components, appended in the order given.

    Returns:
        The joined path as a string.
    """
    joined = os.path.join(os.getcwd(), os.pardir, path)
    for component in paths:
        joined = os.path.join(joined, component)
    return joined

In [6]:
# make sure ES is up and running
# A timeout stops this cell from blocking forever when the host accepts the
# connection but never sends a response; a refused connection still raises
# requests.ConnectionError as before.
res = requests.get('http://localhost:9200', timeout=5)
print(res.content.decode('UTF-8'))  # .content is bytes; decode it into a str for display

ConnectionError: HTTPConnectionPool(host='localhost', port=9200): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000017B568CBD30>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it',))

In [None]:
# Client handle for the local Elasticsearch node; the indexing cells below use `es`
es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])

In [7]:
# Fetch DUC 2001 Dataset
# Files that sit next to the documents/summaries but must not be loaded as such
excluded_files = ['annotations.txt', 'notes.txt']

train_directory = abspath('datasets', 'DUC2001')
# join() instead of a hard-coded '\\' keeps the paths valid on non-Windows hosts
train_file_locations = [join(train_directory, f) for f in listdir(train_directory)
                        if isfile(join(train_directory, f)) and f not in excluded_files]

summary_directory = abspath('datasets', 'DUC2001', 'Summaries')
summary_file_locations = [join(summary_directory, f) for f in listdir(summary_directory)
                          if isfile(join(summary_directory, f)) and f not in excluded_files]

duc_data = []   # [{file_name: {'data': '', 'summary': ''}}, {..} ... , {..}]
duc_text = []   # raw document texts, in the same order as duc_data
for file in train_file_locations:
    file_name = os.path.basename(file)
    with open(file, 'r') as f:
        text_content = f.read()
    duc_text.append(text_content)
    duc_data.append({file_name: {'data': text_content, 'summary': ''}})

# Name -> payload view over duc_data so each summary is attached by direct lookup
# (the payload dicts are shared, so writing through this view updates duc_data).
duc_by_name = {name: payload for record in duc_data for name, payload in record.items()}

for file in summary_file_locations:
    # Strip the '.txt' suffix and upper-case the stem so it can be compared
    # with the document file names used as keys in duc_data.
    file_name = os.path.basename(file).split('.txt')[0].upper()
    if file_name in duc_by_name:
        # Only open summaries that actually belong to a loaded document
        with open(file, 'r') as f:
            duc_by_name[file_name]['summary'] = f.read()

In [19]:
## 20 News Groups Dataset

# Load the complete corpus (subset='all')
ng_all = fetch_20newsgroups(subset='all')

# Raw post texts
ng_data = ng_all.data

# ng_all.target_names: the set of label names.
# ng_all.target: one number per post, the index of its label in target_names.
ng_labels = [ng_all.target[idx] for idx in range(len(ng_data))]

# Both collections should report the same size
for size in (len(ng_data), len(ng_labels)):
    print(size)

18846
18846


In [163]:
# Index DUC 2001 Dataset
# One Elasticsearch document per DUC record; the record's position in duc_data is its id.
for doc_id, record in enumerate(duc_data):
    for payload in record.values():
        # Replace newlines, tabs and paragraph tags with spaces, then squeeze whitespace runs
        flattened = payload['data']
        for marker in ('\n', '\t', '<P>', '</P>'):
            flattened = flattened.replace(marker, ' ')
        flattened = ' '.join(flattened.split())
        # Keep what lies after the last <TEXT> and before the next </TEXT>
        after_open = flattened.split('<TEXT>')[-1]
        body_text = after_open.split('</TEXT>')[0].strip()
        es.index(
            index='duc2001',
            id=doc_id,
            doc_type='summaries',
            body={
                'doc_id': doc_id,
                'doc_label' : '-',
                'doc_topics': '',
                'doc_content': body_text,
                'LDA_summary': '',
                'KL_summary': '',
                'gold_summary': payload['summary']
            },
        )

In [7]:
# Index 20NG Dataset
# One Elasticsearch document per post; the post's position in ng_data is its id.
for doc_id, post_text in enumerate(ng_data):
    es.index(
        index='20ng',
        id=doc_id,
        doc_type='articles',
        body={
            'doc_id': doc_id,
            'doc_label' : str(ng_labels[doc_id]),
            'doc_topics': '',
            'doc_content': post_text,
            'LDA_summary': '',
            'KL_summary': '',
            'gold_summary': ''
        },
    )

In [21]:
# horizontal probabilities
def get_topics_hor(topic_word_dist, feature_names, no_top_words):
    """Return the top words of every topic, weighted within the topic (row-wise).

    For each row ``i`` of ``topic_word_dist`` the ``no_top_words`` largest
    entries are selected, and each is divided by the sum of that row.

    Args:
        topic_word_dist: 2-D numpy array indexed as ``[topic, word]``.
        feature_names: Sequence mapping a word (column) index to its term.
        no_top_words: Number of highest-weighted words to keep per topic.

    Returns:
        dict mapping topic index -> list of ``[word, weight / row_sum]`` pairs,
        ordered from highest to lowest weight (same layout as get_topics_ver).
    """
    word_prob = {}
    for i in range(topic_word_dist.shape[0]):
        topic = topic_word_dist[i]
        # Row total is constant for the whole topic, so compute it once
        topic_total = np.sum(topic)
        # argsort is ascending; the reversed slice walks the largest entries first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        for j in top_indices:
            # list.append takes exactly one argument, so each entry is a [word, prob] pair
            word_prob.setdefault(i, []).append([feature_names[j], topic[j] / topic_total])
    return word_prob

In [22]:
# vertical probabilities
def get_topics_ver(topic_word_dist, feature_names, no_top_words):
    """Return the top words of every topic, weighted across topics (column-wise).

    For each row ``i`` of ``topic_word_dist`` the ``no_top_words`` largest
    entries are selected, and each is divided by the sum of its column, i.e.
    the word's total weight over all topics.

    Args:
        topic_word_dist: 2-D numpy array indexed as ``[topic, word]``.
        feature_names: Sequence mapping a word (column) index to its term.
        no_top_words: Number of highest-weighted words to keep per topic.

    Returns:
        dict mapping topic index -> list of ``[word, weight / column_sum]``
        pairs, ordered from highest to lowest raw weight within the topic.
    """
    # Column totals do not depend on the topic or word being visited; compute once
    # instead of re-summing the whole matrix for every selected word.
    column_totals = np.sum(topic_word_dist, axis=0)
    word_prob = {}
    for i in range(topic_word_dist.shape[0]):
        topic = topic_word_dist[i]
        # argsort is ascending; the reversed slice walks the largest entries first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        for j in top_indices:
            word_prob.setdefault(i, []).append([feature_names[j], topic[j] / column_totals[j]])
    return word_prob

In [23]:
def topic_word_probabilities_LDA(data, n_components, K, random_state=None):
    """Fit TF-IDF + LDA on ``data`` and return the top ``K`` words per topic.

    Args:
        data: Iterable of raw text documents.
        n_components: Number of LDA topics.
        K: Number of top words to report per topic.
        random_state: Optional seed forwarded to LatentDirichletAllocation so
            a run can be reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns:
        The dict produced by ``get_topics_ver`` for the fitted ``components_``:
        topic index -> list of ``[word, weight]`` pairs.
    """
    tfidf_vect = TfidfVectorizer()
    tfidf_data = tfidf_vect.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back for older installations.
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        # .tolist() yields plain Python str, matching what get_feature_names() returned
        tfidf_feature_names = tfidf_vect.get_feature_names_out().tolist()
    else:
        tfidf_feature_names = tfidf_vect.get_feature_names()

    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(tfidf_data)
    topic_word_prob_LDA = get_topics_ver(lda.components_, tfidf_feature_names, K)
    return topic_word_prob_LDA

In [24]:
def topic_word_probabilities_NMF(data, n_components, K, random_state=None):
    """Fit TF-IDF + NMF on ``data`` and return the top ``K`` words per topic.

    Args:
        data: Iterable of raw text documents.
        n_components: Number of NMF components (topics).
        K: Number of top words to report per topic.
        random_state: Optional seed forwarded to NMF; ``None`` keeps the
            previous behaviour.

    Returns:
        The dict produced by ``get_topics_ver`` for the fitted ``components_``:
        topic index -> list of ``[word, weight]`` pairs.
    """
    tfidf_vect = TfidfVectorizer()
    tfidf_data = tfidf_vect.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back for older installations.
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        # .tolist() yields plain Python str, matching what get_feature_names() returned
        tfidf_feature_names = tfidf_vect.get_feature_names_out().tolist()
    else:
        tfidf_feature_names = tfidf_vect.get_feature_names()

    nmf = NMF(n_components=n_components, random_state=random_state)
    nmf.fit(tfidf_data)
    topic_word_prob_NMF = get_topics_ver(nmf.components_, tfidf_feature_names, K)
    return topic_word_prob_NMF

In [29]:
# LDA on the DUC 2001 texts: 10 topics, 20 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=duc_text, n_components=10, K=20)
pprint(topic_word_prob_LDA)



{0: [['the', 0.0018570420232274167],
     ['of', 0.003288641314358488],
     ['to', 0.0039496539689205775],
     ['oil', 0.02933457443570742],
     ['tb', 0.04202242842450929],
     ['in', 0.004088602261060036],
     ['was', 0.012833631543891957],
     ['jackson', 0.057682065874539556],
     ['that', 0.007406700237354416],
     ['and', 0.00424971890672715],
     ['also', 0.03318130154294373],
     ['fire', 0.020094819294447386],
     ['bonnie', 0.10453841755377404],
     ['chapter', 0.09972460394364024],
     ['shelved', 0.10563824308756195],
     ['eliminated', 0.100717270279717],
     ['flickering', 0.10280248017623754],
     ['ship', 0.06478339985378917],
     ['alarm', 0.09461786121263521],
     ['praise', 0.09880164655058207]],
 1: [['the', 0.0023548644013887277],
     ['to', 0.004610954543933583],
     ['in', 0.004873844442191849],
     ['of', 0.003715690509403417],
     ['and', 0.004846711209003752],
     ['said', 0.011081456804697644],
     ['bank', 0.02964028394936316],
     [

In [26]:
# NMF on the DUC 2001 texts: 10 components, 20 top words each
topic_word_prob_NMF = topic_word_probabilities_NMF(data=duc_text, n_components=10, K=20)
pprint(topic_word_prob_NMF)

{0: [['the', 0.30566090011289304],
     ['of', 0.31422563628855205],
     ['to', 0.2931643056924947],
     ['in', 0.27709963291870254],
     ['and', 0.2637740945477754],
     ['that', 0.3573716338686716],
     ['is', 0.3397956583774059],
     ['for', 0.2904991948085256],
     ['police', 0.9381009329568606],
     ['on', 0.3275355166099969],
     ['mr', 0.6401873941916635],
     ['he', 0.2612962272671023],
     ['nafta', 1.0],
     ['it', 0.24283783607477957],
     ['by', 0.28581975920342007],
     ['as', 0.32801546604506715],
     ['said', 0.14119715536293764],
     ['his', 0.3738698517230855],
     ['welfare', 1.0],
     ['with', 0.2779227960348184]],
 1: [['hurricane', 1.0],
     ['the', 0.1044160468689293],
     ['hurricanes', 1.0],
     ['of', 0.12129532847875833],
     ['and', 0.1408711306076389],
     ['in', 0.12667153314190985],
     ['sheets', 1.0],
     ['storm', 0.9808241792880301],
     ['storms', 0.9682948856120241],
     ['to', 0.08400382561468299],
     ['atlantic', 0.9913

In [28]:
# LDA on the 20 Newsgroups posts: 10 topics, 20 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=ng_data, n_components=10, K=20)
pprint(topic_word_prob_LDA)



KeyboardInterrupt: 

In [None]:
# NMF on the 20 Newsgroups posts: 10 components, 20 top words each
topic_word_prob_NMF = topic_word_probabilities_NMF(data=ng_data, n_components=10, K=20)
pprint(topic_word_prob_NMF)

In [None]:
# LDA on the 20 Newsgroups posts: 10 topics, 50 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=ng_data, n_components=10, K=50)
pprint(topic_word_prob_LDA)

In [None]:
# LDA on the DUC 2001 texts: 5 topics, 10 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=duc_text, n_components=5, K=10)
pprint(topic_word_prob_LDA)

In [None]:
# Show the raw text of the first DUC 2001 document loaded into duc_text
duc_text[0]

In [None]:
# horizontal probabilities
def get_docs_hor(topic_word_dist, no_top_words):
    """Return the largest entries of every row, normalised by the row sum.

    For each row ``i`` of ``topic_word_dist`` the ``no_top_words`` largest
    entries are selected, and each is divided by the sum of that row.

    Args:
        topic_word_dist: 2-D numpy array; one result list is produced per row.
        no_top_words: Number of highest-valued columns to keep per row.

    Returns:
        dict mapping row index -> list of ``[column_index, value / row_sum]``
        pairs, ordered from highest to lowest value (same layout as get_docs_ver).
    """
    topic_prob = {}
    for i in range(topic_word_dist.shape[0]):
        topic = topic_word_dist[i]
        # Row total is constant for the whole row, so compute it once
        row_total = np.sum(topic)
        # argsort is ascending; the reversed slice walks the largest entries first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        for j in top_indices:
            # list.append takes exactly one argument, so each entry is a [index, prob] pair
            topic_prob.setdefault(i, []).append([j, topic[j] / row_total])
    return topic_prob

In [None]:
# vertical probabilities
def get_docs_ver(topic_word_dist, no_top_words):
    """Return the largest entries of every row, normalised by the column sum.

    For each row ``i`` of ``topic_word_dist`` the ``no_top_words`` largest
    entries are selected, and each is divided by the sum of its column.
    In this notebook it is called with the document-topic matrix returned by
    ``lda.transform``, so rows are documents and columns are topics.

    Args:
        topic_word_dist: 2-D numpy array; one result list is produced per row.
        no_top_words: Number of highest-valued columns to keep per row.

    Returns:
        dict mapping row index -> list of ``[column_index, value / column_sum]``
        pairs, ordered from highest to lowest raw value within the row.
    """
    # Column totals are the same for every row; compute them once up front
    # instead of re-summing the whole matrix for every selected entry.
    column_totals = np.sum(topic_word_dist, axis=0)
    topic_prob = {}
    for i in range(topic_word_dist.shape[0]):
        topic = topic_word_dist[i]
        # argsort is ascending; the reversed slice walks the largest entries first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        for j in top_indices:
            topic_prob.setdefault(i, []).append([j, topic[j] / column_totals[j]])
    return topic_prob

In [None]:
def doc_topic_probabilities(data, n_components, K, random_state=None):
    """Fit TF-IDF + LDA on ``data`` and return the top ``K`` topics per document.

    Args:
        data: Iterable of raw text documents.
        n_components: Number of LDA topics.
        K: Number of top topics to report per document.
        random_state: Optional seed forwarded to LatentDirichletAllocation so
            a run can be reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns:
        The dict produced by ``get_docs_ver`` for the document-topic matrix:
        document index -> list of ``[topic_index, weight]`` pairs.
    """
    tfidf_vect = TfidfVectorizer()
    tfidf_data = tfidf_vect.fit_transform(data)
    # The feature names are not needed here; the former get_feature_names() call
    # was unused and raises AttributeError on scikit-learn >= 1.2, so it is gone.

    print('LDA')
    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(tfidf_data)
    # Per-document topic distribution for the same documents the model was fitted on
    doc_topic_distr = lda.transform(tfidf_data)
    doc_topic_prob = get_docs_ver(doc_topic_distr, K)

    return doc_topic_prob

In [None]:
# Document-topic weights for DUC 2001: 5 topics, up to 10 kept per document
doc_topic_prob_duc = doc_topic_probabilities(data=duc_text, n_components=5, K=10)
pprint(doc_topic_prob_duc)

In [None]:
# Document-topic weights for 20 Newsgroups: 5 topics, up to 10 kept per document
doc_topic_prob_ng = doc_topic_probabilities(data=ng_data, n_components=5, K=10)
pprint(doc_topic_prob_ng)

In [None]:
# Write each DUC document's topic weights into its 'doc_topics' field as
# "<topic>: <weight rounded to 4 places>. " fragments.
for doc_id, topic_weights in doc_topic_prob_duc.items():
    doc_topics_text = ''.join(
        str(topic_idx) + ': ' + str(round(weight, 4)) + '. '
        for topic_idx, weight in topic_weights
    )
    es.update(
        index='duc2001',
        id=doc_id,
        doc_type='summaries',
        body={'doc': {'doc_topics': doc_topics_text}},
    )

In [None]:
# Write each 20NG post's topic weights into its 'doc_topics' field as
# "<topic>: <weight rounded to 4 places>. " fragments, echoing each string as it goes.
for doc_id, topic_weights in doc_topic_prob_ng.items():
    doc_topics_text = ''.join(
        str(topic_idx) + ': ' + str(round(weight, 4)) + '. '
        for topic_idx, weight in topic_weights
    )
    print(doc_topics_text)
    print('----')
    es.update(
        index='20ng',
        id=doc_id,
        doc_type='articles',
        body={'doc': {'doc_topics': doc_topics_text}},
    )

In [29]:
# LDA on the DUC 2001 texts: 10 topics, 20 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=duc_text, n_components=10, K=20)
pprint(topic_word_prob_LDA)



{0: [['the', 0.0018570420232274167],
     ['of', 0.003288641314358488],
     ['to', 0.0039496539689205775],
     ['oil', 0.02933457443570742],
     ['tb', 0.04202242842450929],
     ['in', 0.004088602261060036],
     ['was', 0.012833631543891957],
     ['jackson', 0.057682065874539556],
     ['that', 0.007406700237354416],
     ['and', 0.00424971890672715],
     ['also', 0.03318130154294373],
     ['fire', 0.020094819294447386],
     ['bonnie', 0.10453841755377404],
     ['chapter', 0.09972460394364024],
     ['shelved', 0.10563824308756195],
     ['eliminated', 0.100717270279717],
     ['flickering', 0.10280248017623754],
     ['ship', 0.06478339985378917],
     ['alarm', 0.09461786121263521],
     ['praise', 0.09880164655058207]],
 1: [['the', 0.0023548644013887277],
     ['to', 0.004610954543933583],
     ['in', 0.004873844442191849],
     ['of', 0.003715690509403417],
     ['and', 0.004846711209003752],
     ['said', 0.011081456804697644],
     ['bank', 0.02964028394936316],
     [

In [57]:
# Flatten every topic's [word, weight] pairs into one "word weight word weight ... " string
topic_list = [
    ''.join(word + ' ' + str(weight) + ' ' for word, weight in topic_word_prob_LDA[topic_id])
    for topic_id in range(len(topic_word_prob_LDA))
]
print(len(topic_list))

10


In [66]:
# Index topic word probability of DUC 2001 Dataset
for i in range(len(topic_list)):
    doc = {
        'topic_id': i,
        'top_words': topic_list[i]
    }
    es.index(index='topic_duc', id=i, doc_type='topic_word_prob', body=doc)

0 the 0.0018570420232274167 of 0.003288641314358488 to 0.0039496539689205775 oil 0.02933457443570742 tb 0.04202242842450929 in 0.004088602261060036 was 0.012833631543891957 jackson 0.057682065874539556 that 0.007406700237354416 and 0.00424971890672715 also 0.03318130154294373 fire 0.020094819294447386 bonnie 0.10453841755377404 chapter 0.09972460394364024 shelved 0.10563824308756195 eliminated 0.100717270279717 flickering 0.10280248017623754 ship 0.06478339985378917 alarm 0.09461786121263521 praise 0.09880164655058207 
1 the 0.0023548644013887277 to 0.004610954543933583 in 0.004873844442191849 of 0.003715690509403417 and 0.004846711209003752 said 0.011081456804697644 bank 0.02964028394936316 nafta 0.031267535010313936 poverty 0.04629235646632248 tunnel 0.033707129286244183 on 0.01236255339642782 from 0.015989413429719487 will 0.018287087856238705 is 0.009733465899740695 six 0.045717115665951866 british 0.04321674538903179 hurricane 0.019949391630021192 taylor 0.029465362982817697 xx 0.

In [29]:
# LDA on the DUC 2001 texts: 10 topics, 20 top words each
topic_word_prob_LDA = topic_word_probabilities_LDA(data=duc_text, n_components=10, K=20)
pprint(topic_word_prob_LDA)



{0: [['the', 0.0018570420232274167],
     ['of', 0.003288641314358488],
     ['to', 0.0039496539689205775],
     ['oil', 0.02933457443570742],
     ['tb', 0.04202242842450929],
     ['in', 0.004088602261060036],
     ['was', 0.012833631543891957],
     ['jackson', 0.057682065874539556],
     ['that', 0.007406700237354416],
     ['and', 0.00424971890672715],
     ['also', 0.03318130154294373],
     ['fire', 0.020094819294447386],
     ['bonnie', 0.10453841755377404],
     ['chapter', 0.09972460394364024],
     ['shelved', 0.10563824308756195],
     ['eliminated', 0.100717270279717],
     ['flickering', 0.10280248017623754],
     ['ship', 0.06478339985378917],
     ['alarm', 0.09461786121263521],
     ['praise', 0.09880164655058207]],
 1: [['the', 0.0023548644013887277],
     ['to', 0.004610954543933583],
     ['in', 0.004873844442191849],
     ['of', 0.003715690509403417],
     ['and', 0.004846711209003752],
     ['said', 0.011081456804697644],
     ['bank', 0.02964028394936316],
     [

In [57]:
# Flatten every topic's [word, weight] pairs into one "word weight word weight ... " string
topic_list = [
    ''.join(word + ' ' + str(weight) + ' ' for word, weight in topic_word_prob_LDA[topic_id])
    for topic_id in range(len(topic_word_prob_LDA))
]
print(len(topic_list))

10


In [66]:
# Index topic word probability of 20NG Dataset
for i in range(len(topic_list)):
    doc = {
        'topic_id': i,
        'top_words': topic_list[i]
    }
    es.index(index='topic_ng', id=i, doc_type='topic_word_prob', body=doc)

0 the 0.0018570420232274167 of 0.003288641314358488 to 0.0039496539689205775 oil 0.02933457443570742 tb 0.04202242842450929 in 0.004088602261060036 was 0.012833631543891957 jackson 0.057682065874539556 that 0.007406700237354416 and 0.00424971890672715 also 0.03318130154294373 fire 0.020094819294447386 bonnie 0.10453841755377404 chapter 0.09972460394364024 shelved 0.10563824308756195 eliminated 0.100717270279717 flickering 0.10280248017623754 ship 0.06478339985378917 alarm 0.09461786121263521 praise 0.09880164655058207 
1 the 0.0023548644013887277 to 0.004610954543933583 in 0.004873844442191849 of 0.003715690509403417 and 0.004846711209003752 said 0.011081456804697644 bank 0.02964028394936316 nafta 0.031267535010313936 poverty 0.04629235646632248 tunnel 0.033707129286244183 on 0.01236255339642782 from 0.015989413429719487 will 0.018287087856238705 is 0.009733465899740695 six 0.045717115665951866 british 0.04321674538903179 hurricane 0.019949391630021192 taylor 0.029465362982817697 xx 0.

In [1]:
# Remove the per-topic documents, one id per entry of topic_list.
# NOTE(review): this needs topic_list from an earlier cell (it fails with NameError on
# a fresh kernel), and the index name 'topic' differs from the 'topic_duc' / 'topic_ng'
# indices written above - confirm which index is meant.
for topic_id in range(len(topic_list)):
    es.delete(index='topic', id=topic_id, doc_type='topic_word_prob')

NameError: name 'topic_list' is not defined