# Hierarchical Multi-Document Summarization

Basic pipeline:
1. Vectorize all documents. Each document starts as its own cluster in hierarchical agglomerative clustering (HAC).
2. Merge the two clusters with the highest cosine similarity between them. Because this is HAC, any
   linkage method ("complete", "average", etc.) may be used to measure similarity between clusters.
3. Pick candidate summaries by mixing and matching sentences from each document. Picking good
   candidates (i.e. not the Cartesian product) is non-trivial, and one of many existing methods
   in the literature may be used. Vectorize the candidate summaries.
4. Pick the summary with the highest cosine similarity to the cluster average.
5. Repeat 2-4 until all clusters have merged.

In [3]:
# Preliminary
import os
import re
import pandas as pd
import gensim as g

# Root directory of the local project checkout. Defaults to the original
# hard-coded location; set the IOT_DIFF_BASE_PATH environment variable to run
# the notebook on another machine without editing this cell.
BASE_PATH = os.environ.get('IOT_DIFF_BASE_PATH', '/home/trevor/Projects/iot-diff/')
# Doc2Vec model file loaded below with Doc2Vec.load().
D2V_APNEWS_PATH = os.path.join(BASE_PATH, 'iot-tc/apnews_dbow/doc2vec.bin')
# CSV of CNET articles read below with pandas.
CNET_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/cnet-classifications.csv')
# DUC 2006: raw documents, plus the ROUGE "models" and "peers" directories.
DUC2006_RAW_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2006/duc2006_docs')
DUC2006_MODELS_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2006/NISTeval/ROUGE/models')
DUC2006_PEERS_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2006/NISTeval/ROUGE/peers')
# DUC 2007: same layout as above.
DUC2007_RAW_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2007/duc2007_testdocs/main')
DUC2007_MODELS_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2007/mainEval/ROUGE/models')
DUC2007_PEERS_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2007/mainEval/ROUGE/peers')
# Directory handed to the StanfordCoreNLP wrapper used for tokenization.
CORENLP_PATH = os.path.join(BASE_PATH, 'stanford-corenlp/')

def clean_duc_text(raw_text):
    """Extract the body of a raw DUC document and strip its inner markup.

    The text following the first ``<TEXT>``/``</TEXT>`` delimiter is kept, and
    every remaining tag inside it (e.g. ``<P>``) is replaced by a newline.

    Args:
        raw_text: Full contents of one raw document file.

    Returns:
        The document body with tags replaced by newline separators.

    Raises:
        IndexError: If ``raw_text`` contains no ``<TEXT>`` delimiter.
    """
    # Raw-string patterns: the previous literals relied on the invalid escape
    # "\/", which newer Python versions warn about. '/' needs no escaping.
    content = re.split(r'</?TEXT>', raw_text)[1]
    # '.' does not match newlines, so only tags contained on one line are removed.
    return '\n'.join(re.split(r'</?.*?>', content))

def load_duc_data(raw_path):
    """Read every document found under ``raw_path`` into a DataFrame.

    Each directory that directly contains files is treated as one topic, named
    after that directory; each file in it is one document.

    Args:
        raw_path: Root directory to walk.

    Returns:
        A ``pandas.DataFrame`` with columns ``topic``, ``document`` (the file
        name) and ``content`` (the file text passed through
        ``clean_duc_text``), one row per file in ``os.walk`` order.
    """
    topics, documents, contents = [], [], []
    for dir_path, _subdirs, file_names in os.walk(raw_path):
        # Directories holding no files (e.g. the top directory) contribute nothing.
        if not file_names:
            continue
        topic_name = os.path.basename(os.path.normpath(dir_path))
        for file_name in file_names:
            with open(os.path.join(dir_path, file_name)) as handle:
                cleaned = clean_duc_text(handle.read())
            topics.append(topic_name)
            documents.append(file_name)
            contents.append(cleaned)
    return pd.DataFrame.from_dict({
        'topic': topics,
        'document': documents,
        'content': contents,
    })

# Doc2Vec model loaded from disk; used in later cells via infer_vector().
doc2vec = g.models.doc2vec.Doc2Vec.load(D2V_APNEWS_PATH)
# CNET articles; the 'article_content' column is vectorized in a later cell.
cnet_df = pd.read_csv(CNET_PATH)
# One row per raw DUC document (columns: topic, document, content).
duc2006_df = load_duc_data(DUC2006_RAW_PATH)
duc2007_df = load_duc_data(DUC2007_RAW_PATH)

In [4]:
# Vectorize all documents. Each document represents a cluster at the start of HAC.
import stanfordcorenlp as corenlp
import nltk
#nltk.download('stopwords')
def preprocess(text):
    """Tokenize ``text`` and normalize the tokens for Doc2Vec inference.

    Steps: CoreNLP word tokenization, removal of tokens that contain no
    alphanumeric character (pure punctuation), lower-casing, and removal of
    NLTK English stop words.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lower-cased tokens in their original order.
    """
    # NOTE(review): a new CoreNLP client is created on every call, which is
    # presumably expensive — consider sharing one client across calls.
    nlp = corenlp.StanfordCoreNLP(CORENLP_PATH)
    try:
        tokens = nlp.word_tokenize(text)
    finally:
        # Always release the client, even when tokenization raises.
        nlp.close()
    # A set gives constant-time membership tests in the filter below.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Drop pure-punctuation tokens and lower-case the rest ...
    tokens = [token.lower() for token in tokens if any(c.isalnum() for c in token)]
    # ... then drop stop words (compared after lower-casing, as before).
    return [token for token in tokens if token not in stop_words]

In [8]:
import pickle

def _vectorize_and_save(df, text_column, out_path):
    """Infer a Doc2Vec vector for each row's text and pickle the resulting list.

    Progress is printed whenever a row's index label is a multiple of 100.
    Returns the list of inferred vectors in row order.
    """
    vectors = []
    for label, text in df[text_column].items():
        if label % 100 == 0:
            print('Iteration {}'.format(label))
        vectors.append(doc2vec.infer_vector(preprocess(text)))
    with open(out_path, 'wb') as out_file:
        pickle.dump(vectors, out_file)
    return vectors


# CNET
print('CNET')
cnet_docvecs = _vectorize_and_save(cnet_df, 'article_content', 'd2v_apnews_cnet.pkl')

# DUC 2006
print('\nDUC 2006')
duc2006_docvecs = _vectorize_and_save(duc2006_df, 'content', 'd2v_apnews_duc2006.pkl')

# DUC 2007
print('\nDUC 2007')
duc2007_docvecs = _vectorize_and_save(duc2007_df, 'content', 'd2v_apnews_duc2007.pkl')


DUC 2007
Iteration 0


Iteration 100


Iteration 200


Iteration 300


Iteration 400


Iteration 500


Iteration 600


Iteration 700


Iteration 800


Iteration 900


Iteration 1000


Iteration 1100


In [5]:
# Load the pre-computed DataFrames
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle can execute arbitrary code while loading; only open trusted files.
    with open(path, 'rb') as pickled:
        return pickle.load(pickled)


# NOTE(review): the CNET vectors are read from the parent directory, while the
# cell that computes them writes 'd2v_apnews_cnet.pkl' to the working
# directory — confirm which location is intended.
cnet_docvecs = _load_pickle('../d2v_apnews_cnet.pkl')
duc2006_docvecs = _load_pickle('d2v_apnews_duc2006.pkl')
duc2007_docvecs = _load_pickle('d2v_apnews_duc2007.pkl')

In [6]:
# Perform HAC. In this basic version, HAC is uninformed by summarization
# linkage_matrix is an Nx4 matrix, where each column is:
#   1) Constituent cluster 1
#   2) Constituent cluster 2
#   3) Distance between 1 and 2
#   4) Num original observations in resulting cluster
import numpy as np
from scipy.cluster.hierarchy import linkage

# Stack the DUC 2007 document vectors into a single array.
X = np.asarray(duc2007_docvecs)
# Average-linkage agglomerative clustering under cosine distance.
linkage_matrix = linkage(X, method='average', metric='cosine')

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Cophenetic correlation between the clustering and the pairwise cosine
# distances of the original vectors; the bare `c` displays it in the notebook.
c, coph_dists = cophenet(linkage_matrix, Y=pdist(X, metric='cosine'))
c

0.6081460206326501

In [7]:
# Perform multi-doc summarization at each step
from scipy.spatial.distance import cosine
import random
# TODO refactor notebook to turn this into a class
# Per-dataset bundle: DataFrame, on-disk locations, and document vectors.
datasets = {
    name: {
        'df': df,
        'raw_path': raw_path,
        'models_path': models_path,
        'peers_path': peers_path,
        'docvecs': docvecs,
    }
    for name, df, raw_path, models_path, peers_path, docvecs in (
        ('duc2006', duc2006_df, DUC2006_RAW_PATH, DUC2006_MODELS_PATH,
         DUC2006_PEERS_PATH, duc2006_docvecs),
        ('duc2007', duc2007_df, DUC2007_RAW_PATH, DUC2007_MODELS_PATH,
         DUC2007_PEERS_PATH, duc2007_docvecs),
    )
}

# TODO these are just placeholders
def get_candidate_sentences(doc_list, max_per_doc=3):
    """Collect the leading sentences of each document as summary candidates.

    Placeholder heuristic: each document is split on '.', and its first
    ``max_per_doc`` non-blank fragments are kept, stripped of surrounding
    whitespace. Blank fragments (e.g. the empty piece after a trailing '.')
    are skipped so they cannot end up as empty "sentences" in a summary.

    Args:
        doc_list: Iterable of document strings.
        max_per_doc: Maximum number of sentences taken from each document.

    Returns:
        Flat list of sentence strings, in document order.
    """
    sentences = []
    for doc in doc_list:
        fragments = (piece.strip() for piece in doc.split('.'))
        non_blank = [piece for piece in fragments if piece]
        sentences.extend(non_blank[:max_per_doc])
    return sentences
def get_candidate_summaries(sentences, num_summaries=5, sentences_per_summary=3):
    """Build random candidate summaries from a pool of sentences.

    Placeholder heuristic: each summary is ``sentences_per_summary`` sentences
    drawn at random (with replacement) and joined with '. '.

    Args:
        sentences: Pool of candidate sentence strings.
        num_summaries: Number of candidate summaries to generate.
        sentences_per_summary: Number of sentences drawn for each summary.

    Returns:
        List of summary strings; empty when ``sentences`` is empty, since
        ``random.choice`` cannot draw from an empty pool.
    """
    if not sentences:
        return []
    return [
        '. '.join(random.choice(sentences) for _ in range(sentences_per_summary))
        for _ in range(num_summaries)
    ]

In [42]:
# Compute summaries for each merge of HAC
from scipy.cluster.hierarchy import linkage

# best_summaries[dataset_name][merge_index] -> {'score': ..., 'summary': ...}
best_summaries = {}
for dataset_name, dataset in datasets.items():
    df = dataset['df']
    N = len(df)
    # Cluster each dataset on its own vectors. Reusing the linkage matrix built
    # from the DUC 2007 vectors for every dataset would pair cluster ids with
    # the wrong documents.
    dataset_vecs = np.asarray(dataset['docvecs'])
    dataset_linkage = linkage(dataset_vecs, 'average', 'cosine')
    contents = df['content'].tolist()
    best_summaries[dataset_name] = {}
    # clusters maps a cluster id to the row positions of its member documents.
    # Ids 0..N-1 are the original documents; merge i creates id N + i.
    clusters = {i: [i] for i in range(N)}
    for i, row in enumerate(dataset_linkage):
        # A summary is computed for every merge; only the progress output is
        # limited to every 100th merge.
        if i % 100 == 0:
            print('Iteration {}'.format(i))
        # The linkage matrix stores cluster ids as floats; make them ints
        # before using them as dictionary keys.
        members = clusters[int(row[0])] + clusters[int(row[1])]
        clusters[N + i] = members
        doc_list = [contents[m] for m in members]
        candidates = get_candidate_summaries(get_candidate_sentences(doc_list))
        # Evaluate each candidate and pick the one most similar to the cluster average
        cluster_avg = np.mean(dataset_vecs[members], axis=0)
        best_summary = {'score': float('-inf'), 'summary': ''}
        for candidate in candidates:
            candidate_vec = doc2vec.infer_vector(preprocess(candidate))
            # scipy's cosine() is a distance (1 - similarity). Convert it so
            # that a higher score means a better summary; maximizing the raw
            # distance would select the worst candidate.
            candidate_score = 1 - cosine(cluster_avg, candidate_vec)
            if candidate_score > best_summary['score']:
                best_summary['score'] = candidate_score
                best_summary['summary'] = candidate
        best_summaries[dataset_name][i] = {
            'score': best_summary['score'],
            'summary': best_summary['summary'],
        }

Iteration 0


Iteration 100


Iteration 200


Iteration 300


Iteration 400


Iteration 500


Iteration 600


Iteration 700


Iteration 800


Iteration 900


Iteration 1000


Iteration 1100


In [46]:
# Save best_summaries to disk.
with open('best_summaries_apnews_duc2007.pkl', 'wb') as summaries_file:
    pickle.dump(best_summaries, summaries_file)

In [8]:
# Reload the pickled summaries. pickle can run arbitrary code while loading,
# so only open files that are trusted.
with open('best_summaries_apnews_duc2007.pkl', 'rb') as summaries_file:
    best_summaries = pickle.load(summaries_file)

In [None]:
# Display the DUC 2006 DataFrame as the notebook cell's output.
datasets['duc2006']['df']

In [41]:
# Evaluate
# ROUGE (not hierarchical)
# Evaluation with ROUGE requires root (not just sudo), so the workflow is:
# 1) Place the summaries in each duc dataset's "peers" folder
# 2) Go to shell as root (using `su`) and run `./ROUGE-1.5.5.pl -n 2 -x -m -u -c 95 -r 1000 -f A -p 0.5 -t 0 -a -d rougejk.in`
# 3) Present the results

for dataset in datasets.values():
    # Stack the vectors into an (n_docs, dim) matrix; np.vstack works for any
    # embedding size instead of assuming 300 dimensions.
    docvecs = np.vstack(dataset['docvecs'])
    print(docvecs.shape)
    for topic_name, group in dataset['df'].groupby('topic'):
        print('Summarizing topic {}'.format(topic_name))
        # NOTE(review): index labels are used as row positions into docvecs,
        # which assumes the DataFrame keeps its default 0..n-1 index — confirm.
        topic_vec = np.mean(docvecs[group.index], axis=0)
        doc_list = group['content'].tolist()
        candidates = get_candidate_summaries(get_candidate_sentences(doc_list))
        # cosine() is a distance, so the best candidate has the smallest
        # value. Start from +inf so any finite distance can win.
        best_summary = {'score': float('inf'), 'summary': ''}
        for candidate in candidates:
            candidate_vec = doc2vec.infer_vector(preprocess(candidate))
            candidate_score = cosine(topic_vec, candidate_vec)
            if candidate_score < best_summary['score']:
                best_summary['score'] = candidate_score
                best_summary['summary'] = candidate
        # One '<topic>.ours' file per topic, placed in the ROUGE peers directory.
        file_path = os.path.join(dataset['peers_path'], '{}.ours'.format(topic_name))
        with open(file_path, 'w') as f:
            f.write(best_summary['summary'])

(1250, 300)
Summarizing topic D0601A


Summarizing topic D0602B


Summarizing topic D0603C


Summarizing topic D0604D


Summarizing topic D0605E


Summarizing topic D0606F


Summarizing topic D0607G


Summarizing topic D0608H


Summarizing topic D0609I


Summarizing topic D0610A


Summarizing topic D0611B


Summarizing topic D0612C


Summarizing topic D0613D


Summarizing topic D0614E


Summarizing topic D0615F


Summarizing topic D0616G


Summarizing topic D0617H


Summarizing topic D0618I


Summarizing topic D0619A


Summarizing topic D0620B


Summarizing topic D0621C


Summarizing topic D0622D


Summarizing topic D0623E


Summarizing topic D0624F


Summarizing topic D0625G


Summarizing topic D0626H


Summarizing topic D0627I


Summarizing topic D0628A


Summarizing topic D0629B


Summarizing topic D0630C


Summarizing topic D0631D


Summarizing topic D0632E


Summarizing topic D0633F


Summarizing topic D0634G


Summarizing topic D0635H


Summarizing topic D0636I


Summarizing topic D0637A


Summarizing topic D0638B


Summarizing topic D0639C


Summarizing topic D0640D


Summarizing topic D0641E


Summarizing topic D0642F


Summarizing topic D0643G


Summarizing topic D0644H


Summarizing topic D0645I


Summarizing topic D0646A


Summarizing topic D0647B


Summarizing topic D0648C


Summarizing topic D0649D


Summarizing topic D0650E


(1125, 300)
Summarizing topic D0701A


Summarizing topic D0702A


Summarizing topic D0703A


Summarizing topic D0704A


Summarizing topic D0705A


Summarizing topic D0706B


Summarizing topic D0707B


Summarizing topic D0708B


Summarizing topic D0709B


Summarizing topic D0710C


Summarizing topic D0711C


Summarizing topic D0712C


Summarizing topic D0713C


Summarizing topic D0714D


Summarizing topic D0715D


Summarizing topic D0716D


Summarizing topic D0717D


Summarizing topic D0718D


Summarizing topic D0719E


Summarizing topic D0720E


Summarizing topic D0721E


Summarizing topic D0722E


Summarizing topic D0723F


Summarizing topic D0724F


Summarizing topic D0725F


Summarizing topic D0726F


Summarizing topic D0727G


Summarizing topic D0728G


Summarizing topic D0729G


Summarizing topic D0730G


Summarizing topic D0731G


Summarizing topic D0732H


Summarizing topic D0733H


Summarizing topic D0734H


Summarizing topic D0735H


Summarizing topic D0736H


Summarizing topic D0737I


Summarizing topic D0738I


Summarizing topic D0739I


Summarizing topic D0740I


Summarizing topic D0741I


Summarizing topic D0742J


Summarizing topic D0743J


Summarizing topic D0744J


Summarizing topic D0745J


In [49]:
# Create the input files for ROUGE for each dataset
for dataset_name, dataset in datasets.items():
    # topics maps a topic id to its peer file name and list of model file names.
    topics = {}
    for _, _, files in os.walk(dataset['peers_path']):
        # TODO update if we have more than one peer
        for name in files:
            # Only the summaries written by this notebook ('<topic>.ours') are
            # evaluated; any other file in the peers directory is ignored.
            if not name.endswith('.ours'):
                continue
            # Drop the extension and the topic's trailing letter
            # (e.g. 'D0601A.ours' -> 'D0601') to get the key model files share.
            topic = name.split('.')[0][:-1]
            topics[topic] = {'peers': name, 'models': []}
    for _, _, files in os.walk(dataset['models_path']):
        # Accumulate across every walked directory instead of overwriting, and
        # look each file up once rather than rescanning the files per topic.
        for model_name in files:
            model_topic = model_name.split('.')[0]
            if model_topic in topics:
                topics[model_topic]['models'].append(model_name)
    # Now write the results
    with open('rouge_{}.in'.format(dataset_name), 'w') as rouge_file:
        rouge_file.write('<ROUGE_EVAL version="1.5.5">\n')
        # Sorted so the generated file does not depend on directory listing order.
        for topic, entry in sorted(topics.items()):
            rouge_file.write('<EVAL ID="{}">\n'.format(topic))
            rouge_file.write('<PEER-ROOT>\n{}\n</PEER-ROOT>\n'.format(dataset['peers_path']))
            rouge_file.write('<MODEL-ROOT>\n{}\n</MODEL-ROOT>\n'.format(dataset['models_path']))
            rouge_file.write('<INPUT-FORMAT TYPE="SPL">\n')
            rouge_file.write('</INPUT-FORMAT>\n')
            rouge_file.write('<PEERS>\n')
            rouge_file.write('<P ID="{}">{}</P>\n'.format('ours', entry['peers']))
            rouge_file.write('</PEERS>\n')
            rouge_file.write('<MODELS>\n')
            for model in sorted(entry['models']):
                # The model id is the last dot-separated field of the file name.
                rouge_file.write('<M ID="{}">{}</M>\n'.format(model.split('.')[-1], model))
            rouge_file.write('</MODELS>\n')
            rouge_file.write('</EVAL>\n')
        rouge_file.write('</ROUGE_EVAL>\n')