# Hierarchical Multi-Document Summarization

Basic pipeline:
1. Vectorize all documents. Each document starts as its own cluster in hierarchical agglomerative
   clustering (HAC).
2. Merge the two clusters with the highest cosine similarity between them. Because this is HAC, the
   inter-cluster similarity may be computed with "complete", "average", or another linkage method.
3. Pick candidate summaries by mixing and matching sentences from each document. Picking good
   candidates (i.e. not the Cartesian product) is non-trivial, and one of many existing methods
   in the literature may be used. Vectorize the candidate summaries.
4. Pick the summary with the highest cosine similarity to the cluster average.
5. Repeat 2-4 until all clusters have merged.

In [1]:
# Preliminary
import os
import re
import pandas as pd
import gensim as g

# Root directory that every resource path below is joined onto.
# NOTE(review): this is an absolute, machine-specific path — adjust per environment.
BASE_PATH = '/home/trevor/Projects/iot-diff/'
# Serialized Doc2Vec model, loaded with Doc2Vec.load().
D2V_APNEWS_PATH = os.path.join(BASE_PATH, 'iot-tc/apnews_dbow/doc2vec.bin')
# CSV file read with pandas.read_csv() to build the CNET DataFrame.
DATA_CNET_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/cnet-classifications.csv')
# Directory trees of raw DUC documents, walked by load_duc_data().
DATA_DUC2006_RAW_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2006/duc2006_docs')
DATA_DUC2007_RAW_PATH = os.path.join(BASE_PATH, 'iot-tc/datasets/duc2007/duc2007_testdocs/main')
# Location handed to the StanfordCoreNLP client used for tokenization.
CORENLP_PATH = os.path.join(BASE_PATH, 'stanford-corenlp/')

def clean_duc_text(raw_text):
    """Return the body of a raw DUC document with its markup tags removed.

    The text between the first ``<TEXT>`` tag and the next ``<TEXT>`` /
    ``</TEXT>`` tag is kept. Every ``<...>`` tag inside it is then treated as
    a separator, and the remaining pieces are joined with newlines.

    Args:
        raw_text: Full contents of one DUC document file.

    Returns:
        The tag-free body text; each removed tag becomes a newline.

    Raises:
        IndexError: If ``raw_text`` contains no ``<TEXT>`` / ``</TEXT>`` tag.
    """
    # Raw strings keep the pattern unchanged while avoiding the invalid "\/"
    # string escape that non-raw literals warn about.
    content = re.split(r'<\/?TEXT>', raw_text)[1]
    # ".*?" does not cross line breaks, so only single-line tags are stripped.
    return '\n'.join(re.split(r'<\/?(?:.*?)>', content))

def load_duc_data(raw_path):
    """Read every file under *raw_path* into a DataFrame of cleaned DUC documents.

    Each directory that directly contains files is treated as one topic: the
    directory's base name becomes the ``topic`` value for all of its files.

    Args:
        raw_path: Root directory to walk.

    Returns:
        A DataFrame with the columns ``topic``, ``document`` (the file name)
        and ``content`` (the file text passed through ``clean_duc_text``).
    """
    topics, documents, contents = [], [], []
    # NOTE(review): rows come out in os.walk order, which is not sorted here.
    # Vectors stored elsewhere are matched to these rows by position — confirm
    # the order is stable across runs.
    for folder, _subdirs, filenames in os.walk(raw_path):
        if not filenames:
            # Directories without files of their own (e.g. the top directory)
            # contribute no rows.
            continue
        topic_name = os.path.basename(os.path.normpath(folder))
        for filename in filenames:
            with open(os.path.join(folder, filename)) as handle:
                raw = handle.read()
            topics.append(topic_name)
            documents.append(filename)
            contents.append(clean_duc_text(raw))
    return pd.DataFrame.from_dict(
        {'topic': topics, 'document': documents, 'content': contents}
    )

# Doc2Vec model used to infer vectors for documents and candidate summaries.
doc2vec = g.models.doc2vec.Doc2Vec.load(D2V_APNEWS_PATH)

# The CNET articles live in a single CSV file.
cnet_df = pd.read_csv(DATA_CNET_PATH)

# Each DUC corpus is a directory tree of raw documents (one folder per topic).
duc2006_df, duc2007_df = (
    load_duc_data(corpus_dir)
    for corpus_dir in (DATA_DUC2006_RAW_PATH, DATA_DUC2007_RAW_PATH)
)

In [32]:
# Vectorize all documents. Each document represents a cluster at the start of HAC.
import stanfordcorenlp as corenlp
import nltk
#nltk.download('stopwords')
def preprocess(text, nlp=None):
    """Tokenize *text* and normalize the tokens.

    Steps, in order: tokenize with Stanford CoreNLP, drop tokens that contain
    no alphanumeric character (pure punctuation), lower-case, and remove
    English stop words. The NLTK ``stopwords`` corpus must be available
    locally.

    Args:
        text: The string to tokenize.
        nlp: Optional, already-open tokenizer exposing ``word_tokenize(text)``
            (e.g. a ``StanfordCoreNLP`` client). When given it is reused and
            left open, so callers processing many texts avoid starting a new
            client per call. When omitted, a client is created from
            ``CORENLP_PATH`` and closed before returning.

    Returns:
        A list of lower-cased, non-stop-word tokens.
    """
    owns_client = nlp is None
    if owns_client:
        nlp = corenlp.StanfordCoreNLP(CORENLP_PATH)
    try:
        tokens = nlp.word_tokenize(text)
    finally:
        # Always release a client created here, even if tokenization fails.
        if owns_client:
            nlp.close()
    # Remove punctuation-only tokens, then lower-case the rest.
    tokens = [token.lower() for token in tokens if any(c.isalnum() for c in token)]
    # A set gives constant-time membership tests; the result is unchanged.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

In [8]:
import pickle

def _infer_and_save_docvecs(frame, text_column, out_path):
    """Infer a Doc2Vec vector for every row of *frame* and pickle the result.

    Args:
        frame: DataFrame holding one document per row.
        text_column: Name of the column containing the document text.
        out_path: File the list of vectors is pickled to.

    Returns:
        The list of inferred vectors, in row order.
    """
    docvecs = []
    for i, text in frame[text_column].items():
        # Progress report, keyed on the row's index label.
        if i % 100 == 0:
            print('Iteration {}'.format(i))
        docvecs.append(doc2vec.infer_vector(preprocess(text)))
    with open(out_path, 'wb') as f:
        pickle.dump(docvecs, f)
    return docvecs


# CNET
print('CNET')
cnet_docvecs = _infer_and_save_docvecs(
    cnet_df, 'article_content', 'd2v_apnews_cnet.pkl')

# DUC 2006
print('\nDUC 2006')
duc2006_docvecs = _infer_and_save_docvecs(
    duc2006_df, 'content', 'd2v_apnews_duc2006.pkl')

# DUC 2007
print('\nDUC 2007')
duc2007_docvecs = _infer_and_save_docvecs(
    duc2007_df, 'content', 'd2v_apnews_duc2007.pkl')


DUC 2007
Iteration 0


Iteration 100


Iteration 200


Iteration 300


Iteration 400


Iteration 500


Iteration 600


Iteration 700


Iteration 800


Iteration 900


Iteration 1000


Iteration 1100


In [2]:
# Load the pre-computed document vectors (pickled lists, one vector per document)
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the CNET vectors are read from the parent directory, whereas the
# cell that computes them writes 'd2v_apnews_cnet.pkl' to the working directory —
# confirm which location is intended.
cnet_docvecs = _load_pickle('../d2v_apnews_cnet.pkl')
duc2006_docvecs = _load_pickle('d2v_apnews_duc2006.pkl')
duc2007_docvecs = _load_pickle('d2v_apnews_duc2007.pkl')

In [13]:
# Perform HAC. In this basic version, HAC is uninformed by summarization
# linkage_matrix is an (N-1)x4 matrix (one row per merge of the N documents), where each column is:
#   1) Constituent cluster 1
#   2) Constituent cluster 2
#   3) Distance between 1 and 2
#   4) Num original observations in resulting cluster
import numpy as np
from scipy.cluster.hierarchy import linkage

# One row per document vector.
X = np.asarray(duc2007_docvecs)
# Average-linkage agglomerative clustering under cosine distance.
linkage_matrix = linkage(X, method='average', metric='cosine')

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Cophenetic correlation: how well the dendrogram's distances agree with the
# original pairwise cosine distances.
pairwise_cosine = pdist(X, 'cosine')
c, coph_dists = cophenet(linkage_matrix, pairwise_cosine)
c

0.6081460206326501

In [42]:
# Perform multi-doc summarization at each step
from scipy.spatial.distance import cosine
import random

# Corpus to summarize. Its rows are selected by position using the same indices
# that select rows of X, so df and X must list the documents in the same order.
df = duc2007_df

# TODO placeholders, these are awful
def get_candidate_sentences(doc_list):
    """Collect the leading "sentences" of every document (placeholder heuristic).

    Each document is split on ``'.'`` and at most its first three pieces are
    kept, exactly as produced by the split (no stripping, empty pieces kept).

    Args:
        doc_list: Iterable of document strings.

    Returns:
        A flat list of the selected pieces, in document order.
    """
    return [piece for doc in doc_list for piece in doc.split('.')[0:3]]

def get_candidate_summaries(sentences):
    """Build five random three-sentence candidate summaries (placeholder heuristic).

    Sentences are drawn with replacement via ``random.choice`` and joined
    with ``'. '``.

    Args:
        sentences: Non-empty sequence of sentence strings to draw from.

    Returns:
        A list of five candidate summary strings.
    """
    summaries = []
    for _summary_idx in range(5):
        picks = []
        for _pick_idx in range(3):
            picks.append(random.choice(sentences))
        summaries.append('. '.join(picks))
    return summaries

# Replay the HAC merges in order; after each merge, pick the candidate summary
# whose vector is most similar to the average vector of the merged cluster.
N = len(df)
# Merge step -> {'score': cosine similarity, 'summary': chosen text}.
best_summaries = {}
# Cluster id -> positions of its member documents. Ids 0..N-1 are the single
# documents; the cluster created by merge step i gets id N + i.
clusters = {i: [i] for i in range(N)}
for i, row in enumerate(linkage_matrix):
    if i % 100 == 0:
        print('Iteration {}'.format(i))
    # The linkage matrix stores cluster ids as floats; make them real ints
    # before using them as dictionary keys.
    left, right = int(row[0]), int(row[1])
    members = clusters[left] + clusters[right]
    clusters[N + i] = members
    doc_list = df['content'].iloc[members].tolist()
    candidates = get_candidate_summaries(get_candidate_sentences(doc_list))
    # Evaluate each candidate and pick the best one
    cluster_avg = np.mean(X[members], axis=0)
    best_summary = {'score': float('-inf'), 'summary': ''}
    for candidate in candidates:
        candidate_vec = doc2vec.infer_vector(preprocess(candidate))
        # scipy's `cosine` is a *distance* (1 - similarity). Convert it so that
        # a higher score means a closer match; maximizing the raw distance
        # would select the least similar candidate.
        candidate_score = 1 - cosine(cluster_avg, candidate_vec)
        if candidate_score > best_summary['score']:
            best_summary['score'] = candidate_score
            best_summary['summary'] = candidate
    best_summaries[i] = {'score': best_summary['score'], 'summary': best_summary['summary']}

Iteration 0


Iteration 100


Iteration 200


Iteration 300


Iteration 400


Iteration 500


Iteration 600


Iteration 700


Iteration 800


Iteration 900


Iteration 1000


Iteration 1100


In [46]:
# Persist the per-merge best summaries so later cells can reuse them.
with open('best_summaries_apnews_duc2007.pkl', 'wb') as out_file:
    pickle.dump(best_summaries, out_file)