In [1]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import data
import utils
from main import extract_summary, report_rouge_scores

%load_ext autoreload
%autoreload 2

### Cornell Newsroom Summarization Dataset

Data are downloaded from the [Cornell Newsroom summarization dataset](https://summari.es/). We use the development set, select the *extractive* summaries for our task, and only include articles whose summary has 5 or more sentences.

In [2]:
# Convert the Newsroom JSON Lines file (dev.jsonl) to a CSV file.
# Uncomment and run the line below only if 'news_dev.csv' is not available yet.
# data.newsroom_json2csv()

In [3]:
# Read the pre-converted development split and report how many articles it holds.
news = pd.read_csv(
    '../data/newsroom/news_dev.csv',
)
print('# of articles:', len(news))

# Preview the first rows (title, reference summary, body text).
news.head()

# of articles: 2566


Unnamed: 0,title,reference,text
0,NEW YORKERS' ONLY REGRET WAS STAYING HOME,As many black men marched on Washington yester...,"This story was reported by: NICK CHARLES, AUST..."
1,Music review: Jake Bugg at the House of Blues,As the lights went down at the nearly sold-out...,As the lights went down at the nearly sold-out...
2,"HELP IS URGED FOR 36,000 HOMELESS IN CITY'S ST...",A yearlong study by the Community Service Soci...,A yearlong study by the Community Service Soci...
3,Broadway - An early contender for 1982-83 - 'I...,THE new Broadway season is barely out of the s...,THE new Broadway season is barely out of the s...
4,CIRCUS FINDS ARENA GOOD PLACE TO PLAY,EAST RUTHERFORD YOUNGSTERS squealed with laugh...,EAST RUTHERFORD YOUNGSTERS squealed with laugh...


### Summarization

- Summarization algorithms include:
    - SMRS (TF-IDF matrix)
    - Frank-Wolfe (TF-IDF matrix)
    - Frank-Wolfe (Sentence embeddings matrix)
- *MATLAB* and the *MATLAB Engine API for Python* are required to run the SMRS method. Remove `'SMRS'` from the `methods` list below if MATLAB is not installed.

- Main function: `extract_summary()`

```python
# Arguments:
#     - doc: string; article body text
#     - ref: string; reference summary
#     - title: string; title of the article
#     - k: number of extracted exemplars
#     - print_summary: print summary text for each algorithm
#     - report_rouge: report rouge score (need to pass in ref argument)
#     - print_rouge: print the rouge scores for each algorithm
#     - rouge_embed: use word embedding to calculate rouge score
#     - vectorize_scores: return scores in np.ndarray instead of in a dictionary
#     - methods: summarization algorithms to be used
# Return:
#     - summary: dictionary; extracted summary sentences using each algorithm
#     - word_count: dictionary; number of words in the extracted summary
#     - runtime: computation time of each algorithm
#     - scores: rouge score of each algorithm
        
summary, word_count, runtime, scores = extract_summary(doc, ref=None, title=None, k=5, print_summary=False, 
                                                       report_rouge=False, print_rouge=True, rouge_embed=False, 
                                                       vectorize_scores=False, methods=['random', 'SMRS', 'tfidf', 'embed']);

```

In [4]:
# Fetch the titles, reference summaries, and article bodies from the data helper;
# the three results are indexed together by article position in the cells below.
(news_titles,
 news_refs,
 news_text) = data.get_newsroom_data()

In [26]:
# Choose one article by position and pull out its title, reference, and body.
doc_idx = 2
title, ref, doc = news_titles[doc_idx], news_refs[doc_idx], news_text[doc_idx]

# Number of exemplar sentences to extract, and the algorithms to compare.
# Drop 'SMRS' from the list when MATLAB is not installed.
k = 5
methods = ['SMRS', 'tfidf', 'embed']

# Print the extracted summary of every method; ROUGE is reported in later cells.
extract_summary(
    doc,
    ref=ref,
    title=title,
    k=k,
    methods=methods,
    report_rouge=False,
    print_summary=True,
);

# sentence: 27, # vocab: 400
# of selected exemplar: 5

Title: HELP IS URGED FOR 36,000 HOMELESS IN CITY'S STREETS

A yearlong study by the Community Service Society of New York has concluded that the problem of homeless people on the streets of the city has ''reached such extraordinary proportions'' that emergency housing must be set up.   The study, to be made public tomorrow, says government agencies have failed to face the problem of the homeless and have made it even worse with the state's program of discharging many patients from mental institutions into communities.   The homeless are found in almost every part of the city. Sometimes they are seen shuffling along the streets or crouched in doorways seeking temporary shelter from wind and rain. They inhabit the bus and railroad stations and subways until they are chased away by the police.
-----
Word count:122

Poignant scene at garbage can. Sometimes they are seen shuffling along the streets or crouched in doorways seeking tempo

### ROUGE Score

In [6]:
%%time
extract_summary(doc, ref, title, k=k, report_rouge=True, rouge_embed=False, 
                methods=methods, print_summary=False, print_rouge=True);



SMRS
Overlap 1-gram 			F1: 0.289
Overlap 1-gram 			Precision: 0.315
Overlap 1-gram 			Recall: 0.267
Overlap bi-gram 		F1: 0.182
Overlap bi-gram 		Precision: 0.212
Overlap bi-gram 		Recall: 0.159
Longest Common Subsequence 	F1: 0.286
Longest Common Subsequence 	Precision: 0.315
Longest Common Subsequence 	Recall: 0.267

tfidf
Overlap 1-gram 			F1: 0.290
Overlap 1-gram 			Precision: 0.242
Overlap 1-gram 			Recall: 0.360
Overlap bi-gram 		F1: 0.114
Overlap bi-gram 		Precision: 0.096
Overlap bi-gram 		Recall: 0.142
Longest Common Subsequence 	F1: 0.218
Longest Common Subsequence 	Precision: 0.195
Longest Common Subsequence 	Recall: 0.291

embed
Overlap 1-gram 			F1: 0.365
Overlap 1-gram 			Precision: 0.397
Overlap 1-gram 			Recall: 0.337
Overlap bi-gram 		F1: 0.267
Overlap bi-gram 		Precision: 0.303
Overlap bi-gram 		Recall: 0.239
Longest Common Subsequence 	F1: 0.348
Longest Common Subsequence 	Precision: 0.384
Longest Common Subsequence 	Recall: 0.326
CPU times: user 8.76 s, sys: 429 m

### Word Embedding ROUGE Score

In [19]:
%%time
extract_summary(doc, ref, title, k=k, report_rouge=True, rouge_embed=True, 
                methods=methods, print_summary=False, print_rouge=True);



SMRS
Overlap 1-gram 			F1: 0.742
Overlap 1-gram 			Precision: 0.777
Overlap 1-gram 			Recall: 0.710
Overlap bi-gram 		F1: 0.793
Overlap bi-gram 		Precision: 0.809
Overlap bi-gram 		Recall: 0.778
Longest Common Subsequence 	F1: 0.758
Longest Common Subsequence 	Precision: 0.836
Longest Common Subsequence 	Recall: 0.710

tfidf
Overlap 1-gram 			F1: 0.748
Overlap 1-gram 			Precision: 0.726
Overlap 1-gram 			Recall: 0.771
Overlap bi-gram 		F1: 0.794
Overlap bi-gram 		Precision: 0.780
Overlap bi-gram 		Recall: 0.808
Longest Common Subsequence 	F1: 0.577
Longest Common Subsequence 	Precision: 0.518
Longest Common Subsequence 	Recall: 0.771

embed
Overlap 1-gram 			F1: 0.766
Overlap 1-gram 			Precision: 0.803
Overlap 1-gram 			Recall: 0.732
Overlap bi-gram 		F1: 0.820
Overlap bi-gram 		Precision: 0.838
Overlap bi-gram 		Recall: 0.803
Longest Common Subsequence 	F1: 0.782
Longest Common Subsequence 	Precision: 0.862
Longest Common Subsequence 	Recall: 0.732
CPU times: user 1min 34s, sys: 3.9

### ROUGE Score Across Documents

In [33]:
# Evaluate a contiguous batch of articles: `num_articles` items beginning at `start`.
k = 5
start, num_articles = 20, 20

# One shared slice keeps the three parallel sequences aligned.
batch = slice(start, start + num_articles)
articles = news_text[batch]
references = news_refs[batch]
titles = news_titles[batch]

In [34]:
%%time
rouge_mean, rouge_median, rouge_std = report_rouge_scores(articles, references, titles, k, methods=methods)

index =  ['1-gram F1', '1-gram Precision', '1-gram Recall', 'bi-gram F1', 'bi-gram Precision', 'bi-gram Recall', 
          'longest common F1', 'longest common Precision', 'longest common Recall', 'runtime', 'word count']

print('=' * 22 + ' Mean ' + '=' * 22)
rouge_mean.index = index
display(rouge_mean)

# print('=' * 21 + ' Median ' + '=' * 21)
# rouge_median.index = index
# display(rouge_median)

# print('=' * 15 + ' Standard Deviation ' + '=' * 15)
# rouge_std.index = index
# display(rouge_std)



Unnamed: 0,SMRS,tfidf,embed
1-gram F1,0.179161,0.250885,0.251754
1-gram Precision,0.323557,0.283636,0.46176
1-gram Recall,0.15344,0.243489,0.187242
bi-gram F1,0.090849,0.102822,0.147635
bi-gram Precision,0.151322,0.111961,0.284388
bi-gram Recall,0.078691,0.100405,0.106724
longest common F1,0.15081,0.221773,0.198754
longest common Precision,0.320464,0.262709,0.450307
longest common Recall,0.151265,0.225166,0.183257
runtime,0.591946,0.008317,0.002086


CPU times: user 3min 19s, sys: 9.84 s, total: 3min 29s
Wall time: 4min 33s


### Word Embedding ROUGE Score Across Documents

In [32]:
%%time
rouge_mean_embed, rouge_median_embed, rouge_std_embed = report_rouge_scores(articles, references, titles, k, 
                                                                            rouge_embed=True, methods=methods)

print('=' * 22 + ' Mean ' + '=' * 22)
rouge_mean_embed.index = index
display(rouge_mean_embed)

# print('=' * 21 + ' Median ' + '=' * 21)
# rouge_median_embed.index = index
# display(rouge_median_embed)

# print('=' * 15 + ' Standard Deviation ' + '=' * 15)
# rouge_std_embed.index = index
# display(rouge_std_embed)



Unnamed: 0,SMRS,tfidf,embed
1-gram F1,0.675342,0.719069,0.704342
1-gram Precision,0.754768,0.727654,0.794987
1-gram Recall,0.615283,0.71431,0.636428
bi-gram F1,0.737543,0.780718,0.761497
bi-gram Precision,0.791181,0.787599,0.812032
bi-gram Recall,0.692878,0.775532,0.718319
longest common F1,0.6678,0.685582,0.686588
longest common Precision,1.676865,0.885565,1.918299
longest common Recall,0.615283,0.71431,0.636428
runtime,0.445395,0.004235,0.001116


CPU times: user 15min 50s, sys: 49.6 s, total: 16min 39s
Wall time: 17min 13s
