In [1]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import data
import utils
from main import extract_summary, report_rouge_scores

%load_ext autoreload
%autoreload 2

### Cornell Newsroom Summarization Dataset

Data are downloaded from the [Cornell Newsroom summarization dataset](https://summari.es/). We use the development set, select the *extractive* summaries for our task, and only include articles whose reference summary has 5 or more sentences.

In [2]:
# Convert the Newsroom JSON Lines file (dev.jsonl) to a CSV file (uncomment and run the line below if 'news_dev.csv' is not available)
# data.newsroom_json2csv()

In [3]:
# Load the pre-converted Newsroom development set, report how many articles
# it holds, and preview the first rows.
news_csv_path = '../data/newsroom/news_dev.csv'
news = pd.read_csv(news_csv_path)
print(f'# of articles: {len(news)}')
news.head()

# of articles: 2566


Unnamed: 0,title,reference,text
0,NEW YORKERS' ONLY REGRET WAS STAYING HOME,As many black men marched on Washington yester...,"This story was reported by: NICK CHARLES, AUST..."
1,Music review: Jake Bugg at the House of Blues,As the lights went down at the nearly sold-out...,As the lights went down at the nearly sold-out...
2,"HELP IS URGED FOR 36,000 HOMELESS IN CITY'S ST...",A yearlong study by the Community Service Soci...,A yearlong study by the Community Service Soci...
3,Broadway - An early contender for 1982-83 - 'I...,THE new Broadway season is barely out of the s...,THE new Broadway season is barely out of the s...
4,CIRCUS FINDS ARENA GOOD PLACE TO PLAY,EAST RUTHERFORD YOUNGSTERS squealed with laugh...,EAST RUTHERFORD YOUNGSTERS squealed with laugh...


### Summarization

- Methods include:
    - SMRS (TF-IDF matrix)
    - Frank-Wolfe (TF-IDF matrix)
    - Frank-Wolfe (Sentence embeddings matrix)
- *MATLAB* and the *MATLAB Engine API for Python* are required to run the SMRS method. Remove `'SMRS'` from the `methods` list below if MATLAB is not installed.

In [4]:
# Get list of titles, reference summaries, and body text.
# The three results are used as parallel sequences: later cells index and
# slice all of them with the same position to pick matching title / reference /
# article text.
# NOTE(review): presumably read from the same 'news_dev.csv' previewed above —
# confirm in data.get_newsroom_data.
news_titles, news_refs, news_text = data.get_newsroom_data()

In [5]:
# Choose one article and pull its body text, reference summary and title
# from the parallel lists loaded above.
doc_idx = 2
doc, ref, title = news_text[doc_idx], news_refs[doc_idx], news_titles[doc_idx]

# k is passed to extract_summary as the summary size (presumably the number of
# exemplar sentences to select — the run below reports 5 selected exemplars).
k = 5
# Summarization methods to run; drop 'SMRS' if MATLAB is not installed.
methods = ['SMRS', 'tfidf', 'embed']

# Print the extracted summaries only; ROUGE scoring is done in later cells.
# The trailing ';' keeps the returned value out of the cell output.
extract_summary(
    doc, ref, title,
    k=k,
    report_rouge=False,
    methods=methods,
    print_summary=True,
);

# sentence: 27, # vocab: 400
# of selected exemplar: 5

Title: HELP IS URGED FOR 36,000 HOMELESS IN CITY'S STREETS

A yearlong study by the Community Service Society of New York has concluded that the problem of homeless people on the streets of the city has ''reached such extraordinary proportions'' that emergency housing must be set up.   The study, to be made public tomorrow, says government agencies have failed to face the problem of the homeless and have made it even worse with the state's program of discharging many patients from mental institutions into communities.   The homeless are found in almost every part of the city. Sometimes they are seen shuffling along the streets or crouched in doorways seeking temporary shelter from wind and rain. They inhabit the bus and railroad stations and subways until they are chased away by the police.

Poignant scene at garbage can. Sometimes they are seen shuffling along the streets or crouched in doorways seeking temporary shelter from win

### ROUGE Score

In [6]:
%%time
summary, runtime, scores = extract_summary(doc, ref, title, k=k, report_rouge=True, rouge_embed=False, 
                                           methods=methods, print_summary=False, print_rouge=True);



SMRS
Overlap 1-gram 			F1: 0.289
Overlap 1-gram 			Precision: 0.315
Overlap 1-gram 			Recall: 0.267
Overlap bi-gram 		F1: 0.182
Overlap bi-gram 		Precision: 0.212
Overlap bi-gram 		Recall: 0.159
Longest Common Subsequence 	F1: 0.286
Longest Common Subsequence 	Precision: 0.315
Longest Common Subsequence 	Recall: 0.267

tfidf
Overlap 1-gram 			F1: 0.290
Overlap 1-gram 			Precision: 0.242
Overlap 1-gram 			Recall: 0.360
Overlap bi-gram 		F1: 0.114
Overlap bi-gram 		Precision: 0.096
Overlap bi-gram 		Recall: 0.142
Longest Common Subsequence 	F1: 0.218
Longest Common Subsequence 	Precision: 0.195
Longest Common Subsequence 	Recall: 0.291

embed
Overlap 1-gram 			F1: 0.365
Overlap 1-gram 			Precision: 0.397
Overlap 1-gram 			Recall: 0.337
Overlap bi-gram 		F1: 0.267
Overlap bi-gram 		Precision: 0.303
Overlap bi-gram 		Recall: 0.239
Longest Common Subsequence 	F1: 0.348
Longest Common Subsequence 	Precision: 0.384
Longest Common Subsequence 	Recall: 0.326
CPU times: user 8.76 s, sys: 429 m

### Word Embedding ROUGE Score

In [7]:
%%time
summary, runtime, scores = extract_summary(doc, ref, title, k=k, report_rouge=True, rouge_embed=True, 
                                           methods=methods, print_summary=False, print_rouge=True);



SMRS
Overlap 1-gram 			F1: 0.742
Overlap 1-gram 			Precision: 0.777
Overlap 1-gram 			Recall: 0.710
Overlap bi-gram 		F1: 0.077
Overlap bi-gram 		Precision: 0.077
Overlap bi-gram 		Recall: 0.076

tfidf
Overlap 1-gram 			F1: 0.748
Overlap 1-gram 			Precision: 0.726
Overlap 1-gram 			Recall: 0.771
Overlap bi-gram 		F1: 0.092
Overlap bi-gram 		Precision: 0.077
Overlap bi-gram 		Recall: 0.114

embed
Overlap 1-gram 			F1: 0.766
Overlap 1-gram 			Precision: 0.803
Overlap 1-gram 			Recall: 0.732
Overlap bi-gram 		F1: 0.079
Overlap bi-gram 		Precision: 0.081
Overlap bi-gram 		Recall: 0.076
CPU times: user 1min 11s, sys: 3.08 s, total: 1min 14s
Wall time: 1min 17s


### ROUGE Score Across Documents

In [9]:
# Settings for the multi-document evaluation: summary size passed to the
# scorer and how many articles (taken from the start of the dataset) to score.
k = 5
num_articles = 20

# Take the same leading slice from each of the three parallel lists so that
# article i still lines up with reference i and title i.
articles, references, titles = (
    news_text[:num_articles],
    news_refs[:num_articles],
    news_titles[:num_articles],
)

In [10]:
%%time
k = 5
num_articles = 20
rouge_mean, rouge_median, rouge_std = report_rouge_scores(articles, references, titles, k, methods=methods)

index =  ['1-gram F1', '1-gram Precision', '1-gram Recall', 'bi-gram F1', 'bi-gram Precision', 'bi-gram Recall', 
          'longest common F1', 'longest common Precision', 'longest common Recall']

print('=' * 22 + ' Mean ' + '=' * 22)
rouge_mean.index = index
display(rouge_mean)

print('=' * 21 + ' Median ' + '=' * 21)
rouge_median.index = index
display(rouge_median)

print('=' * 15 + ' Standard Deviation ' + '=' * 15)
rouge_std.index = index
display(rouge_std)



Unnamed: 0,SMRS,tfidf,embed
1-gram F1,0.179505,0.263315,0.218495
1-gram Precision,0.281264,0.274434,0.358688
1-gram Recall,0.144435,0.279623,0.171098
bi-gram F1,0.087882,0.140143,0.108683
bi-gram Precision,0.142786,0.144967,0.178318
bi-gram Recall,0.069078,0.152341,0.085845
longest common F1,0.152074,0.232628,0.178858
longest common Precision,0.273157,0.259572,0.34212
longest common Recall,0.140186,0.264018,0.164711




Unnamed: 0,SMRS,tfidf,embed
1-gram F1,0.141038,0.278523,0.187397
1-gram Precision,0.211939,0.246212,0.321092
1-gram Recall,0.103632,0.312274,0.146295
bi-gram F1,0.034195,0.138591,0.06456
bi-gram Precision,0.078869,0.109548,0.122208
bi-gram Recall,0.027418,0.168254,0.041008
longest common F1,0.107183,0.231345,0.139431
longest common Precision,0.197094,0.221801,0.316627
longest common Recall,0.098825,0.296142,0.130337




Unnamed: 0,SMRS,tfidf,embed
1-gram F1,0.130616,0.11244,0.122054
1-gram Precision,0.186114,0.128875,0.158054
1-gram Recall,0.113721,0.123918,0.110763
bi-gram F1,0.109928,0.119751,0.12237
bi-gram Precision,0.172404,0.134194,0.163141
bi-gram Recall,0.08887,0.13177,0.103271
longest common F1,0.126271,0.114007,0.124809
longest common Precision,0.189033,0.13258,0.152103
longest common Recall,0.114081,0.124489,0.112052


CPU times: user 2min 43s, sys: 8.74 s, total: 2min 52s
Wall time: 3min 51s


### Word Embedding ROUGE Score Across Documents

In [None]:
%%time
rouge_mean_embed, rouge_median_embed, rouge_std_embed = report_rouge_scores(articles, references, titles, k, 
                                                                            rouge_embed=True, methods=methods)
index =  ['1-gram F1', '1-gram Precision', '1-gram Recall', 'bi-gram F1', 'bi-gram Precision', 'bi-gram Recall']

print('=' * 22 + ' Mean ' + '=' * 22)
rouge_mean_embed.index = index
display(rouge_mean_embed)

print('=' * 21 + ' Median ' + '=' * 21)
rouge_median_embed.index = index
display(rouge_median_embed)

print('=' * 15 + ' Standard Deviation ' + '=' * 15)
rouge_std_embed.index = index
display(rouge_std_embed)