# Cornell Newsroom Summarization Dataset

- Website: https://lil.nlp.cornell.edu/newsroom/index.html
- Paper: https://aclanthology.org/N18-1065/
- Repo: https://github.com/lil-lab/newsroom (how to read the dataset is explained in the readme)



| Name | Value |
| --- | --- |
| Dataset Size | 1,321,995 articles |
| Training Set Size | 995,041 articles |
| Mean Article Length | 658.6 words |
| Mean Summary Length | 26.7 words |
| Total Vocabulary Size | 6,925,712 words |
| Occurring 10+ Times | 784,884 words |


In [20]:
from pathlib import Path
import pandas as pd
import json
from newsroom import jsonl


def convert_newsroom_dataset(input_path, output_dir, output_prefix: str, max_rows_per_file:int=400000, compression:str="snappy"):
    part = 0
    text, summary, provenance = [], [], []
    empty_summaries = 0

    def write_part(with_part_postfix: bool):
        if with_part_postfix:
            fn = f'{output_prefix}_{part:05d}.{compression}.parquet'
        else:
            fn = f'{output_prefix}.{compression}.parquet'
        
        fn = output_dir / fn
        text_, summary_, provenance_ = map(lambda x: pd.array(x, dtype="string"), (text, summary, provenance))
        df = pd.DataFrame({"text": text_, "summary": summary_, "provenance": provenance_})
        print(f'writing: {fn} (rows: {len(df)}; empty summaries: {empty_summaries} (skipped))')
        df.to_parquet(
            fn, 
            engine="pyarrow",
            compression=compression
        )

    with jsonl.open(str(input_path), gzip = True) as f:
        for entry in f:
            
            s = entry['summary']
            if s is None or len(s.strip()) == 0:
                empty_summaries += 1
                continue

            text.append(entry['text'])
            summary.append(s)
            provenance.append(json.dumps({ 'src': output_prefix, 'url': entry['archive']}))
            if max_rows_per_file is not None and max_rows_per_file > 0 and len(text) > max_rows_per_file:
                write_part(True)
                part += 1
                text, summary, provenance = [], [], []

    if len(text) > 0:
        write_part(part > 0)

dataset_dir = Path('../newsroom-thin')
output_dir = Path('./data/newsroom/')
dataset_file_names = list(dataset_dir.glob('*.dataset'))
output_prefixes = map(lambda fn: 'newsroom_' + fn.stem, dataset_file_names)

#dataset_file_names = ['dev.dataset', 'test.dataset', 'train.dataset']
#output_prefixes = ['newsroom_dev', 'newsroom_test', 'newsroom_train']


In [21]:
# run conversion:
output_dir.mkdir(parents=True, exist_ok=True)
for fn, prefix in zip(dataset_file_names, output_prefixes):
    convert_newsroom_dataset(fn, output_dir, prefix)

writing: data/newsroom/newsroom_dev.snappy.parquet (rows: 108599; empty summaries: 185 (skipped))
writing: data/newsroom/newsroom_test.snappy.parquet (rows: 108670; empty summaries: 152 (skipped))


In [19]:
# check files in output dir

def mean_strlen(col: pd.Series):
    return col.apply(len).mean()

def count_words(s):
    return sum(1 for w in s.split(' ') if len(w) > 0)

def mean_wordcount(col: pd.Series):
    return col.apply(count_words).mean()


for fn in output_dir.glob('*.parquet'):
    df = pd.read_parquet(path=str(fn), engine='pyarrow')
    text, summary = df["text"], df["summary"]
    print(f'file: "{fn.name}"; rows: {len(df)}; mean_wordcount: {{ text: {mean_wordcount(text):.2f}; summary: {mean_wordcount(summary):.2f}; }}; mean_strlen: {{ text: {mean_strlen(text):.2f}; summary: {mean_strlen(summary):.2f} }};')

#     df = pd.read_parquet(path=str(fn), engine='pyarrow')
#     print('rows:', len(df))
#     print(df.head(2))


file: "newsroom_dev.snappy.parquet"; rows: 108599; mean_wordcount: { text: 612.10; summary: 26.93; }; mean_strlen: { text: 3764.11; summary: 163.75 };
file: "newsroom_test.snappy.parquet"; rows: 108670; mean_wordcount: { text: 615.35; summary: 26.85; }; mean_strlen: { text: 3784.15; summary: 163.29 };
