# Dataset for generating TL;DR

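Mylo had already converted the dataset to parquet files with `text` and `summary` columns. This notebook adds a minimal `provenance` column (source name and DOI) to each row and writes the result as snappy-compressed parquet files.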

Website: https://zenodo.org/record/1168855#.Y4dUvDPMIUH

In [7]:
from pathlib import Path
import json
import pandas as pd

# --- configuration -----------------------------------------------------------
# Where the already-converted text+summary parquet files are read from.
input_dir = Path('../tldr-challenge/')
input_file_names = ['df1.parquet', 'df2.parquet']

# Output settings: codec, file-name prefix and destination directory.
compression = "snappy"
output_prefix = 'tldr-challenge'
output_dir = Path('./data/tldr-challenge/')

output_dir.mkdir(parents=True, exist_ok=True)

# The provenance record is the same for every row of every file,
# so it is serialised to JSON once, ahead of the loop.
provenance_json = json.dumps({'src': output_prefix, 'doi': '10.5281/zenodo.1043504'})

# --- conversion: one output part per input file ------------------------------
for part, in_name in enumerate(input_file_names):
    print('reading: ', in_name)
    df = pd.read_parquet(
        path=str(input_dir / in_name),
        engine='pyarrow',
        columns=['text', 'summary'],
    )
    n_rows = len(df)
    print(f'{n_rows} rows read.')

    # add minimal provenance info (one identical JSON string per row)
    df["provenance"] = pd.array([provenance_json] * n_rows, dtype="string")

    # Output parts are numbered by position in input_file_names, zero-padded to 5 digits.
    out_path = output_dir / f'{output_prefix}_{part:05d}.{compression}.parquet'
    print('writing: ', out_path)
    df.to_parquet(out_path, engine="pyarrow", compression=compression)
    print(f'{len(df)} rows written.')

    # Drop the reference so the frame can be reclaimed before the next file is read.
    df = None

reading:  df1.parquet
1542205 rows read.
writing:  data/tldr-challenge/tldr-challenge_00000.snappy.parquet
1542205 rows written.
reading:  df2.parquet
1542205 rows read.
writing:  data/tldr-challenge/tldr-challenge_00001.snappy.parquet
1542205 rows written.


In [8]:
# check files in output dir

def mean_strlen(col: pd.Series):
    """Return the mean of ``len(value)`` taken over every value in ``col``."""
    lengths = col.apply(len)
    return lengths.mean()

def count_words(s, sep=' '):
    """Count the non-empty tokens obtained by splitting ``s`` on ``sep``.

    Args:
        s: The string to count words in.
        sep: Delimiter passed to ``str.split``. The default is a single
            space character, so consecutive spaces do not produce extra
            words, but tabs and newlines do NOT separate words (e.g.
            ``"a\\nb"`` counts as one word). Pass ``sep=None`` to split on
            any run of whitespace instead.

    Returns:
        The number of non-empty tokens as an ``int``.
    """
    # Empty tokens appear when the delimiter occurs at the start/end of the
    # string or several times in a row; they are not counted as words.
    return sum(1 for token in s.split(sep) if token)

def mean_wordcount(col: pd.Series):
    """Return the mean of ``count_words`` applied to every value in ``col``."""
    per_row_counts = col.apply(count_words)
    return per_row_counts.mean()


# Report row count and mean word/character lengths for every parquet file
# in the output directory. Path.glob() yields entries in a filesystem-dependent
# order, so the paths are sorted to make the report order reproducible.
for fn in sorted(output_dir.glob('*.parquet')):
    df = pd.read_parquet(path=str(fn), engine='pyarrow')
    text, summary = df["text"], df["summary"]
    # Adjacent f-strings are concatenated into one single-line report per file.
    print(
        f'file: "{fn.name}"; rows: {len(df)}; '
        f'mean_wordcount: {{ text: {mean_wordcount(text):.2f}; summary: {mean_wordcount(summary):.2f}; }}; '
        f'mean_strlen: {{ text: {mean_strlen(text):.2f}; summary: {mean_strlen(summary):.2f} }};'
    )


file: "tldr-challenge_00000.snappy.parquet"; rows: 1542205; mean_wordcount: { text: 207.96; summary: 24.17; }; mean_strlen: { text: 1116.41; summary: 132.31 };
file: "tldr-challenge_00001.snappy.parquet"; rows: 1542205; mean_wordcount: { text: 219.33; summary: 27.98; }; mean_strlen: { text: 1170.94; summary: 151.76 };
