In [7]:
# Initial imports
import pickle
from pathlib import Path
from typing import List, Dict

In [2]:
# globals
# Directory holding the raw '*.story' files (cnn/stories, relative to the
# current working directory).
SOURCES = Path('cnn', 'stories')

In [3]:
def load_corpus(a_path: Path) -> List[Dict]:
    """Load every ``*.story`` file directly inside ``a_path`` into memory.

    Files are visited in sorted path order so that the resulting list (and
    anything serialized from it) is reproducible across runs and file
    systems; ``Path.glob`` on its own does not guarantee any ordering.

    Args:
        a_path: Directory containing the ``*.story`` files. Sub-directories
            are not searched.

    Returns:
        One dictionary per story, as produced by ``load_story``. The list is
        empty when no file matches.
    """
    return [load_story(f_path) for f_path in sorted(a_path.glob('*.story'))]

def load_story(f_path: Path) -> Dict:
    """Load a story into a dictionary.

    A story has some lines, separated by empty lines, followed by highlights.
    Highlights are structured with a @highlight keyword, followed by some lines.
    Leading and trailing white space is stripped from every line and empty
    lines are dropped.

    Args:
        f_path: Path of the story file. It is read as UTF-8 text.

    Returns:
        A dictionary with three keys:
          * ``body`` - the non-empty lines that precede the first @highlight;
          * ``summaries`` - one list of lines per @highlight marker, in file
            order (a marker with no following lines yields an empty list);
          * ``id`` - the file name (``f_path.name``).
        A story without any body text simply yields an empty ``body`` list;
        no exception is raised for that case.
    """
    resp = dict(body=[], summaries=[], id=f_path.name)
    # Lines are collected into the body until the first @highlight marker and
    # afterwards into the most recently opened highlight.
    target = resp['body']
    # Explicit encoding so the result does not depend on the platform locale.
    # NOTE(review): assumes the story files are UTF-8 encoded - confirm
    # against the dataset.
    with f_path.open(encoding='utf-8') as fh:
        # Iterate lazily instead of materialising the file with readlines().
        for line in fh:
            # remove leading and trailing white space - i.e. assume it is not important
            line = line.strip()
            # skip empty lines
            if not line:
                continue
            if line == '@highlight':
                # Start a new highlight; subsequent lines belong to it.
                target = []
                resp['summaries'].append(target)
                continue
            target.append(line)
    return resp

In [5]:
# Parse every '*.story' file found in SOURCES and keep the whole corpus in memory.
corpus = load_corpus(SOURCES)

In [6]:
# Number of stories that were loaded.
len(corpus)

92579

In [8]:
# Persist the parsed corpus as 'corpus.pkl' in the parent directory of SOURCES.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with (SOURCES.parent / 'corpus.pkl').open('wb') as pkl_fh:
    pickle.dump(corpus, pkl_fh)