In [1]:
from readability.readability import Unparseable
from readability.readability import Document as Paper

def html(self, fileids=None, categories=None):
    """
    Generate a cleaned HTML summary for each document in the corpus.

    Every document produced by ``self.docs(fileids, categories)`` is wrapped
    in a readability-lxml document (imported as ``Paper``) and the result of
    its ``summary()`` call is yielded. A document that raises ``Unparseable``
    is reported on standard output and skipped.
    """
    documents = self.docs(fileids, categories)
    for raw in documents:
        try:
            yield Paper(raw).summary()
        except Unparseable as err:
            # Report the failure; the loop then moves on to the next document.
            print("Could not parse HTML: {}".format(err))

In [2]:
import logging
# Only let messages of WARNING severity or above through the logger
# named "readability.readability".
log = logging.getLogger("readability.readability")
log.setLevel(logging.WARNING)

In [3]:
import bs4

# HTML element names whose text is extracted as a paragraph:
# headings first, then body paragraphs and list items.
tags = [
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7',
    'p',
    'li',
]

def paras(self, fileids=None, categories=None):
    """
    Uses BeautifulSoup to parse the paragraphs from the HTML.

    Each HTML string produced by ``self.html(fileids, categories)`` is parsed
    with the ``lxml`` parser, and the ``.text`` of every element matching the
    wanted tag names is yielded in the order ``find_all`` returns them.

    The tag names are taken from ``self.tags`` when the instance defines
    that attribute, so a reader constructed with a custom tag list is
    honoured; otherwise the module-level ``tags`` list is used.
    """
    try:
        wanted = self.tags
    except AttributeError:
        # No per-instance configuration: use the module-level default.
        wanted = tags

    for markup in self.html(fileids, categories):
        soup = bs4.BeautifulSoup(markup, 'lxml')
        try:
            for element in soup.find_all(wanted):
                yield element.text
        finally:
            # Release the parse tree even when the consumer stops iterating
            # early or an error interrupts the loop.
            soup.decompose()

In [4]:
from nltk import sent_tokenize

def sents(self, fileids=None, categories=None):
    """
    Generate the sentences of the corpus one at a time.

    Every paragraph from ``self.paras(fileids, categories)`` is split with
    NLTK's ``sent_tokenize`` and the resulting sentences are yielded in
    order. Any HTML parsing happens inside ``self.paras``.
    """
    for para in self.paras(fileids, categories):
        sentences = sent_tokenize(para)
        for sent in sentences:
            yield sent

In [5]:
from nltk import wordpunct_tokenize

def words(self, fileids=None, categories=None):
    """
    Generate the tokens of the corpus one at a time.

    Every sentence from ``self.sents(fileids, categories)`` is split with
    NLTK's ``wordpunct_tokenize`` and the resulting tokens are yielded in
    order. Any HTML parsing happens further down, in the methods that
    ``self.sents`` relies on.
    """
    for sent in self.sents(fileids, categories):
        pieces = wordpunct_tokenize(sent)
        for piece in pieces:
            yield piece

In [6]:
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.

    For every paragraph from ``self.paras`` one list is yielded. That list
    has one entry per sentence found by ``sent_tokenize``, and each entry is
    the result of ``pos_tag`` applied to the sentence's
    ``wordpunct_tokenize`` tokens.

    Both ``fileids`` and ``categories`` are forwarded to ``self.paras`` so
    that a selection by category is respected.
    """
    for paragraph in self.paras(fileids=fileids, categories=categories):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]

In [7]:
import time
import nltk

def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and
    returns a dictionary with a variety of metrics
    concerning the state of the corpus.

    Each paragraph from ``self.paras(fileids, categories)`` is iterated as a
    sequence of sentences, and each sentence as a sequence of
    ``(word, tag)`` pairs.

    The returned dictionary has the keys:

    - ``files``:  number of resolved fileids (all fileids if none resolve)
    - ``topics``: number of categories for the resolved fileids
    - ``paras``, ``sents``, ``words``: running totals from the pass
    - ``vocab``:  number of distinct words seen
    - ``lexdiv``: words / vocab
    - ``ppdoc``:  paras / files
    - ``sppar``:  sents / paras
    - ``secs``:   wall-clock seconds spent in this call

    A ratio whose denominator is zero (for example on an empty selection)
    is reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    started = time.time()

    # Running totals; only the vocabulary size is needed, so a set suffices.
    n_paras = 0
    n_sents = 0
    n_words = 0
    vocab = set()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        n_paras += 1

        for sent in para:
            n_sents += 1

            for word, tag in sent:
                n_words += 1
                vocab.add(word)

    # Compute the number of files and categories in the corpus,
    # resolving the selection only once.
    resolved = self.resolve(fileids, categories)
    n_fileids = len(resolved or self.fileids())
    n_topics = len(self.categories(resolved))

    def ratio(numerator, denominator):
        # Guard against empty corpora / selections.
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    # Return data structure with information
    return {
        'files':  n_fileids,
        'topics': n_topics,
        'paras':  n_paras,
        'sents':  n_sents,
        'words':  n_words,
        'vocab':  len(vocab),
        'lexdiv': ratio(n_words, len(vocab)),
        'ppdoc':  ratio(n_paras, n_fileids),
        'sppar':  ratio(n_sents, n_paras),
        'secs':   time.time() - started,
    }

In [8]:
import os

class Preprocessor(object):
    """
    Pairs a corpus reader (such as an `HTMLCorpusReader`) with a target
    directory, and maps corpus fileids to the pickle paths under that
    target where preprocessed output is meant to be stored.
    """
    def __init__(self, corpus, target=None, **kwargs):
        # Additional keyword arguments are accepted but not used here.
        self.target = target
        self.corpus = corpus

    def fileids(self, fileids=None, categories=None):
        """
        Resolve ``fileids``/``categories`` through the wrapped corpus; when
        that yields nothing truthy, return every fileid in the corpus.
        """
        resolved = self.corpus.resolve(fileids, categories)
        return resolved if resolved else self.corpus.fileids()

    def abspath(self, fileid):
        """
        Return the normalized path of the pickle for ``fileid``: the same
        directory layout as in the corpus, rooted at ``self.target``, with
        the file extension replaced by ``.pickle``.
        """
        # Directory of the source file, expressed relative to the corpus root.
        source_dir = os.path.dirname(self.corpus.abspath(fileid))
        relative_dir = os.path.relpath(source_dir, self.corpus.root)

        # File name without its extension.
        stem = os.path.splitext(os.path.basename(fileid))[0]

        pickled = os.path.join(self.target, relative_dir, stem + '.pickle')
        return os.path.normpath(pickled)

In [9]:
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def tokenize(self, fileid):
    """
    Segment, tokenize and tag the paragraphs of a single corpus file.

    For each paragraph from ``self.corpus.paras(fileids=fileid)`` one list
    is yielded, holding for every sentence the result of ``pos_tag`` applied
    to the sentence's ``wordpunct_tokenize`` tokens.
    """
    for para in self.corpus.paras(fileids=fileid):
        tagged_sents = []
        for sent in sent_tokenize(para):
            tagged_sents.append(pos_tag(wordpunct_tokenize(sent)))
        yield tagged_sents

In [10]:
import pickle

def process(self, fileid):
    """
    Preprocess one corpus file and store the result as a pickle.

    The output path comes from ``self.abspath(fileid)``; its parent
    directory is created when missing. The paragraphs produced by
    ``self.tokenize(fileid)`` are collected into a list and pickled to that
    path with the highest available protocol.

    Returns the path that was written. Raises ``ValueError`` when the
    parent of the output path exists but is not a directory.
    """
    outpath = self.abspath(fileid)
    outdir = os.path.dirname(outpath)

    if os.path.exists(outdir):
        # Something is already there: it has to be a directory.
        if not os.path.isdir(outdir):
            raise ValueError(
                "Please supply a directory to write preprocessed data to."
            )
    else:
        os.makedirs(outdir)

    # Tokenize fully before opening the file, so a tokenization failure
    # does not leave a partially written pickle behind.
    tagged_paras = list(self.tokenize(fileid))

    with open(outpath, 'wb') as sink:
        pickle.dump(tagged_paras, sink, pickle.HIGHEST_PROTOCOL)

    return outpath

In [11]:
def transform(self, fileids=None, categories=None):
    """
    Lazily preprocess the selected files, yielding each
    ``self.process(fileid)`` result in turn.

    The target directory is created (if it does not exist) when iteration
    starts, not when this method is called.
    """
    target_exists = os.path.exists(self.target)
    if not target_exists:
        os.makedirs(self.target)

    selected = self.fileids(fileids, categories)
    for current in selected:
        yield self.process(current)

In [12]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

# Default category pattern (used as ``cat_pattern`` when the caller passes no
# ``cat_*`` argument): captures a run of lowercase letters, underscores and
# whitespace that is followed by "/" and the rest of the fileid.
CAT_PATTERN = r'([a-z_\s]+)/.*'
# Default document pattern: a name that does not start with ".", made of
# lowercase letters, underscores and whitespace, then "/", then a lowercase
# hexadecimal file name ending in ".json".
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
# Default element names stored on the reader as ``tags``.
# NOTE(review): 'h7' does not look like a standard HTML heading - confirm it is intended.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Categorized corpus reader over raw HTML documents, intended as the
    input side of preprocessing.
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Set up the reader.

        Keyword arguments are handed to the ``CategorizedCorpusReader``
        constructor (this is where ``cat_pattern``, ``cat_map`` and
        ``cat_file`` go), while ``root``, ``fileids`` and ``encoding`` are
        handed to the ``CorpusReader`` constructor. ``tags`` is stored on
        the instance as ``self.tags``.
        """
        # Fall back to the default category pattern unless the caller
        # supplied some cat_* argument of their own.
        if all(not name.startswith('cat_') for name in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # Both base initializers are called explicitly; the categorized
        # reader receives the keyword dictionary itself as one argument.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Element names this reader is configured to extract.
        self.tags = tags

In [13]:
import pickle

# Default fileid pattern for pickled documents: same shape as the JSON
# document pattern (no leading ".", a lowercase/underscore/whitespace name,
# "/", a lowercase hexadecimal file name) but ending in ".pickle".
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'

class PickledCorpusReader(HTMLCorpusReader):
    """
    Corpus reader over documents that were stored as pickle files.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the reader. Keyword arguments go to the
        ``CategorizedCorpusReader`` constructor; ``root`` and ``fileids`` go
        to the ``CorpusReader`` constructor. The ``HTMLCorpusReader``
        initializer is not invoked.
        """
        # Use the default category pattern when no cat_* argument is given.
        if all(not name.startswith('cat_') for name in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def docs(self, fileids=None, categories=None):
        """
        Yield the unpickled content of each selected file, loading one
        document at a time.
        """
        selected = self.resolve(fileids, categories)
        for location in self.abspaths(selected):
            # NOTE: unpickling can execute arbitrary code; only read corpus
            # files that come from a trusted source.
            with open(location, 'rb') as handle:
                yield pickle.load(handle)

In [14]:
def paras(self, fileids=None, categories=None):
    """
    Yield every paragraph of every document from
    ``self.docs(fileids, categories)``, in document order.
    """
    for document in self.docs(fileids, categories):
        for paragraph in document:
            yield paragraph

In [15]:
def sents(self, fileids=None, categories=None):
    """
    Yield every sentence of every paragraph from
    ``self.paras(fileids, categories)``, in order.
    """
    for paragraph in self.paras(fileids, categories):
        for sentence in paragraph:
            yield sentence

In [16]:
def tagged(self, fileids=None, categories=None):
    """
    Yield every item of every sentence from
    ``self.sents(fileids, categories)``, in order.
    """
    for sentence in self.sents(fileids, categories):
        for pair in sentence:
            yield pair

def words(self, fileids=None, categories=None):
    """
    Yield the first element of every item produced by
    ``self.tagged(fileids, categories)``.
    """
    for pair in self.tagged(fileids, categories):
        token = pair[0]
        yield token