Profile and adjust for performance, add bugfix to parse out mitechie blog post
mitechie committed May 6, 2012
1 parent 6b16b7b commit e7873d3
Showing 3 changed files with 21 additions and 22 deletions.
2 changes: 2 additions & 0 deletions src/breadability/document.py
@@ -1,3 +1,5 @@
"""Generate a clean nice starting html document to process for an article."""

import chardet
import logging
import re
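Reviewer note: the new chardet import isn't exercised in the lines shown, but chardet's usual job is guessing the encoding of raw bytes before building a document. A minimal sketch of that standard pattern; the sample bytes and the utf-8 fallback are invented for illustration:

    import chardet

    raw = b'<html><body>\xe2\x80\x9cquoted\xe2\x80\x9d text</body></html>'  # invented sample
    guess = chardet.detect(raw)  # e.g. {'encoding': 'utf-8', 'confidence': 0.87, ...}
    text = raw.decode(guess['encoding'] or 'utf-8', errors='replace')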
33 changes: 15 additions & 18 deletions src/breadability/readable.py
@@ -12,6 +12,7 @@
 from breadability.scoring import is_unlikely_node
 from breadability.utils import cached_property
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
                        page_structure=False, processing_instructions=True,
@@ -149,7 +150,7 @@ def prep_article(doc):
"""
def clean_document(node):
"""Remove the style attribute on every element."""
"""Clean up the final document we return as the readable article"""
clean_list = ['object', 'h1']
keep_keywords = ['youtube', 'blip.tv', 'vimeo']

@@ -158,9 +159,10 @@ def clean_document(node):
         if len(node.findall('.//h2')) == 1:
             clean_list.append('h2')
 
-        for n in node.getiterator():
+        for n in node.iter():
             # clean out any inline style properties
-            n.set('style', '')
+            if 'style' in n.attrib:
+                n.set('style', '')
 
             # remove all of the following tags
             # Clean a node of all elements of type "tag".
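Two things change in this hunk: node.iter() replaces the long-deprecated getiterator() alias, and the style wipe now only touches elements that already carry a style attribute, since an unconditional set('style', '') stamps an empty style="" onto every element in the tree. A standalone sketch of the guarded version (the HTML fragment is invented):

    from lxml.html import fragment_fromstring

    frag = fragment_fromstring('<div><p style="color:red">styled</p><p>plain</p></div>')
    for n in frag.iter():
        if 'style' in n.attrib:   # leave attribute-less elements untouched
            n.set('style', '')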
@@ -187,7 +189,9 @@ def clean_document(node):
             # if the heading has no css weight or a high link density,
             # remove it
             if get_class_weight(n) < 0 or get_link_density(n) > .33:
-                n.drop_tree()
+                # for some reason we get nodes here without a parent
+                if n.getparent() is not None:
+                    n.drop_tree()
 
             # clean out extra <p>
             if n.tag == 'p':
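The getparent() guard matters because lxml's drop_tree() splices the dropped element's tail text back into the surrounding tree, so it assumes a parent exists; on a detached or root element it fails (an AssertionError in the lxml versions I'm aware of). A minimal reproduction, with an invented fragment:

    from lxml.html import fragment_fromstring

    frag = fragment_fromstring('<div><p>child</p></div>')
    frag.find('p').drop_tree()   # fine: <p> has a parent
    frag.drop_tree()             # fails: the root <div> has no parent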
@@ -205,29 +209,22 @@ def clean_conditionally(doc, clean_el):
     return doc
 
 
-def process(doc):
-    """Process this doc to make it readable.
+def find_candidates(doc):
+    """Find candidate nodes for the readable version of the article.
 
     Here we're going to remove unlikely nodes, find scores on the rest, and
     clean up and return the final best match.
     """
-    unlikely = []
     scorable_node_tags = ['p', 'td', 'pre']
     nodes_to_score = []
 
-    for node in doc.getiterator():
+    for node in doc.iter():
         if is_unlikely_node(node):
-            unlikely.append(node)
-
-        if node.tag in scorable_node_tags:
+            node.drop_tree()
+        elif node.tag in scorable_node_tags:
             nodes_to_score.append(node)
 
-    # process our clean up instructions
-    [n.drop_tree() for n in unlikely]
-
-    candidates = score_candidates(nodes_to_score)
-    return candidates
+    return score_candidates(nodes_to_score)
 
 
 class Article(object):
@@ -250,7 +247,7 @@ def readable(self):
         html_cleaner(doc)
         doc = drop_tag(doc, 'noscript')
         doc = transform_misused_divs_into_paragraphs(doc)
-        candidates = process(doc)
+        candidates = find_candidates(doc)
 
         if candidates:
             # right now we return the highest scoring candidate content
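The lines truncated after `if candidates:` pick the "highest scoring candidate content". The shape of the container returned by score_candidates isn't visible in this diff; assuming it maps nodes to scored wrappers exposing content_score (as the scoring.py tail below suggests), selection might look like this hypothetical sketch:

    # Hypothetical: not this repo's actual code.
    def best_candidate(candidates):
        """Pick the scored node with the highest content_score."""
        if not candidates:
            return None
        return max(candidates.values(), key=lambda c: c.content_score)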
8 changes: 4 additions & 4 deletions src/breadability/scoring.py
@@ -1,3 +1,6 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re

# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = set([
@@ -32,7 +35,7 @@ def get_link_density(node):
     :returns float:
     """
-    link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
+    link_length = sum([len(a.text_content()) or 0 for a in node.findall(".//a")])
     text_length = len(node.text_content())
     return float(link_length) / max(text_length, 1)
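The actual bugfix: a.text only yields text sitting directly inside the <a> tag, so anchors whose text is wrapped in child markup (say <a><b>label</b></a>) contributed zero length and deflated the link density. text_content() collects text from all descendants. (The `or 0` on the new line is harmless but redundant, since len() never returns None.) A quick illustration with an invented fragment:

    from lxml.html import fragment_fromstring

    frag = fragment_fromstring('<div><a href="#"><b>bold link</b></a> and plain text</div>')
    a = frag.find('.//a')
    print(a.text)            # None -- the visible text lives inside <b>
    print(a.text_content())  # 'bold link'
    density = len(a.text_content()) / max(len(frag.text_content()), 1)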

@@ -152,6 +155,3 @@ def __init__(self, node):
             content_score = -5
         content_score += get_class_weight(node)
         self.content_score = content_score
-
-
-
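For context on the get_class_weight call above: breadability follows the Arc90 Readability family, where class-weight scoring typically nudges a node's score up or down based on positive and negative keywords in its class and id attributes. This repo's actual implementation is cut off by the diff view; the sketch below is the generic heuristic, not breadability's exact code:

    # Generic Readability-style class weighting -- an assumption, not this repo's code.
    import re

    POSITIVE = re.compile(r'article|body|content|entry|main|page|post|text', re.I)
    NEGATIVE = re.compile(r'comment|footer|header|menu|nav|sidebar|sponsor', re.I)

    def class_weight(node):
        weight = 0
        for attr in ('class', 'id'):
            value = node.get(attr) or ''
            if NEGATIVE.search(value):
                weight -= 25
            if POSITIVE.search(value):
                weight += 25
        return weight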