Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
  • 2 commits
  • 4 files changed
  • 0 commit comments
  • 1 contributor
View
2  src/breadability/document.py
@@ -1,3 +1,5 @@
+"""Generate a clean nice starting html document to process for an article."""
+
import chardet
import logging
import re
View
39 src/breadability/readable.py
@@ -12,6 +12,7 @@
from breadability.scoring import is_unlikely_node
from breadability.utils import cached_property
+
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True,
@@ -104,7 +105,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
# Give a bonus if sibling nodes and top candidates have the example
- # same classname
+ # same class name
if candidate_css and sibling.get('class') == candidate_css:
content_bonus += candidate_node.content_score * 0.2
@@ -148,9 +149,8 @@ def prep_article(doc):
- extra tags
"""
-
def clean_document(node):
- """Remove the style attribute on every element."""
+ """Clean up the final document we return as the readable article"""
clean_list = ['object', 'h1']
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
@@ -159,9 +159,10 @@ def clean_document(node):
if len(node.findall('.//h2')) == 1:
clean_list.append('h2')
- for n in node.getiterator():
+ for n in node.iter():
        # clean out any inline style properties
- n.set('style', '')
+ if 'style' in n.attrib:
+ n.set('style', '')
# remove all of the following tags
# Clean a node of all elements of type "tag".
@@ -188,7 +189,9 @@ def clean_document(node):
# if the heading has no css weight or a high link density,
# remove it
if get_class_weight(n) < 0 or get_link_density(n) > .33:
- n.drop_tree()
+ # for some reason we get nodes here without a parent
+ if n.getparent() is not None:
+ n.drop_tree()
# clean out extra <p>
if n.tag == 'p':
@@ -202,36 +205,26 @@ def clean_document(node):
def clean_conditionally(doc, clean_el):
"""Remove the clean_el if it looks like bad content based on rules."""
- def clean_objects():
- pass
-
doc = clean_document(doc)
return doc
-def process(doc):
- """Process this doc to make it readable.
+def find_candidates(doc):
+    """Find candidate nodes for the readable version of the article.
    Here we're going to remove unlikely nodes, find scores on the rest, and
clean up and return the final best match.
"""
- unlikely = []
scorable_node_tags = ['p', 'td', 'pre']
nodes_to_score = []
- for node in doc.getiterator():
+ for node in doc.iter():
if is_unlikely_node(node):
- unlikely.append(node)
-
- if node.tag in scorable_node_tags:
+ node.drop_tree()
+ elif node.tag in scorable_node_tags:
nodes_to_score.append(node)
-
- # process our clean up instructions
- [n.drop_tree() for n in unlikely]
-
- candidates = score_candidates(nodes_to_score)
- return candidates
+ return score_candidates(nodes_to_score)
class Article(object):
@@ -254,7 +247,7 @@ def readable(self):
html_cleaner(doc)
doc = drop_tag(doc, 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
- candidates = process(doc)
+ candidates = find_candidates(doc)
if candidates:
# right now we return the highest scoring candidate content
View
8 src/breadability/scoring.py
@@ -1,3 +1,6 @@
+"""Handle dealing with scoring nodes and content for our parsing."""
+import re
+
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = set([
@@ -32,7 +35,7 @@ def get_link_density(node):
:returns float:
"""
- link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
+ link_length = sum([len(a.text_content()) or 0 for a in node.findall(".//a")])
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
@@ -152,6 +155,3 @@ def __init__(self, node):
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
-
-
-
View
42 src/breadability/tests/test_scoring.py
@@ -0,0 +1,42 @@
+from lxml.html import fragment_fromstring
+from unittest import TestCase
+
+from breadability.scoring import check_node_attr
+
+
+class TestCheckNodeAttr(TestCase):
+ """Verify a node has a class/id in the given set.
+
+ The idea is that we have sets of known good/bad ids and classes and need
+ to verify the given node does/doesn't have those classes/ids.
+
+ """
+ def test_has_class(self):
+ """Verify that a node has a class in our set."""
+ test_set = set(['test1', 'test2'])
+ test_node = fragment_fromstring('<div/>')
+ test_node.set('class', 'test2 comment')
+
+ self.assertTrue(check_node_attr(test_node, 'class', test_set))
+
+ def test_has_id(self):
+ """Verify that a node has an id in our set."""
+ test_set = set(['test1', 'test2'])
+ test_node = fragment_fromstring('<div/>')
+ test_node.set('id', 'test2')
+
+ self.assertTrue(check_node_attr(test_node, 'id', test_set))
+
+ def test_lacks_class(self):
+ """Verify that a node does not have a class in our set."""
+ test_set = set(['test1', 'test2'])
+ test_node = fragment_fromstring('<div/>')
+ test_node.set('class', 'test4 comment')
+ self.assertFalse(check_node_attr(test_node, 'class', test_set))
+
+ def test_lacks_id(self):
+ """Verify that a node does not have an id in our set."""
+ test_set = set(['test1', 'test2'])
+ test_node = fragment_fromstring('<div/>')
+ test_node.set('id', 'test4')
+ self.assertFalse(check_node_attr(test_node, 'id', test_set))

No commit comments for this range

Something went wrong with that request. Please try again.