Permalink
Browse files

Work on tweaking out parser algorithm to help find the right candidate: fixes #2
  • Loading branch information...
1 parent b78ea49 commit 9e6835bd922a36e73a2ee049a84c74ec31329a38 @mitechie mitechie committed May 7, 2012
Showing with 42 additions and 37 deletions.
  1. +21 −14 src/breadability/readable.py
  2. +21 −23 src/breadability/scoring.py
View
35 src/breadability/readable.py
@@ -223,7 +223,11 @@ def clean_document(node):
continue
# finally try out the conditional cleaning of the target node
- clean_conditionally(n)
+ if clean_conditionally(n):
+ # For some reason the parent is none so we can't drop, we're
+ # not in a tree that can take dropping this node.
+ if n.getparent() is not None:
+ n.drop_tree()
return node
@@ -242,7 +246,7 @@ def clean_conditionally(node):
if (weight + content_score < 0):
LOG.debug('Dropping conditional node: ' + str(node))
- node.drop_tree()
+ return True
if node.text_content().count(',') < 10:
LOG.debug("There aren't 10 ,s so we're processing more")
@@ -269,7 +273,7 @@ def clean_conditionally(node):
# this one has shown to do some extra image removals.
# we could get around this by checking for caption info in the
# images to try to do some scoring of good v. bad images.
- # failing example:
+ # failing example:
# arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LOG.debug('Conditional drop: img > p')
remove_node = True
@@ -291,12 +295,10 @@ def clean_conditionally(node):
elif (embed == 1 and content_length < 75) or embed > 1:
LOG.debug('Conditional drop: embed without much content or many embed')
remove_node = True
+ return remove_node
- if remove_node:
- # For some reason the parent is none so we can't drop, we're
- # not in a tree that can take dropping this node.
- if node.getparent() is not None:
- node.drop_tree()
+ # nope, don't remove anything
+ return False
doc = clean_document(doc)
return doc
@@ -309,16 +311,18 @@ def find_candidates(doc):
clean up and return the final best match.
"""
- scorable_node_tags = ['p', 'td', 'pre']
+ scorable_node_tags = ['div', 'p', 'td', 'pre']
nodes_to_score = []
+ should_remove = []
for node in doc.iter():
if is_unlikely_node(node):
- LOG.debug('Dropping unlikely: ' + str(node))
- node.drop_tree()
- elif node.tag in scorable_node_tags:
+ LOG.debug('We should drop unlikely: ' + str(node))
+ should_remove.append(node)
+ continue
+ if node.tag in scorable_node_tags:
nodes_to_score.append(node)
- return score_candidates(nodes_to_score)
+ return score_candidates(nodes_to_score), should_remove
class Article(object):
@@ -342,7 +346,7 @@ def readable(self):
html_cleaner(doc)
doc = drop_tag(doc, 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
- candidates = find_candidates(doc)
+ candidates, should_drop = find_candidates(doc)
if candidates:
LOG.debug('Candidates found:')
@@ -364,6 +368,9 @@ def readable(self):
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
+ # since we've not found a good candidate, we should help this
+ # cleanup by removing the should_drop nodes we spotted.
+ [n.drop_tree() for n in should_drop]
doc = prep_article(doc)
doc = build_base_document(doc)
View
44 src/breadability/scoring.py
@@ -1,28 +1,25 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
+from breadability.logconfig import LOG
+
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
-CLS_UNLIKELY = set([
- 'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
- 'menu', '' 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
- 'agegate', 'pagination' '', 'pager', 'popup', 'tweet', 'twitter',
-])
-CLS_MAYBE = set([
- 'and', 'article', 'body', 'column', 'main', 'shadow',
-])
-CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
- 'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
-CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
- 'footer', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
- 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
- 'tool', 'widget'])
+CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
+ 'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
+ 'pager|popup|tweet|twitter'), re.I)
+CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
+CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
+ 'page|pagination|post|text|blog|story'), re.I)
+CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
+ 'footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
+ 'sidebar|sponsor|shopping|tags|tool|widget'), re.I)
def check_node_attr(node, attr, checkset):
- attr = node.get(attr) or ""
- check = set(attr.lower().split(' '))
- if check.intersection(checkset):
+ value = node.get(attr) or ""
+ check = checkset.search(value)
+ if check:
return True
else:
return False
@@ -88,13 +85,15 @@ def score_candidates(nodes):
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
- innertext = node.text
+ innertext = node.text_content()
if parent is None or grand is None:
+ LOG.debug("Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
+ LOG.debug("Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@@ -116,11 +115,10 @@ def score_candidates(nodes):
length_points = len(innertext) % 100 if innertext else 0
content_score = length_points if length_points > 3 else 3
- # Add the score to the parent. The grandparent gets half. */
- if parent is not None:
- candidates[parent].content_score += content_score
- if grand is not None:
- candidates[grand].content_score += content_score
+ # Add the score to the parent.
+ candidates[parent].content_score += content_score
+ # The grandparent gets half.
+ candidates[grand].content_score += content_score / 2.0
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -

0 comments on commit 9e6835b

Please sign in to comment.