Skip to content

Commit

Permalink
Merge pull request #82 from alan-turing-institute/deal-with-nested-blocks
Browse files Browse the repository at this point in the history

Deal with nested blocks
  • Loading branch information
jemrobinson committed Aug 5, 2019
2 parents 06f2046 + fe38bda commit fd59b60
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 20 deletions.
5 changes: 4 additions & 1 deletion readabilipy/simple_tree.py
@@ -1,6 +1,6 @@
"""Turn input HTML into a cleaned parsed tree."""
from bs4 import BeautifulSoup
from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unwrap_elements, wrap_bare_text
from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unnest_paragraphs, unwrap_elements, wrap_bare_text


def simple_tree_from_html_string(html):
Expand Down Expand Up @@ -36,6 +36,9 @@ def simple_tree_from_html_string(html):
# Remove empty string elements
remove_empty_strings_and_elements(soup)

# Split out block-level elements illegally contained inside paragraphs
unnest_paragraphs(soup)

# Replace <br> and <hr> elements with paragraph breaks
# Must come after remove_empty_strings_and_elements so that consecutive <br>s can be identified
# Re-consolidates strings at the end, so must come before normalise_strings
Expand Down
36 changes: 30 additions & 6 deletions readabilipy/simplifiers/html.py
Expand Up @@ -152,6 +152,33 @@ def remove_empty_strings_and_elements(soup):
element.extract()


def unnest_paragraphs(soup):
    """Split out block-level elements illegally contained inside paragraphs.

    HTML forbids block-level content inside <p>. For each offending <p> we
    replace it by three elements: a new <p> holding everything before the
    nested block, the nested block itself, and a new <p> holding everything
    after it. The tree is modified in place.

    Args:
        soup: BeautifulSoup tree to rewrite in place.
    """
    # NB. the heading tags must be listed individually: a combined entry such
    # as "h1>-<h6" is not a tag name and would never match anything.
    illegal_elements = ["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset",
                        "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6",
                        "header", "hr", "li", "main", "nav", "noscript", "ol", "p", "pre", "section",
                        "table", "tfoot", "ul", "video"]
    for nested_type in illegal_elements:
        # Search for nested elements that need to be split out
        nested_elements = [e for e in soup.find_all("p") if e.find(nested_type)]
        while nested_elements:
            # Separate this element into the nested element, plus before and after
            elem_nested = nested_elements[0].find(nested_type)
            p_before = soup.new_tag("p")
            # previous_siblings yields nearest-first (reverse document order),
            # so reverse it to preserve the original order of the content
            for sibling in reversed(list(elem_nested.previous_siblings)):
                p_before.append(sibling)
            p_after = soup.new_tag("p")
            for sibling in list(elem_nested.next_siblings):
                p_after.append(sibling)
            # Replace element by before/nested/after.
            # NB. this is done in reverse order as we are adding after the current position
            nested_elements[0].insert_after(p_after)
            nested_elements[0].insert_after(elem_nested)
            nested_elements[0].insert_after(p_before)
            nested_elements[0].decompose()
            # Rerun search for nested elements now that we have rewritten the tree
            nested_elements = [e for e in soup.find_all("p") if e.find(nested_type)]


def insert_paragraph_breaks(soup):
"""Identify <br> and <hr> and split their parent element into multiple elements where appropriate."""
# Indicator which is used as a placeholder to mark paragraph breaks
Expand All @@ -167,7 +194,7 @@ def insert_paragraph_breaks(soup):

# If there's only one <br> then we replace it with a space
if len(br_element_chain) == 1:
br_element_chain[0].replace_with(" ")
br_element_chain[0].replace_with(' ')
# If there are multiple <br>s then replace them with BREAK_INDICATOR
else:
br_element_chain[0].replace_with(BREAK_INDICATOR)
Expand Down Expand Up @@ -199,11 +226,8 @@ def insert_paragraph_breaks(soup):
new_p_element = soup.new_tag("p")
new_p_element.string = text_fragment
parent_element.insert_after(new_p_element)
# Replace the parent string if it exists or add one if not
if parent_element.string:
parent_element.string.replace_with(text_fragments[0])
else:
parent_element.string = text_fragments[0]
# Replace this element by a navigable string containing the first text fragment
element.replace_with(NavigableString(text_fragments[0]))
# Otherwise we want to simply include all the text fragments as independent NavigableStrings (that will be wrapped later)
else:
# Iterate in reverse order as we are repeatedly adding new elements directly after the original one
Expand Down
18 changes: 9 additions & 9 deletions tests/checks.py
Expand Up @@ -5,8 +5,8 @@
from ..readabilipy.simple_json import extract_text_blocks_as_plain_text


def check_exact_html_output(test_fragment, expected_output=None):
"""Check that expected output is present when parsing HTML fragment."""
def get_normalised_html_output(test_fragment, expected_output=None):
"""Get normalised HTML output."""
if expected_output is None:
expected_output = test_fragment
article_json = simple_json_from_html_string(test_fragment)
Expand All @@ -16,18 +16,18 @@ def check_exact_html_output(test_fragment, expected_output=None):
normalised_result = strip_html_whitespace(content)
print("expectation:", normalised_expectation)
print("result:", normalised_result)
return (normalised_expectation, normalised_result)


def check_exact_html_output(test_fragment, expected_output=None):
    """Assert that parsing the HTML fragment produces exactly the expected output."""
    expectation, result = get_normalised_html_output(test_fragment, expected_output)
    assert expectation == result


def check_html_output_contains_text(test_fragment, expected_output=None):
    """Check that the expected output is contained in the parsed HTML fragment.

    Unlike check_exact_html_output, this only requires the normalised
    expectation to appear somewhere inside the normalised result, rather
    than matching it exactly.
    """
    # Delegate normalisation to the shared helper so both checkers stay in sync
    normalised_expectation, normalised_result = get_normalised_html_output(test_fragment, expected_output)
    assert normalised_expectation in normalised_result


Expand Down
34 changes: 30 additions & 4 deletions tests/test_weird_html.py
Expand Up @@ -67,7 +67,7 @@ def test_paragraph_splitting_with_unclosed_tags():
)


# Test nested superscript
# Test (possibly illegal) nested elements
def test_nested_superscript():
"""Ensure that nested superscripts are correctly parsed."""
check_exact_html_output(
Expand All @@ -76,16 +76,14 @@ def test_nested_superscript():
)


# Test linebreaks inside superscript
def test_linebreaks_inside_superscript():
def test_nested_linebreaks_inside_superscript():
    """Ensure that linebreaks inside superscript are correctly parsed."""
    fragment = "<p>Some text <sup>with<br/>superscripts</sup> that should be joined.</p>"
    expected = "<div><p>Some text ^with superscripts that should be joined.</p></div>"
    check_exact_html_output(fragment, expected)


# Test nested superscript with linebreaks
def test_nested_superscript_with_linebreaks():
"""Ensure that nested superscripts with linebreaks are correctly parsed."""
check_exact_html_output(
Expand All @@ -96,3 +94,31 @@ def test_nested_superscript_with_linebreaks():
</p>""",
"<div><p>Some text with linebreaks ^ ^around a footnote.</p></div>"
)


def test_nested_table_inside_paragraph():
    """Ensure that blocks (illegally) nested inside paragraphs are split out."""
    # The <table> is block-level and therefore illegal inside <p>; the
    # simplifier is expected to split the paragraph around it.
    # NOTE(review): internal whitespace of the fragment should be irrelevant
    # here, since the checker normalises both sides with strip_html_whitespace
    # before comparing — confirm against checks.py.
    check_exact_html_output(
        """
        <p>
        First paragraph.
        <br/><br/>
        <table>
        <tbody>
        <tr>
        <td>Table text.</td>
        </tr>
        </tbody>
        </table>
        Second paragraph.
        </p>""",
        "<div><p>First paragraph.</p><table><tbody><tr><td>Table text.</td></tr></tbody></table><p>Second paragraph.</p></div>"
    )


def test_nested_span_inside_paragraph():
    """Ensure that spans nested inside paragraphs are kept in."""
    # NB. removed a stray trailing "" that was implicitly concatenated onto
    # the fragment literal (harmless but a typo)
    check_exact_html_output(
        "<p>Some text <span>in a span</span> that should stay together.</p>",
        "<div><p>Some text in a span that should stay together.</p></div>"
    )

0 comments on commit fd59b60

Please sign in to comment.