Speedups #688

Merged 2 commits on Feb 8, 2022
tests/test_doc_parser.py (154 additions, 0 deletions)
@@ -0,0 +1,154 @@
import json
import unittest
import warnings

from bs4 import BeautifulSoup
import httpretty
import requests

from canonicalwebteam.discourse_docs import DiscourseAPI
from webapp.doc_parser import FastDocParser

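# Simulated "cooked" HTML body of a Discourse index topic: a short preamble,
# a "Navigation" list, a "URLs" mapping table and a "Redirects" mapping table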
EXAMPLE_CONTENT = (
"<p>Some homepage content</p>"
"<h1>Navigation</h1>"
"<ul>"
'<li><a href="/t/page-a/10">Page A</a></li>'
'<li><a href="/t/b-page/12">B page</a></li>'
"</ul>"
"<h1>URLs</h1>"
'<details open="">'
"<summary>Mapping table</summary>"
'<div class="md-table">'
"<table>"
"<thead><tr>"
"<th>Topic</th><th>Path</th></tr></thead>"
"<tbody><tr>"
'<td><a href="https://discourse.example.com/t/'
'page-a/10">Page A</a></td>'
"<td>/a</td>"
"</tr><tr>"
'<td><a href="https://discourse.example.com/t/'
'page-z/26">Page Z</a></td>'
"<td>/page-z</td>"
"</tr></tbody></table>"
"</div></details>"
"<h1>Redirects</h1>"
'<details open="">'
"<summary>Mapping table</summary>"
'<div class="md-table">'
"<table>"
"<thead><tr>"
"<th>Topic</th><th>Path</th></tr></thead>"
"<tbody>"
"<tr><td>/redir-a</td><td>/a</td></tr>"
"<tr>"
" <td>/example/page</td>"
" <td>https://example.com/page</td>"
"</tr>"
"</tr></tbody></table>"
"</div></details>"
)


class TestFastDocParser(unittest.TestCase):
def setUp(self):
# Suppress annoying warnings from HTTPretty
# See: https://github.com/gabrielfalcao/HTTPretty/issues/368
warnings.filterwarnings(
"ignore", category=ResourceWarning, message="unclosed.*"
)

# Enable HTTPretty and set up mock URLs
httpretty.enable()
self.addCleanup(httpretty.disable)
self.addCleanup(httpretty.reset)
# Index page with navigation, URL map and redirects
httpretty.register_uri(
httpretty.GET,
"https://discourse.example.com/t/34.json",
body=json.dumps(
{
"id": 34,
"category_id": 2,
"title": "An index page",
"slug": "an-index-page",
"post_stream": {
"posts": [
{
"id": 3434,
"cooked": EXAMPLE_CONTENT,
"updated_at": "2018-10-02T12:45:44.259Z",
}
]
},
}
),
content_type="application/json",
)

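# A real DiscourseAPI client; its requests are answered by the
# HTTPretty mock registered above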
discourse_api = DiscourseAPI(
base_url="https://discourse.example.com/",
session=requests.Session(),
)

self.parser = FastDocParser(
api=discourse_api,
index_topic_id=34,
url_prefix="/",
)
self.parser.parse()

def test_index_has_no_nav(self):
index = self.parser.index_document
soup = BeautifulSoup(index["body_html"], features="lxml")

# Check body
self.assertEqual(soup.p.string, "Some homepage content")

# Check navigation
self.assertIsNone(soup.h1)

# Check URL map worked
self.assertIsNone(soup.details)
self.assertNotIn(
'<a href="/t/page-a/10">Page A</a>',
soup.decode_contents(),
)

def test_nav(self):
navigation = self.parser.navigation

self.assertIn(
'<li><a href="/t/b-page/12">B page</a></li>',
navigation,
)

self.assertIn('<a href="/a">Page A</a>', navigation)

def test_redirect_map(self):
self.assertEqual(
self.parser.redirect_map,
{"/redir-a": "/a", "/example/page": "https://example.com/page"},
)

self.assertEqual(self.parser.warnings, [])

def test_url_map(self):
self.assertEqual(
self.parser.url_map,
{
10: "/a",
26: "/page-z",
34: "/",
"/": 34,
"/a": 10,
"/page-z": 26,
},
)

self.assertEqual(self.parser.warnings, [])


if __name__ == "__main__":
unittest.main()
webapp/doc_parser.py (100 additions, 16 deletions)
@@ -1,7 +1,9 @@
# Standard library
import re
from urllib.parse import urlparse

# Packages
import dateutil.parser
from canonicalwebteam.discourse_docs import DocParser
from canonicalwebteam.discourse_docs.parsers import TOPIC_URL_MATCH
@@ -39,11 +41,10 @@ def parse(self):
self.index_document = self.parse_topic(
index_topic, topic_soup=raw_index_soup
)
index_soup = BeautifulSoup(
self.index_document["body_html"], features="lxml"
)
self.index_document["body_html"] = str(
self._get_preamble(index_soup, break_on_title="Navigation")
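# Reuse the soup that parse_topic already built (returned as
# "body_soup") instead of re-parsing the rendered HTML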
index_soup = self.index_document.pop("body_soup")

self.index_document["body_html"] = self._get_preamble(
index_soup, break_on_title="Navigation"
)

# Parse navigation
@@ -56,6 +57,91 @@
self._replace_links(raw_index_soup, topics)
)

def _parse_url_map(self, index_soup):
"""
Given the HTML soup of an index topic,
extract the URL mappings from the "URLs" section.

The URLs section should contain a table of
"Topic" to "Path" mappings
(extra markup around this table doesn't matter)
e.g.:

<h1>URLs</h1>
<details>
<summary>Mapping table</summary>
<table>
<tr><th>Topic</th><th>Path</th></tr>
<tr>
<td><a href="https://forum.example.com/t/page/10">Page</a></td>
<td>/cool-page</td>
</tr>
<tr>
<td>
<a href="https://forum.example.com/t/place/11">Place</a>
</td>
<td>/cool-place</td>
</tr>
</table>
</details>

This will typically be generated in Discourse from Markdown similar to
the following:

# URLs

[details=Mapping table]
| Topic | Path |
| -- | -- |
| https://forum.example.com/t/page/10 | /cool-page |
| https://forum.example.com/t/place/11 | /cool-place |

"""
url_soup = self._get_section(index_soup, "URLs", "details")

url_map = {}
warnings = []

if url_soup:
for row in url_soup.tbody("tr"):
row_contents = row("td")
topic = row_contents[0]
path_td = row_contents[-1]
topic_a = topic.a

if not topic_a or not path_td:
warnings.append(f"Could not parse URL map item {row}")
continue

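# Extract the topic ID from the topic link's URL path (e.g. /t/page/10)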
topic_url = topic_a.get("href", "")
topic_path = urlparse(topic_url).path
topic_match = TOPIC_URL_MATCH.match(topic_path)

pretty_path = path_td.text

if not topic_match or not pretty_path.startswith(
self.url_prefix
):
warnings.append(f"Could not parse URL map item {row}")
continue

topic_id = int(topic_match.groupdict()["topic_id"])

url_map[pretty_path] = topic_id

# Add the reverse mappings as well, for efficiency
ids_to_paths = dict(reversed(pair) for pair in url_map.items())
url_map.update(ids_to_paths)

# Add the homepage path
home_path = self.url_prefix
if home_path != "/" and home_path.endswith("/"):
home_path = home_path.rstrip("/")
url_map[home_path] = self.index_topic_id
url_map[self.index_topic_id] = home_path

return url_map, warnings

def _replace_notifications(self, soup):
"""
Given some BeautifulSoup of a document,
@@ -153,22 +239,17 @@ def _replace_notifications(self, soup):

def _get_preamble(self, soup, break_on_title):
"""
Given a BeautifulSoup HTML document,
separate out the HTML at the start, up to
the heading defined in `break_on_title`,
and return it as a BeautifulSoup object
Given a BeautifulSoup HTML document, separate out the HTML
at the start, up to the heading defined in `break_on_title`,
and return it.
"""

heading = soup.find(re.compile("^h[1-6]$"), text=break_on_title)
heading = soup.find(HEADER_REGEX, text=break_on_title)

if not heading:
return soup

preamble_elements = heading.fetchPreviousSiblings()
preamble_elements.reverse()
preamble_html = "".join(map(str, preamble_elements))

return BeautifulSoup(preamble_html, features="lxml")
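# previous_siblings yields elements nearest-first, so reverse them
# to restore document order before joining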
return "".join(map(str, reversed(list(heading.previous_siblings))))

def parse_topic(self, topic, topic_soup=None):
"""
@@ -198,14 +279,15 @@ def parse_topic(self, topic, topic_soup=None):

return {
"title": topic["title"],
"body_soup": soup,
"body_html": str(soup),
"updated": humanize.naturaltime(
updated_datetime.replace(tzinfo=None)
),
"topic_path": topic_path,
}

def _get_section(self, soup, title_text):
def _get_section(self, soup, title_text, content_tag=None):
"""
Given some HTML soup and the text of a title within it,
get the content between that title and the next title
@@ -229,6 +311,8 @@ def _get_section(self, soup, title_text):
return None

heading_tag = heading.name
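# When a specific content tag is requested, return the heading's first
# matching sibling directly instead of slicing the raw HTML below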
if content_tag:
return heading.find_next_sibling(content_tag)
section_html = html.split(str(heading))[1]

if f"<{heading_tag}>" in html: