Commit

Merge 7699247 into 80821a3
sparkiegeek committed Feb 24, 2022
2 parents 80821a3 + 7699247 commit 1dd1502
Showing 1 changed file with 97 additions and 15 deletions.
112 changes: 97 additions & 15 deletions webapp/doc_parser.py
@@ -2,12 +2,14 @@
from canonicalwebteam.discourse_docs import DocParser
from canonicalwebteam.discourse_docs.parsers import TOPIC_URL_MATCH

from functools import cached_property
import re
from urllib.parse import urlparse

# Packages
import dateutil.parser
import humanize
import validators
from bs4 import BeautifulSoup, NavigableString
from jinja2 import Template

@@ -57,6 +59,85 @@ def parse(self):
self._replace_links(raw_index_soup, topics)
)

def _parse_redirect_map(self, index_soup):
"""
Given the HTML soup of an index topic,
extract the redirect mappings from the "Redirects" section.
The "Redirects" section should contain a table of
"Path" to "Location" mappings
(extra markup around this table doesn't matter),
e.g.:
<h1>Redirects</h1>
<details>
<summary>Mapping table</summary>
<table>
<tr><th>Path</th><th>Location</th></tr>
<tr>
<td>/my-funky-path</td>
<td>/cool-page</td>
</tr>
<tr>
<td>/some/other/path</td>
<td>https://example.com/cooler-place</td>
</tr>
</table>
</details>
This will typically be generated in Discourse from Markdown similar to
the following:
# Redirects
[details=Mapping table]
| Path | Location |
| -- | -- |
| /my-funky-path | /cool-page |
| /some/other/path | https://example.com/cooler-place |
"""

redirect_soup = self._get_section(index_soup, "Redirects", "details")
redirect_map = {}
warnings = []

if redirect_soup:
for row in redirect_soup.select("tr:has(td)"):
path_cell = row.select_one("td:first-child")
location_cell = row.select_one("td:last-child")

if not path_cell or not location_cell:
warnings.append(
f"Could not parse redirect map {path_cell}"
)
continue

path = path_cell.text
location = location_cell.text

if not path.startswith(self.url_prefix):
warnings.append(f"Could not parse redirect map for {path}")
continue

if not (
location.startswith(self.url_prefix)
or validators.url(location, public=True)
):
warnings.append(
f"Redirect map location {location} is invalid"
)
continue

if path in self.url_map:
warnings.append(
f"Redirect path {path} clashes with URL map"
)
continue

redirect_map[path] = location

return redirect_map, warnings
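
For illustration, a minimal sketch of how this method behaves; the
"parser" instance, its url_prefix of "/docs" and its empty url_map are
assumptions for the example, not part of this commit:

from bs4 import BeautifulSoup

index_html = """
<h1>Redirects</h1>
<details>
<table>
<tr><th>Path</th><th>Location</th></tr>
<tr><td>/docs/my-funky-path</td><td>/docs/cool-page</td></tr>
<tr><td>/bad-path</td><td>/docs/elsewhere</td></tr>
</table>
</details>
"""

index_soup = BeautifulSoup(index_html, features="lxml")
redirect_map, warnings = parser._parse_redirect_map(index_soup)
# redirect_map == {"/docs/my-funky-path": "/docs/cool-page"}
# warnings == ["Could not parse redirect map for /bad-path"]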

def _parse_url_map(self, index_soup):
"""
Given the HTML soup of an index topic
@@ -142,6 +223,17 @@ def _parse_url_map(self, index_soup):

return url_map, warnings

@cached_property
def notification_template(self):
notification_html = (
"<div class='{{ notification_class }}'>"
"<div class='p-notification__response'>"
"{{ contents | safe }}"
"</div></div>"
)

return Template(notification_html)
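
With functools.cached_property the template is compiled once per parser
instance, where previously it was rebuilt on every call to
_replace_notifications. A quick sketch of what it renders:

from jinja2 import Template

template = Template(
"<div class='{{ notification_class }}'>"
"<div class='p-notification__response'>"
"{{ contents | safe }}"
"</div></div>"
)

template.render(
notification_class="p-notification",
contents="<p>Remember to save your work.</p>",
)
# "<div class='p-notification'><div class='p-notification__response'>"
# "<p>Remember to save your work.</p></div></div>"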

def _replace_notifications(self, soup):
"""
Given some BeautifulSoup of a document,
@@ -167,14 +259,6 @@ def _replace_notifications(self, soup):
</div>
"""

- notification_html = (
- "<div class='{{ notification_class }}'>"
- "<div class='p-notification__response'>"
- "{{ contents | safe }}"
- "</div></div>"
- )
-
- notification_template = Template(notification_html)
for note_string in soup.findAll(text=re.compile("ⓘ ")):
first_paragraph = note_string.parent
blockquote = first_paragraph.parent
@@ -197,7 +281,7 @@ def _replace_notifications(self, soup):
r"^\n?<p([^>]*)>ⓘ +", r"<p\1>", notification_html
)

- notification = notification_template.render(
notification = self.notification_template.render(
notification_class="p-notification",
contents=notification_html,
)
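
For example, a sketch of the transformation implemented above, where a
Discourse info callout such as:

<blockquote><p>ⓘ This is a note</p></blockquote>

is rewritten to:

<div class='p-notification'><div class='p-notification__response'><p>This is a note</p></div></div>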
@@ -226,7 +310,7 @@ def _replace_notifications(self, soup):
if isinstance(first_item, NavigableString):
first_item.replace_with(first_item.lstrip(" "))

- notification = notification_template.render(
notification = self.notification_template.render(
notification_class="p-notification--caution",
contents=blockquote.encode_contents().decode("utf-8"),
)
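
Likewise (a sketch), a plain quote such as
<blockquote><p>Be careful!</p></blockquote> appears to get the same
wrapper markup with p-notification--caution in place of p-notification.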
@@ -262,7 +346,6 @@ def parse_topic(self, topic, topic_soup=None):
(e.g. "3 days ago")
- forum_link: The link to the original forum post
"""

updated_datetime = dateutil.parser.parse(
topic["post_stream"]["posts"][0]["updated_at"]
)
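
A sketch of the date handling above; the literal timestamp is an
assumption for illustration:

from datetime import datetime, timezone

import dateutil.parser
import humanize

updated = dateutil.parser.parse("2022-02-21T09:30:00Z")
humanize.naturaltime(datetime.now(timezone.utc) - updated)  # e.g. "3 days ago"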
@@ -304,18 +387,17 @@ def _get_section(self, soup, title_text, content_tag=None):
<p>Content</p>
"""
html = str(soup)
heading = soup.find(HEADER_REGEX, text=title_text)

if not heading:
return None

- heading_tag = heading.name
if content_tag:
return heading.find_next_sibling(content_tag)
- html = str(soup)
section_html = html.split(str(heading))[1]

if f"<{heading_tag}>" in html:
section_html = section_html.split(f"<{heading_tag}>")[0]
if f"<{heading.name}>" in html:
section_html = section_html.split(f"<{heading.name}>")[0]

return BeautifulSoup(section_html, features="lxml")
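
To illustrate both paths through _get_section (a sketch; the "parser"
instance is an assumption): with content_tag the next sibling tag is
returned directly, otherwise the HTML between this heading and the next
heading of the same level is re-parsed:

from bs4 import BeautifulSoup

soup = BeautifulSoup(
"<h1>Redirects</h1><details>a table</details>"
"<h1>URLs</h1><p>Content</p>",
features="lxml",
)

parser._get_section(soup, "Redirects", "details")  # the <details> tag
parser._get_section(soup, "URLs")  # soup containing <p>Content</p>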
