Skip to content
Permalink
Browse files
Merge pull request #2173 from bookwyrm-social/html-sanitizer
Html sanitizer
  • Loading branch information
mouse-reeve committed Jul 4, 2022
2 parents 58b23a7 + 9d9b7f3 commit fe33fdcf564a6a5667aef75d5456bea08feab50d
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 110 deletions.
@@ -16,7 +16,7 @@

from bookwyrm import activitypub
from bookwyrm.connectors import get_image
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.utils.sanitizer import clean
from bookwyrm.settings import MEDIA_FULL_URL


@@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
def field_from_activity(self, value):
if not value or value == MISSING:
return None
sanitizer = InputHtmlParser()
sanitizer.feed(value)
return sanitizer.get_output()
return clean(value)


class ArrayField(ActivitypubFieldMixin, DjangoArrayField):

This file was deleted.

@@ -11,7 +11,7 @@
env = Env()
env.read_env()
DOMAIN = env("DOMAIN")
VERSION = "0.4.0"
VERSION = "0.4.1"

RELEASE_API = env(
"RELEASE_API",
@@ -2,15 +2,13 @@
from django.db import transaction

from bookwyrm import models
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.utils import sanitizer


def create_generated_note(user, content, mention_books=None, privacy="public"):
"""a note created by the app about user activity"""
# sanitize input html
parser = InputHtmlParser()
parser.feed(content)
content = parser.get_output()
content = sanitizer.clean(content)

with transaction.atomic():
# create but don't save
@@ -1,7 +1,7 @@
""" make sure only valid html gets to the app """
from django.test import TestCase

from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.utils.sanitizer import clean


class Sanitizer(TestCase):
@@ -10,53 +10,39 @@ class Sanitizer(TestCase):
def test_no_html(self):
"""just text"""
input_text = "no html "
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
output = clean(input_text)
self.assertEqual(input_text, output)

def test_valid_html(self):
"""leave the html untouched"""
input_text = "<b>yes </b> <i>html</i>"
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
output = clean(input_text)
self.assertEqual(input_text, output)

def test_valid_html_attrs(self):
"""and don't remove useful attributes"""
input_text = '<a href="fish.com">yes </a> <i>html</i>'
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
output = clean(input_text)
self.assertEqual(input_text, output)

def test_valid_html_invalid_attrs(self):
"""do remove un-approved attributes"""
input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>'
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
output = clean(input_text)
self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>')

def test_invalid_html(self):
"""remove all html when the html is malformed"""
"""don't allow malformed html"""
input_text = "<b>yes <i>html</i>"
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
self.assertEqual("yes html", output)
output = clean(input_text)
self.assertEqual("<b>yes <i>html</i></b>", output)

input_text = "yes <i></b>html </i>"
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
self.assertEqual("yes html ", output)
output = clean(input_text)
self.assertEqual("yes <i>html </i>", output)

def test_disallowed_html(self):
"""remove disallowed html but keep allowed html"""
input_text = "<div> yes <i>html</i></div>"
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
output = clean(input_text)
self.assertEqual(" yes <i>html</i>", output)
@@ -0,0 +1,26 @@
"""Clean user-provided text"""
import bleach


def clean(input_text):
    """Sanitize user-provided HTML by passing it through bleach.

    Only the tags and attributes listed below are kept. Because
    ``strip=True`` is passed, markup for any other tag is removed from the
    output instead of being escaped, while its inner text is kept.
    """
    allowed_tags = [
        "p",
        "blockquote",
        "br",
        "b",
        "i",
        "strong",
        "em",
        "pre",
        "a",
        "span",
        "ul",
        "ol",
        "li",
    ]
    # A flat list means these attributes are permitted on every allowed tag.
    allowed_attributes = ["href", "rel", "src", "alt"]
    return bleach.clean(
        input_text,
        tags=allowed_tags,
        attributes=allowed_attributes,
        strip=True,
    )
@@ -16,9 +16,8 @@

from markdown import markdown
from bookwyrm import forms, models
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.settings import DOMAIN
from bookwyrm.utils import regex
from bookwyrm.utils import regex, sanitizer
from .helpers import handle_remote_webfinger, is_api_request
from .helpers import load_date_in_user_tz_as_utc

@@ -268,6 +267,4 @@ def to_markdown(content):
content = format_links(content)
content = markdown(content)
# sanitize resulting html
sanitizer = InputHtmlParser()
sanitizer.feed(content)
return sanitizer.get_output()
return sanitizer.clean(content)
@@ -1,4 +1,5 @@
aiohttp==3.8.1
bleach==5.0.1
celery==5.2.2
colorthief==0.2.1
Django==3.2.13

0 comments on commit fe33fdc

Please sign in to comment.