Skip to content

Commit

Permalink
metadata fix: empty content (#545)
Browse files Browse the repository at this point in the history
* Check for empty content on meta tags - metadata.py

* Added tests for empty meta tags metadata_tests.py

* fixed test

* fixed test
  • Loading branch information
felipehertzer committed Apr 5, 2024
1 parent fb3e174 commit 8125043
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
5 changes: 4 additions & 1 deletion tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def test_titles():
# too short/empty
metadata = extract_metadata('<html><body><h3 class="title">T</h3><h3 id="title"></h3></body></html>')
assert metadata.title is None

+ metadata = extract_metadata('<html><head><title>Test Title</title><meta property="og:title" content=" " /></head><body><h1>First</h1></body></html>')
+ assert metadata.title == 'First'
+ metadata = extract_metadata('<html><head><title>Test Title</title><meta name="title" content=" " /></head><body><h1>First</h1></body></html>')
+ assert metadata.title == 'First'
metadata = extract_metadata('<html><head><title>Test Title</title></head><body></body></html>')
assert metadata.title == 'Test Title'
metadata = extract_metadata('<html><body><h1>First</h1><h1>Second</h1></body></html>')
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def extract_opengraph(tree):
# detect OpenGraph schema
for elem in tree.xpath('.//head/meta[starts-with(@property, "og:")]'):
# safeguard
- if not elem.get('content'):
+ if not elem.get('content') or elem.get('content').isspace():
continue
# site name
if elem.get('property') == 'og:site_name':
Expand Down Expand Up @@ -212,7 +212,7 @@ def examine_meta(tree):
# skim through meta tags
for elem in tree.iterfind('.//head/meta[@content]'):
# content
- if not elem.get('content'):
+ if not elem.get('content') or elem.get('content').isspace():
continue
content_attr = HTML_STRIP_TAG.sub('', elem.get('content'))
# image info
Expand Down

0 comments on commit 8125043

Please sign in to comment.