From 8125043c389e3c168ac17f4bda212751c0b8f13b Mon Sep 17 00:00:00 2001 From: Felipe Hertzer Date: Fri, 5 Apr 2024 22:14:42 +1100 Subject: [PATCH] metadata fix: empty content (#545) * Check for empty content on meta tags - metadata.py * Added tests for empty meta tags metadata_tests.py * fixed test * fixed test --- tests/metadata_tests.py | 5 ++++- trafilatura/metadata.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/metadata_tests.py b/tests/metadata_tests.py index 551e9e3d..512a5148 100644 --- a/tests/metadata_tests.py +++ b/tests/metadata_tests.py @@ -26,7 +26,10 @@ def test_titles(): # too short/empty metadata = extract_metadata('

T

') assert metadata.title is None - + metadata = extract_metadata('Test Title

First

') + assert metadata.title == 'First' + metadata = extract_metadata('Test Title

First

') + assert metadata.title == 'First' metadata = extract_metadata('Test Title') assert metadata.title == 'Test Title' metadata = extract_metadata('

First

Second

') diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py index 3574818d..d2dda0c8 100644 --- a/trafilatura/metadata.py +++ b/trafilatura/metadata.py @@ -163,7 +163,7 @@ def extract_opengraph(tree): # detect OpenGraph schema for elem in tree.xpath('.//head/meta[starts-with(@property, "og:")]'): # safeguard - if not elem.get('content'): + if not elem.get('content') or elem.get('content').isspace(): continue # site name if elem.get('property') == 'og:site_name': @@ -212,7 +212,7 @@ def examine_meta(tree): # skim through meta tags for elem in tree.iterfind('.//head/meta[@content]'): # content - if not elem.get('content'): + if not elem.get('content') or elem.get('content').isspace(): continue content_attr = HTML_STRIP_TAG.sub('', elem.get('content')) # image info