Skip to content

Commit

Permalink
fix: consistent list.copy()
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 7, 2021
1 parent 45568ac commit 33cd96b
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions trafilatura/htmlprocessing.py
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,8 @@
 def tree_cleaning(tree, include_tables, include_images=False):
     '''Prune the tree by discarding unwanted elements'''
     # determine cleaning strategy
-    cleaner, cleaning_list, stripping_list = \
-        HTML_CLEANER, MANUALLY_CLEANED, MANUALLY_STRIPPED
+    cleaning_list, stripping_list = \
+        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
     if include_tables is False:
         cleaning_list.append('table')
     if include_images is True:
@@ -55,12 +55,12 @@ def tree_cleaning(tree, include_tables, include_images=False):
     for expression in cleaning_list:
         for element in tree.getiterator(expression):
             try:
-                element.drop_tree()
+                element.drop_tree() # faster when applicable
             except AttributeError:
                 element.getparent().remove(element)
-    cleaner.kill_tags, cleaner.remove_tags = cleaning_list, stripping_list
+    HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
     # save space and processing time
-    return cleaner.clean_html(prune_html(tree))
+    return HTML_CLEANER.clean_html(prune_html(tree))


 def prune_html(tree):
Expand Down

0 comments on commit 33cd96b

Please sign in to comment.