From 8bb639401d4126e32c864bcd7de8cfdfdd1f614f Mon Sep 17 00:00:00 2001 From: van51 Date: Thu, 11 May 2023 08:46:16 +0000 Subject: [PATCH] Return matching element with result --- autoscraper/auto_scraper.py | 15 +++++++++++---- autoscraper/utils.py | 3 ++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/autoscraper/auto_scraper.py b/autoscraper/auto_scraper.py index 20d723d..be95acd 100644 --- a/autoscraper/auto_scraper.py +++ b/autoscraper/auto_scraper.py @@ -360,6 +360,7 @@ def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): i, wanted_attr, is_full_url, url, is_non_rec_text ), getattr(i, "child_index", 0), + i ) for i in parents ] @@ -395,6 +396,7 @@ def _get_result_with_stack_index_based( stack["is_non_rec_text"], ), getattr(p, "child_index", 0), + p ) ] if not kwargs.get("keep_blank", False): @@ -439,19 +441,20 @@ def _get_result_by_func( grouped_result[group_id] += result return self._clean_result( - result_list, grouped_result, grouped, group_by_alias, unique, keep_order + result_list, grouped_result, grouped, group_by_alias, unique, keep_order, + kwargs.get("return_elements", False) ) @staticmethod def _clean_result( - result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order + result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order, return_elements ): if not grouped and not grouped_by_alias: if unique is None: unique = True if keep_order: result_list = sorted(result_list, key=lambda x: x.index) - result = [x.text for x in result_list] + result = [x.text if not return_elements else x.element for x in result_list] if unique: result = unique_hashable(result) return result @@ -459,7 +462,7 @@ def _clean_result( for k, val in grouped_result.items(): if grouped_by_alias: val = sorted(val, key=lambda x: x.index) - val = [x.text for x in val] + val = [x.text if not return_elements else x.element for x in val] if unique: val = unique_hashable(val) grouped_result[k] = val @@ -479,6 +482,7 @@ 
def get_result_similar( keep_blank=False, keep_order=False, contain_sibling_leaves=False, + return_elements=False ): """ Gets similar results based on the previously learned rules. @@ -540,6 +544,7 @@ def get_result_similar( keep_blank=keep_blank, keep_order=keep_order, contain_sibling_leaves=contain_sibling_leaves, + return_elements=return_elements ) def get_result_exact( @@ -553,6 +558,7 @@ def get_result_exact( unique=None, attr_fuzz_ratio=1.0, keep_blank=False, + return_elements=False ): """ Gets exact results based on the previously learned rules. @@ -606,6 +612,7 @@ def get_result_exact( unique, attr_fuzz_ratio, keep_blank=keep_blank, + return_elements=return_elements ) def get_result( diff --git a/autoscraper/utils.py b/autoscraper/utils.py index 5193708..a2866f8 100644 --- a/autoscraper/utils.py +++ b/autoscraper/utils.py @@ -48,9 +48,10 @@ def text_match(t1, t2, ratio_limit): class ResultItem(): - def __init__(self, text, index): + def __init__(self, text, index, element): self.text = text self.index = index + self.element = element def __str__(self): return self.text