Issue #14 - Logging improvements #15

Merged: 2 commits, Mar 13, 2024
8 changes: 4 additions & 4 deletions src/browser.py
@@ -3,7 +3,7 @@
 import uuid
 from contextlib import contextmanager
 from random import uniform
-from typing import Union, Callable, Any, List, Dict, Iterator
+from typing import Union, Callable, Any, List, Dict, Iterator, Optional
 
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options as ChromeOptions
@@ -34,7 +34,7 @@ def fetch_page(
     min_wait_seconds: float,
     max_wait_seconds: float,
     stop_after_n: int,
-) -> Callable[[Callable[..., Any]], None]:
+) -> Callable[[Callable[..., Any], Optional[str]], None]:
     """
     Curried function that fetches the given URL and retries the request if the page load fails.
     Example usage: fetch_page(driver, url, 10, 3)(lambda: driver.find_element(By.ID, 'foo').text != "failed")
@@ -52,15 +52,15 @@ def fetch_page(
         stop=stop_after_attempt(stop_after_n),
         reraise=True,
     )
-    def wrapper(check_method: Callable) -> None:
+    def wrapper(check_method: Callable, err_msg: Optional[str] = None) -> None:
         print(f"Requesting URL: {url}")
         driver.get(url)
         randomized_wait = uniform(min_wait_seconds, max_wait_seconds)
         print(f"Waiting {randomized_wait} seconds for the request to complete...")
         time.sleep(randomized_wait)
         if not check_method():
             raise PageCheckFailedError(
-                "Page check failed, page load seems to have failed"
+                err_msg or f"Page check failed for url {url}"
             )
         print(f"Successfully fetched URL: {url}")

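For context on how the new `err_msg` argument is consumed, here is a minimal stand-alone sketch of the curried `fetch_page` pattern. It is an illustration, not code from this repository: the simplified `fetch_page`, the example URL, and the check lambdas are placeholders, and the tenacity retry decorator and Selenium calls are omitted.

```python
from typing import Any, Callable, Optional


class PageCheckFailedError(Exception):
    """Raised when a page check callable returns False (mirrors the repository's exception)."""


def fetch_page(url: str) -> Callable[[Callable[..., Any], Optional[str]], None]:
    # Curried shape matching the PR: configure first, then pass the check and an optional message.
    def wrapper(check_method: Callable, err_msg: Optional[str] = None) -> None:
        # The real implementation fetches the URL, waits, and retries; all of that is omitted here.
        if not check_method():
            # New behaviour: prefer the caller-supplied message, fall back to a URL-bearing default.
            raise PageCheckFailedError(err_msg or f"Page check failed for url {url}")
        print(f"Successfully fetched URL: {url}")

    return wrapper


try:
    # Custom message supplied by the call site (the second positional argument is the new part).
    fetch_page("https://example.com")(lambda: False, "Custom failure message for this page")
except PageCheckFailedError as e:
    print(e)  # Custom failure message for this page

try:
    # No message supplied: the default now names the failing URL.
    fetch_page("https://example.com")(lambda: False)
except PageCheckFailedError as e:
    print(e)  # Page check failed for url https://example.com
```

The `err_msg or ...` fallback is what lets existing call sites keep working unchanged while new call sites attach page-specific context.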
43 changes: 19 additions & 24 deletions src/text_search.py
@@ -1,4 +1,5 @@
 import itertools
+import sys
 import urllib.parse
 from datetime import date, timedelta
 from math import ceil
@@ -266,7 +267,8 @@ def _fetch_search_request_results(
             lambda: self.driver.find_element(
                 By.XPATH, TEXT_SEARCH_RESULTS_TABLE_XPATH
             ).text.strip()
-            != ""
+            != "",
+            f"First search request failed for URL {TEXT_SEARCH_BASE_URL}{search_request_url_args} ...",
         )
 
         # Get number of pages
@@ -285,24 +287,22 @@
                     lambda: self.driver.find_element(
                         By.XPATH, TEXT_SEARCH_RESULTS_TABLE_XPATH
                     ).text.strip()
-                    != ""
+                    != "",
+                    f"Search request failed for page {i} at URL {paginated_url}, skipping page...",
                 )
 
                 page_results = extract_html_table_rows(
                     self.driver, By.XPATH, TEXT_SEARCH_RESULTS_TABLE_XPATH
                 )(lambda x: self._parse_table_rows(x, paginated_url))
                 yield page_results
             except PageCheckFailedError as e:
-                print(f"Failed to fetch page at URL {paginated_url}, skipping...")
-                print(f"Error: {e}")
+                print(e)
                 continue
-            except ResultsTableNotFoundError as e:
-                print(f"Did not find results table at URL {paginated_url}, skipping...")
-                print(f"Error: {e}")
+            except ResultsTableNotFoundError:
+                print(f"Could not find results table on page {i} at URL {paginated_url}, skipping page...")
                 continue
             except Exception as e:
-                print(f"Unexpected error occurred while fetching page {i}, skipping...")
-                print(f"Error: {e}")
+                print(f"Unexpected {e.__class__.__name__} error occurred while fetching page {i} at URL {paginated_url}, skipping page: {e}")
                 continue

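The per-page try/except blocks above implement a skip-and-continue policy inside a generator: a failing page is logged and dropped while the remaining pages still stream out. The following is a rough self-contained sketch of that pattern; `fetch_one_page`, `fetch_all_pages`, and the deliberately failing page number are invented for illustration.

```python
from typing import Dict, Iterator, List


class PageCheckFailedError(Exception):
    pass


def fetch_one_page(i: int) -> List[Dict[str, str]]:
    # Stand-in for the real fetch-and-parse step; page 2 fails on purpose.
    if i == 2:
        raise PageCheckFailedError(f"Search request failed for page {i}, skipping page...")
    return [{"page": str(i)}]


def fetch_all_pages(num_pages: int) -> Iterator[List[Dict[str, str]]]:
    for i in range(1, num_pages + 1):
        try:
            yield fetch_one_page(i)
        except PageCheckFailedError as e:
            # The error message already carries the page context, so it is printed as-is.
            print(e)
            continue
        except Exception as e:
            print(f"Unexpected {e.__class__.__name__} error occurred while fetching page {i}, skipping page: {e}")
            continue


print(list(fetch_all_pages(3)))  # [[{'page': '1'}], [{'page': '3'}]]; page 2 was skipped.
```

Building the page and URL context into the exception message at raise time is what allows the simplified `except` blocks in the diff to print the error directly instead of assembling a second message.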
def _generate_search_requests(
Expand Down Expand Up @@ -452,9 +452,8 @@ def text_search(

except Exception as e:
print(
f"Unexpected error occurred while fetching search request results for request parameters '{r}': {e}"
f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}"
)
print(f"Skipping...")

write_results_to_file(
itertools.chain(*search_requests_results),
@@ -481,24 +480,20 @@ def _fetch_first_page_results_number(
                 lambda: self.driver.find_element(
                     By.XPATH, TEXT_SEARCH_RESULTS_TABLE_XPATH
                 ).text.strip()
-                != ""
+                != "",
+                f"No results found on first page at URL {url}, aborting...\n"
+                f"Please verify that the search/wait/retry parameters are correct and try again.\n"
+                f"We recommend disabling headless mode for debugging purposes."
             )
         except PageCheckFailedError as e:
-            print(
-                f"First page check at URL failed due to {e.__class__.__name__}: \n{e}"
-            )
-            print(f"No results found for first page at URL {url}, aborting...")
-            print(
-                f"Please verify that the search/wait/retry parameters are correct and try again."
-            )
-            print(f"We recommend disabling headless mode for debugging purposes.")
-            raise
+            print(e)
+            sys.exit(1)

         # If we cannot get number of results after retries, abort
         try:
             num_results = self._parse_number_of_results()
             return num_results
         except Exception as e:
-            print(f"Failed to parse number of results for URL {url}, aborting...")
-            print(f"Error: {e}")
-            raise
+            print(f"Execution aborting due to a {e.__class__.__name__} error raised "
+                  f"while parsing number of results for first page at URL {url}: {e}")
+            sys.exit(1)
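The substitution of `sys.exit(1)` for `raise` in the two blocks above changes how a fatal first-page failure surfaces: the user sees the printed message and a non-zero exit status instead of a Python traceback. Below is a minimal sketch of that behaviour, with a placeholder parse function standing in for `self._parse_number_of_results()`.

```python
import sys


def parse_number_of_results() -> int:
    # Stand-in for self._parse_number_of_results(); fails unconditionally to show the abort path.
    raise ValueError("results count element not found")


try:
    num_results = parse_number_of_results()
except Exception as e:
    # Before this PR: `raise` ended the run with a full traceback after several printed lines.
    # After: one readable line, then a clean non-zero exit code for shell callers to check.
    print(f"Execution aborting due to a {e.__class__.__name__} error raised "
          f"while parsing number of results: {e}")
    sys.exit(1)
```

Exiting with status 1 keeps the failure detectable in scripts and CI (for example via `$?` in a shell) without exposing end users to an internal stack trace.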