
Commit

fix robots parser
capjamesg committed Sep 12, 2022
1 parent 46121bc commit c366fe0
Showing 5 changed files with 7 additions and 2 deletions.
2 changes: 2 additions & 0 deletions build_index.py
@@ -53,6 +53,8 @@ def process_domain(site: str) -> List[list]:

rp.set_url(f"{protocol}{site}/robots.txt")

rp.read()

sitemap_urls = rp.site_maps() or []

write_log("CRAWL BEGINNING", site)
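The hunk above touches the standard-library robots.txt parser. A minimal sketch of the flow it exercises, assuming urllib.robotparser's RobotFileParser; the protocol and domain values below are illustrative, not taken from the repo:

from urllib.robotparser import RobotFileParser

protocol = "https://"   # illustrative; the real value is resolved earlier in process_domain
site = "example.com"    # illustrative domain

rp = RobotFileParser()
rp.set_url(f"{protocol}{site}/robots.txt")
rp.read()  # fetch and parse robots.txt before querying it

# site_maps() returns None when robots.txt has no Sitemap: directives,
# so the `or []` fallback keeps later code from iterating over None.
sitemap_urls = rp.site_maps() or []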
1 change: 1 addition & 0 deletions crawler/verify_and_process.py
@@ -150,6 +150,7 @@ def crawl_urls(
full_url = "https://" + url_domain + url_path

# Do not index URLs blocked in the robots.txt file

if (
robots_parser.can_fetch("*", full_url) is False
):
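A self-contained illustration of the can_fetch() gate above, assuming robots_parser is the RobotFileParser built in build_index.py; the robots.txt rules and URLs here are made up:

from urllib.robotparser import RobotFileParser

robots_parser = RobotFileParser()
robots_parser.parse("User-agent: *\nDisallow: /private/".splitlines())

print(robots_parser.can_fetch("*", "https://example.com/private/page"))  # False -> skipped
print(robots_parser.can_fetch("*", "https://example.com/public/page"))   # True  -> indexed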
2 changes: 1 addition & 1 deletion direct_answers/get_answer_to_question.py
@@ -196,7 +196,7 @@ def retrieve_answer(
else:
featured_serp_contents = ""

if len(BeautifulSoup(featured_serp_contents, "lxml").get_text()) < 60:
if len(BeautifulSoup(featured_serp_contents, "html.parser").get_text()) < 60:
return None

special_result = {
2 changes: 1 addition & 1 deletion direct_answers/search_result_features.py
@@ -70,7 +70,7 @@ def generate_featured_snippet(

# read post with bs4
if post.get("page_content"):
soup = BeautifulSoup(post["page_content"], "lxml")
soup = BeautifulSoup(post["page_content"], "html.parser")
else:
return "", special_result

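Both BeautifulSoup changes above swap the third-party "lxml" parser for Python's built-in "html.parser", so parsing no longer depends on lxml being installed. A minimal sketch with illustrative markup:

from bs4 import BeautifulSoup

page_content = "<p>Hello, <b>world</b>!</p>"  # illustrative markup
soup = BeautifulSoup(page_content, "html.parser")
print(soup.get_text())  # "Hello, world!"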
2 changes: 2 additions & 0 deletions write_logs.py
@@ -16,6 +16,8 @@ def write_log(text: str, domain: str = "ADMIN") -> None:
"fields": {"text": f"[*{split_domain}*] " + text},
}
]

print(text)

try:
client.write_points(data, database="search_logs")
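The write_logs.py change mirrors each log line to stdout before it is sent to InfluxDB. A rough sketch of the surrounding call, assuming the influxdb 1.x client this file already uses; the host, port, and measurement name are assumptions, only the fields payload and the "search_logs" database name appear in the diff:

from influxdb import InfluxDBClient

client = InfluxDBClient(host="localhost", port=8086)  # assumed connection details

data = [
    {
        "measurement": "logs",  # hypothetical measurement name; not shown in the diff
        "fields": {"text": "[*example.com*] CRAWL BEGINNING"},
    }
]

print(data[0]["fields"]["text"])  # the new print() makes logs visible on stdout too

try:
    client.write_points(data, database="search_logs")
except Exception:
    pass  # a logging failure should not take down the crawl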
