
Commit

fix robots parser
capjamesg committed Sep 12, 2022
1 parent 46121bc commit c366fe0
Showing 5 changed files with 7 additions and 2 deletions.
2 changes: 2 additions & 0 deletions build_index.py
@@ -53,6 +53,8 @@ def process_domain(site: str) -> List[list]:

rp.set_url(f"{protocol}{site}/robots.txt")

rp.read()

sitemap_urls = rp.site_maps() or []

write_log("CRAWL BEGINNING", site)
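The hunk above touches the standard-library robots.txt parser. A minimal sketch of the flow it exercises, assuming urllib.robotparser's RobotFileParser; the protocol and domain values below are illustrative, not taken from the repo:

from urllib.robotparser import RobotFileParser

protocol = "https://"   # illustrative; the real value is resolved earlier in process_domain
site = "example.com"    # illustrative domain

rp = RobotFileParser()
rp.set_url(f"{protocol}{site}/robots.txt")
rp.read()  # fetch and parse robots.txt before querying it

# site_maps() returns None when robots.txt has no Sitemap: directives,
# so the `or []` fallback keeps later code from iterating over None.
sitemap_urls = rp.site_maps() or []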
1 change: 1 addition & 0 deletions crawler/verify_and_process.py
@@ -150,6 +150,7 @@ def crawl_urls(
full_url = "https://" + url_domain + url_path

# Do not index URLs blocked in the robots.txt file

if (
robots_parser.can_fetch("*", full_url) is False
):
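A self-contained illustration of the can_fetch() gate above, assuming robots_parser is the RobotFileParser built in build_index.py; the robots.txt rules and URLs here are made up:

from urllib.robotparser import RobotFileParser

robots_parser = RobotFileParser()
robots_parser.parse("User-agent: *\nDisallow: /private/".splitlines())

print(robots_parser.can_fetch("*", "https://example.com/private/page"))  # False -> skipped
print(robots_parser.can_fetch("*", "https://example.com/public/page"))   # True  -> indexed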
2 changes: 1 addition & 1 deletion direct_answers/get_answer_to_question.py
@@ -196,7 +196,7 @@ def retrieve_answer(
else:
featured_serp_contents = ""

if len(BeautifulSoup(featured_serp_contents, "lxml").get_text()) < 60:
if len(BeautifulSoup(featured_serp_contents, "html.parser").get_text()) < 60:
return None

special_result = {
2 changes: 1 addition & 1 deletion direct_answers/search_result_features.py
@@ -70,7 +70,7 @@ def generate_featured_snippet(

# read post with bs4
if post.get("page_content"):
soup = BeautifulSoup(post["page_content"], "lxml")
soup = BeautifulSoup(post["page_content"], "html.parser")
else:
return "", special_result

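Both BeautifulSoup changes above swap the third-party "lxml" parser for Python's built-in "html.parser", so parsing no longer depends on lxml being installed. A minimal sketch with illustrative markup:

from bs4 import BeautifulSoup

page_content = "<p>Hello, <b>world</b>!</p>"  # illustrative markup
soup = BeautifulSoup(page_content, "html.parser")
print(soup.get_text())  # "Hello, world!"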
2 changes: 2 additions & 0 deletions write_logs.py
@@ -16,6 +16,8 @@ def write_log(text: str, domain: str = "ADMIN") -> None:
"fields": {"text": f"[*{split_domain}*] " + text},
}
]

print(text)

try:
client.write_points(data, database="search_logs")
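The write_logs.py change mirrors each log line to stdout before it is sent to InfluxDB. A rough sketch of the surrounding call, assuming the influxdb 1.x client this file already uses; the host, port, and measurement name are assumptions, only the fields payload and the "search_logs" database name appear in the diff:

from influxdb import InfluxDBClient

client = InfluxDBClient(host="localhost", port=8086)  # assumed connection details

data = [
    {
        "measurement": "logs",  # hypothetical measurement name; not shown in the diff
        "fields": {"text": "[*example.com*] CRAWL BEGINNING"},
    }
]

print(data[0]["fields"]["text"])  # the new print() makes logs visible on stdout too

try:
    client.write_points(data, database="search_logs")
except Exception:
    pass  # a logging failure should not take down the crawl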
