In [63]:
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from dataclasses import asdict
from datetime import datetime
from urllib.parse import urljoin
from urllib.parse import urlparse
import json
from pathlib import Path
import re
import time
from os import listdir
from urllib3.util import Retry
from requests import Session
from requests.adapters import HTTPAdapter

In [22]:
@dataclass
class ScrapedData:
    """Paragraph-level scrape result for a single page.

    Instances are produced by ``scrape_content`` and serialised to JSON by the
    crawler through ``dataclasses.asdict``.
    """

    # Address of the page that was fetched.
    url: str
    # Local timestamp taken right after the page was parsed.
    scrape_datetime: datetime
    # Text content of every <p> element, in document order.
    paragraphs: list[str]
    # Whitespace-separated word count summed over all paragraphs.
    total_words: int

In [23]:
@dataclass
class ScrapedJinaAiData:
    """Whole-page scrape result obtained through the r.jina.ai endpoint.

    Instances are produced by ``scrape_content_jina_ai`` and serialised to
    JSON by the crawler through ``dataclasses.asdict``.
    """

    # Address of the page that was requested.
    url: str
    # Local timestamp taken right after the response arrived.
    scrape_datetime: datetime
    # Full response body returned by the endpoint.
    content: str
    # Whitespace-separated word count of ``content``.
    total_words: int

In [87]:
def scrape_content(url, session=None, timeout=30):
    """Fetch ``url`` and collect the text of every ``<p>`` element on the page.

    Args:
        url: Absolute URL of the page to scrape.
        session: Optional object exposing a requests-style ``get`` method.
            When omitted, ``requests.get`` is used. Accepting it lets this
            function be installed with ``Wiki_Crawler.set_scrape_method``,
            because the crawler calls its scrape method as
            ``scrape_method(url, session)``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A ``ScrapedData`` instance, or ``None`` when the request failed
        (the error is printed).
    """
    http_get = requests.get if session is None else session.get

    # Only the network call can raise RequestException, so keep the guarded
    # region limited to it.
    try:
        response = http_get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while fetching the URL: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    scrape_datetime = datetime.now()
    paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]
    total_words = sum(len(paragraph.split()) for paragraph in paragraphs)

    return ScrapedData(
        url=url,
        scrape_datetime=scrape_datetime,
        paragraphs=paragraphs,
        total_words=total_words,
    )

In [88]:
def scrape_content_jina_ai(url, session, timeout=60): #TODO add time.sleep(2), maximum of 20 RPM
    """Fetch the r.jina.ai rendering of ``url`` and wrap it in a result record.

    Args:
        url: Address of the page to scrape, with or without a scheme.
        session: Object exposing a requests-style ``get`` method.
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A ``ScrapedJinaAiData`` instance, or ``None`` when the request failed
        (the error is printed).
    """
    # Only add a scheme when the caller did not supply one; blindly prefixing
    # produced values such as "http://https://host/page" in the saved record.
    if url.startswith(("http://", "https://")):
        recorded_url = url
    else:
        recorded_url = "http://" + url

    try:
        # Use HTTPS so the request is encrypted and is handled by adapters
        # (e.g. retry policies) that callers mount on the "https://" prefix.
        response = session.get("https://r.jina.ai/" + url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while fetching the URL: {e}")
        return None

    scrape_datetime = datetime.now()
    content = response.text
    return ScrapedJinaAiData(
        url=recorded_url,
        scrape_datetime=scrape_datetime,
        content=content,
        total_words=len(content.split()),
    )

In [89]:
def scrape_content_jina_ai_API_key(url, session=None): #TODO
    """Placeholder for a scraper that authenticates with an API key.

    Not implemented yet: every call returns ``None``.

    Args:
        url: Address of the page to scrape (currently unused).
        session: Optional HTTP session (currently unused). Accepted so the
            stub matches the ``scrape_method(url, session)`` call made by
            ``Wiki_Crawler.scrape`` instead of raising ``TypeError``.

    Returns:
        Always ``None``.
    """
    return None

In [90]:
class Wiki_Crawler:
    """Breadth-first crawler that stores one JSON file per same-host page.

    The extraction itself is delegated to a pluggable scrape method that is
    called as ``scrape_method(url, session)`` and must return a dataclass
    instance, or ``None`` when nothing could be scraped.
    """

    # Seconds to wait for a page while crawling; without a timeout a stalled
    # connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.scrape_method = None

    def scrape(self, url, session):
        """Run the configured scrape method on ``url``.

        Returns the scrape method's result, or ``None`` (after printing a
        notice) when no scrape method has been set.
        """
        if self.scrape_method is None:
            print("no scrape method set!")
            return None
        return self.scrape_method(url, session)

    def set_scrape_method(self, scrape_method):
        """Select the callable used by ``scrape``; it receives ``(url, session)``."""
        self.scrape_method = scrape_method

    def create_filename(self, url):
        """Derive a file stem from the last path segment of ``url``.

        Every non-alphanumeric ASCII character is removed. ``"home"`` is
        returned when the path has no last segment, or when nothing is left
        after sanitising (which would otherwise yield a bare ".json" file).
        """
        last_segment = urlparse(url).path.split('/')[-1]
        filename = re.sub('[^A-Za-z0-9]+', '', last_segment)
        return filename or "home"

    def wiki_crawler(self, base_url, max_sites, save_folder):
        """Crawl pages on the host of ``base_url`` and save each as JSON.

        Args:
            base_url: Start page; only links with the same hostname are followed.
            max_sites: Maximum number of URLs taken from the queue. Pages that
                are skipped or fail to load count towards this limit.
            save_folder: Output directory (``str`` or ``Path``); created if
                missing. Pages whose file already existed when the crawl
                started are not scraped again, but their links are followed.

        Returns:
            The set of all URLs discovered, including ones never fetched.
        """
        save_folder = Path(save_folder)
        save_folder.mkdir(parents=True, exist_ok=True)
        already_saved_files = set(listdir(save_folder))

        base_hostname = urlparse(base_url).hostname
        visited = {base_url}
        queue = [base_url]
        counter = 0

        retries = Retry(
            total=3,
            backoff_factor=1.5,
            status_forcelist=[422, 502, 503, 504],
        )

        # The context manager closes pooled connections when the crawl ends.
        with Session() as s:
            # Mount the retry policy for both schemes so plain-HTTP requests
            # are retried as well.
            s.mount('https://', HTTPAdapter(max_retries=retries))
            s.mount('http://', HTTPAdapter(max_retries=retries))

            while queue and counter < max_sites:
                counter += 1
                url = queue.pop(0)

                try:
                    response = s.get(url, timeout=self.REQUEST_TIMEOUT)
                    soup = BeautifulSoup(response.text, "html.parser")

                    for link in soup.find_all('a', href=True):
                        try:
                            # Resolve relative links against the page they
                            # appear on, and drop the fragment so "#section"
                            # variants are not queued as separate pages.
                            absolute_url = urljoin(url, link.get('href')).split('#', 1)[0]
                            hostname = urlparse(absolute_url).hostname
                        except ValueError:
                            # Malformed href (e.g. broken IPv6 literal): ignore it.
                            continue

                        if hostname == base_hostname and absolute_url not in visited:
                            visited.add(absolute_url)
                            queue.append(absolute_url)

                    filename = self.create_filename(url) + ".json"
                    if filename in already_saved_files:
                        continue

                    scraped_data = self.scrape(url, s)
                    if scraped_data is None:
                        # Check before opening the file: otherwise an empty
                        # JSON file is left behind and a later run treats the
                        # page as already saved.
                        print(f"Nothing scraped, skipping: {url}")
                        continue

                    with open(save_folder / filename, "w", encoding="utf-8") as f:
                        json.dump(asdict(scraped_data), f, ensure_ascii=False, indent=4, default=str)

                    print(f"Scraped: {url}")
                    # Pause between scrapes to stay polite to the remote services.
                    time.sleep(2)

                except requests.exceptions.RequestException as e:
                    print(f"Error while fetching {url}: {e}")

        return visited

In [91]:
# Output directory for the scraped pages: <parent of the working directory>/data/pokemon_json.
save_folder = Path().resolve().parent / "data" / "pokemon_json"

In [92]:
# Create the crawler and select scrape_content_jina_ai as the function it calls for each page.
crawler = Wiki_Crawler()
crawler.set_scrape_method(scrape_content_jina_ai)

In [None]:
# Start the crawl at the wiki's root, taking at most 2000 URLs from the queue;
# `vis` receives the set of every URL discovered along the way.
vis = crawler.wiki_crawler(
    base_url="https://www.pokewiki.de",
    max_sites=2000,
    save_folder=save_folder,
)