In [None]:
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from datetime import datetime

In [34]:
@dataclass
class ScrapedData:
    """Paragraph text scraped from a single HTML page.

    Instances are built by ``extract_content``.
    """
    # Address that was requested, exactly as the caller passed it.
    url: str
    # Timestamp taken with datetime.now() once the page had been parsed.
    scrape_datetime: datetime
    # Text of each <p> element found on the page (strings, not counts).
    paragraphs: list[str]
    # Whitespace-separated word count summed over all paragraphs.
    total_words: int

In [77]:
@dataclass
class ScrapedJinaAiData:
    """Page content fetched through the r.jina.ai endpoint (built by ``scrape_jina_ai``)."""
    # Address of the target page as recorded by scrape_jina_ai.
    url: str
    # Timestamp taken with datetime.now() after the response arrived.
    scrape_datetime: datetime
    # Full response body text returned by the endpoint.
    content: str
    # Number of whitespace-separated tokens in ``content``.
    total_words: int

In [None]:
def extract_content(url, timeout=10):
    """Fetch a page and collect the text of its ``<p>`` elements.

    Args:
        url: Address to request; passed to ``requests.get`` unchanged.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        A ``ScrapedData`` holding the paragraph texts and their total word
        count, or ``None`` when the request fails (the error is printed).
    """
    # Only the network calls can raise RequestException, so only they are guarded.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while fetching the URL: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    scrape_datetime = datetime.now()
    paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]
    total_words = sum(len(paragraph.split()) for paragraph in paragraphs)

    return ScrapedData(
        url=url,
        scrape_datetime=scrape_datetime,
        paragraphs=paragraphs,
        total_words=total_words,
    )

In [83]:
def scrape_jina_ai(url, timeout=30):
    """Fetch a page's content through the r.jina.ai endpoint.

    Args:
        url: Address of the target page, with or without an ``http://`` /
            ``https://`` prefix. It is appended to the endpoint unchanged.
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        A ``ScrapedJinaAiData`` with the response text and its word count, or
        ``None`` when the request fails (the error is printed).
    """
    # Record the target with exactly one scheme: blindly prepending "http://"
    # would turn "https://example.com" into "http://https://example.com".
    if url.lower().startswith(("http://", "https://")):
        recorded_url = url
    else:
        recorded_url = "http://" + url

    try:
        response = requests.get("http://r.jina.ai/" + url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while fetching the URL: {e}")
        return None

    scrape_datetime = datetime.now()
    total_words = len(response.text.split())
    return ScrapedJinaAiData(
        url=recorded_url,
        scrape_datetime=scrape_datetime,
        content=response.text,
        total_words=total_words,
    )

In [None]:
# Host name of the wiki, written without a scheme.
base_url = "wiki.leagueoflegends.com"

In [84]:
# Fetch the Aatrox page via scrape_jina_ai; the address is passed without a scheme.
# `data` is None if the request failed (the function prints the error instead of raising).
url = "wiki.leagueoflegends.com/en-us/Aatrox"
data = scrape_jina_ai(url)

In [85]:
# Show the address stored on the scrape result.
data.url

'http://wiki.leagueoflegends.com/en-us/Aatrox'

In [86]:
# Download the champion index page. The timeout prevents a hung connection from
# blocking the kernel, and raise_for_status() stops the notebook on an HTTP error
# so the parsing cells below never run on an error page.
# `response` is a second name bound to the same Response object.
champ = response = requests.get("https://wiki.leagueoflegends.com/en-us/Champion", timeout=10)
champ.raise_for_status()

In [92]:
# Parse the downloaded champion page body with Python's built-in HTML parser.
soup = BeautifulSoup(champ.content, "html.parser")

In [111]:
# First <td class="navbox-cell"> on the page; None when no such cell exists.
a = soup.find("td", class_="navbox-cell")

In [113]:
# Gather every anchor inside the navbox cell; use an empty list when the cell was not found.
if a:
    links_in_table = a.find_all('a')
else:
    links_in_table = []

In [114]:
# Inspect the collected anchors.
# NOTE(review): this echoes every link; consider slicing (e.g. links_in_table[:5]) to keep the saved output short.
links_in_table

[<a href="/en-us/Aatrox" title="Aatrox"><img alt="Aatrox" class="mw-file-element" data-file-height="128" data-file-width="128" decoding="async" height="46" loading="lazy" src="/en-us/images/thumb/Aatrox_OriginalSquare.png/46px-Aatrox_OriginalSquare.png?54659" srcset="/en-us/images/thumb/Aatrox_OriginalSquare.png/92px-Aatrox_OriginalSquare.png?54659 2x" width="46"/></a>,
 <a href="/en-us/Ahri" title="Ahri"><img alt="Ahri" class="mw-file-element" data-file-height="128" data-file-width="128" decoding="async" height="46" loading="lazy" src="/en-us/images/thumb/Ahri_OriginalSquare.png/46px-Ahri_OriginalSquare.png?62007" srcset="/en-us/images/thumb/Ahri_OriginalSquare.png/92px-Ahri_OriginalSquare.png?62007 2x" width="46"/></a>,
 <a href="/en-us/Akali" title="Akali"><img alt="Akali" class="mw-file-element" data-file-height="128" data-file-width="128" decoding="async" height="46" loading="lazy" src="/en-us/images/thumb/Akali_OriginalSquare.png/46px-Akali_OriginalSquare.png?b1a30" srcset="/en-u