In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass
import re
import os

## Parse all hyperlinks

In [13]:
# Index page of the Zoo Prague animal lexicon.
url = "https://www.zoopraha.cz/zvirata-a-expozice/lexikon-zvirat"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as the index.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The links are inside the "para" divs of the element with id "accordionAbeceda".
accordion = soup.find(id="accordionAbeceda")
if accordion is None:
    raise ValueError(f'element with id "accordionAbeceda" not found in {url}')
animals_raw = accordion.find_all("div", class_="para")

# href=True skips anchors without an href attribute (link["href"] would raise
# KeyError on them); urljoin resolves relative hrefs against the index URL.
animals_url = [
    urlparse(urljoin(url, link["href"]))
    for animal in animals_raw
    for link in animal.find_all('a', href=True)
]

len(animals_url)

800

## Download all pages

In [7]:
# Download every HTML page of the Zoo Prague animal lexicon in case it gets removed.
# A copy of these pages was uploaded to Google Drive.

def get_page_from_url(url, timeout=30):
    """Download one lexicon page and store it as ``pages/<name>.html``.

    ``<name>`` is the value of the ``d`` query parameter of ``url``
    (``3-adax`` for ``...?d=3-adax&start=3``).

    Args:
        url: Parsed URL, e.g. a ``urllib.parse.ParseResult``; its ``query``
            attribute and ``geturl()`` method are used.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        ValueError: If ``url`` has no non-empty ``d`` query parameter, so no
            file name can be derived.
        requests.HTTPError: If the server answers with an error status.
    """
    name = ""
    for query in url.query.split("&"):
        # Match the parameter name exactly: a substring test for "d=" would
        # also accept parameters such as "id=...".
        if query.startswith("d="):
            name = query[len("d="):]
            break

    # Checked before any network traffic so a bad URL fails fast instead of
    # producing a file called "pages/.html".
    if not name:
        raise ValueError(f'no "d" query parameter in {url.geturl()}')

    page = requests.get(url.geturl(), timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    os.makedirs("pages", exist_ok=True)
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # the non-ASCII characters in the pages.
    with open(f'pages/{name}.html', "w", encoding="utf-8") as f:
        f.write(soup.prettify())
        
# Downloading is opt-in: set DOWNLOAD_PAGES to True to (re)fetch every page in
# animals_url into pages/. With False the loop only lists the URLs.
DOWNLOAD_PAGES = False

for index, animal_url in enumerate(animals_url):
    if DOWNLOAD_PAGES:
        get_page_from_url(animal_url)
    print(f'{index}. Done {animal_url}')

0. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=3-adax&start=3', fragment='')
1. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=99-agama-dodomska&start=99', fragment='')
2. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=510-agama-pobrezni&start=510', fragment='')
3. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=27-agama-stepni&start=27', fragment='')
4. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=28-agama-turkestanska&start=28', fragment='')
5. Done ParseResult(scheme='https', netloc='www.zoopraha.cz', path='/zvirata-a-expozice/lexikon-zvirat', params='', query='d=21-agama-vychodoafricka&start=21', fragment='')


## Parse animal data

In [None]:
@dataclass
class Animal:
    """One animal record extracted from a lexicon page.

    Every field has a placeholder default, so ``Animal()`` yields an empty
    record whose fields can be filled in afterwards; keyword construction such
    as ``Animal(name="Adax")`` works as well.
    """
    # Numeric identifier; -1 is the "not set" placeholder.
    id: int = -1
    # Name without the parenthesised Latin part; "–" is the "not set" placeholder.
    name: str = "–"
    # Latin name (the text found in parentheses in the page heading).
    name_latin: str = "–"
    # Description text.
    description: str = "–"
    # URL of the animal's image.
    img_href: str = "–"
        
# Captures the text inside the first pair of parentheses, e.g. the Latin name
# in "Name (Latin name)". Written as a raw string because "\(" in a normal
# literal is an invalid escape sequence and triggers a warning.
pattern = r"\((.*?)\)"
# Base against which image links are resolved in parse_animal_data.
url_base = "https://www.zoopraha.cz/"
# Starts empty here; tmp_change_to_local() appends the local page entries.
animals_url = list()

# Change animals_url to local storage
def tmp_change_to_local(pages_dir="./pages", urls=None):
    """Collect the locally stored HTML pages as parsed URL objects.

    Walks ``pages_dir`` recursively and, for every ``*.html`` file, appends
    ``urlparse("<absolute path>?<file name without extension>")`` to ``urls``.
    Files with any other extension are ignored.

    Args:
        pages_dir: Directory to scan. Defaults to ``./pages``.
        urls: List that receives the entries. Defaults to the module-level
            ``animals_url`` list.
    """
    if urls is None:
        urls = animals_url

    for dirpath, _, filenames in os.walk(pages_dir):
        for filename in filenames:
            # splitext removes only the trailing extension and avoids
            # splitting the path on a hard-coded "/" separator.
            stem, extension = os.path.splitext(filename)
            if extension != ".html":
                continue
            full_path = os.path.abspath(os.path.join(dirpath, filename))
            urls.append(urlparse(f'{full_path}?{stem}'))

def parse_animal_data(url):
    """Build an ``Animal`` from one lexicon detail page.

    Args:
        url: Parsed URL (e.g. ``urllib.parse.ParseResult``). ``http``/``https``
            URLs are fetched over the network; anything else is treated as a
            local file located at ``url.path``, which is the form produced by
            ``tmp_change_to_local``.

    Returns:
        An ``Animal`` with ``name``, ``name_latin``, ``description`` and
        ``img_href`` filled in where the page provides them; fields that cannot
        be found keep their defaults. ``id`` is not set by this function.

    Raises:
        ValueError: If the page has no ``maincontent`` element or no heading.
        requests.HTTPError: If a remote page answers with an error status.
    """
    if url.scheme in ("http", "https"):
        response = requests.get(url.geturl(), timeout=30)
        response.raise_for_status()
        content = response.content
    else:
        # requests cannot open scheme-less local paths, so read the file directly.
        with open(url.path, "rb") as f:
            content = f.read()

    soup = BeautifulSoup(content, 'html.parser')
    animal_data = soup.find(id="maincontent")
    if animal_data is None:
        raise ValueError(f'no "maincontent" element in {url.geturl()}')

    animal = Animal()

    # Name: the heading has the form "Name (Latin name)".
    title_box = animal_data.find("div", "mainboxtitle")
    heading = title_box.find("h2") if title_box is not None else None
    if heading is None:
        raise ValueError(f'no heading found in {url.geturl()}')
    parsed_name = heading.text
    latin_match = re.search(pattern, parsed_name)
    if latin_match is None:
        # No parenthesised part: keep the default Latin name.
        animal.name = parsed_name.strip()
    else:
        animal.name_latin = latin_match.group(1)
        animal.name = parsed_name.replace(f'({animal.name_latin})', "").strip()

    # Description and image both come from the first "para" div, when present.
    paras = animal_data.find_all("div", "para")
    if paras:
        description = paras[0].find("strong")
        if description is not None:
            animal.description = description.text.strip()
        thumbnail = paras[0].find("a", "thumbnail")
        if thumbnail is not None and thumbnail.get("href"):
            animal.img_href = urljoin(url_base, thumbnail["href"])

    print(animal)

    return animal

tmp_change_to_local()
#parse_animal_data(animals_url[0])
# Preview a few entries only; displaying the whole list floods the cell output.
animals_url[:5]