In [5]:
import requests
from bs4 import BeautifulSoup as soup
import os
import concurrent.futures
import itertools
from tqdm import tqdm
from typing import ItemsView

# base_url = 'https://digital.slub-dresden.de/oai/'
dest_path = '/media/wendler/SAMSUNG1TB/bbl-images'

In [2]:
jahr_url = 'https://digital.slub-dresden.de/oai?verb=GetRecord&metadataPrefix=mets&identifier=oai:de:slub-dresden:db:id-39946221X'
response = requests.get(jahr_url)
output_soup = soup(response.content, "lxml")
# dict comprehension mit jahreszahl als key und url der jahresausgabe als value
jahre = {element['orderlabel']: element.find('mets:mptr')['xlink:href'] for element in output_soup.find_all('mets:div', {'type':'year'})}

In [3]:
hefte = dict()
for jahr in jahre:
    response = requests.get(jahre[jahr])
    output_soup = soup(response.content, "lxml")
    hefte.update({element['orderlabel']: element.find('mets:mptr')['xlink:href'] for element in output_soup.find_all('mets:div', {'type':'day'})})


In [7]:
def download_heft(hefte: ItemsView[str, str]) -> str:
    heft = hefte[0]
    url = hefte[1]
    try:
        # xml datei laden
        response = requests.get(url)
        output_soup = soup(response.content, "lxml")

        # id suchen <slub:id type="digital">39946221X-1834010101</slub:id>
        id = output_soup.find('slub:id', {'type':'digital'}).text.split('-')[-1]

        # zielverzeichnis der bilder erstellen
        try:
            os.mkdir(f"{dest_path}/{id}")
        except FileExistsError:
            pass

        # xml datei speichern
        with open(f'{dest_path}/xml/{id}.xml', 'w') as f:
            f.write(output_soup.prettify())
        
        dateien = output_soup.find('mets:filegrp', {'use':'ORIGINAL'}).find_all('mets:flocat')
        
        # heftumfang schreiben
        with open(f"{dest_path}/heftumfang.csv", 'a') as f:
                f.write(f"{id},{heft},{len(dateien)}\n")
        
        # bild-urls schreiben
        with open(f"{dest_path}/image_urls.csv", 'a') as f:
                
            for image in dateien:
                
                f.write(f"{id},{heft},{image['xlink:href']}\n")
                image_r = requests.get(image['xlink:href'])
                with open(f"{dest_path}/{id}/{image['xlink:href'].split('/')[-1]}", 'wb') as f2:
                    f2.write(image_r.content)
        
        return f"{id},{heft},completed\n"
    except Exception as e:
        return str(e)

In [11]:
%%time
# We can use a with statement to ensure threads are cleaned up promptly
test_hefte = dict(itertools.islice(hefte.items(), 10))
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = tqdm(executor.map(download_heft, test_hefte.items()))
    with open("log.log", "w") as log:
        for result in results:
            try:
                log.write(result)
            except Exception as exc:
                print(exc)

10it [00:08,  1.19it/s]

CPU times: user 2.74 s, sys: 1.06 s, total: 3.8 s
Wall time: 8.44 s





In [57]:
%%time
test_url = "https://digital.slub-dresden.de/data/kitodo/Brsfded_39946221X-18340110/Brsfded_39946221X-18340110_mets.xml"
download_heft('1834-01-10', test_url)

1834-01-10 existiert bereits
CPU times: user 190 ms, sys: 51.7 ms, total: 242 ms
Wall time: 1.89 s
