# Download PDFs

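This notebook downloads the PDF file referenced by each record in the metadata files (`../metadata/article-en-*.jsonl`) and stores it under `../pdfs/`, so that subsequent processing steps can make use of its content. PDFs that have already been downloaded are skipped.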

In [2]:
import os.path
import glob
import json

import requests

# One JSON Lines metadata file per data split; every record points at a PDF.
metadata_pattern = "../metadata/article-en-*.jsonl"
metadata_files = glob.glob(metadata_pattern)

def id_to_fn(identifier):
    """Map a URI identifier onto the path of its PDF file under ``../pdfs/``.

    The ``https://`` prefix is dropped and every ``/`` is turned into ``_``
    so that the whole identifier fits into a single file name.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../pdfs/{flat_name}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF unless it is already on disk, and return its path.

    Args:
        file_url: URL the PDF is fetched from.
        identifier: URI identifier of the record; ``id_to_fn`` converts it
            into the target path.
        timeout: seconds ``requests`` waits for the server (connect / read)
            before giving up, so a stalled server cannot hang the notebook.

    Returns:
        Path of the PDF file on disk.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    path = id_to_fn(identifier)
    # A non-empty file is taken to be a finished earlier download.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    response = requests.get(file_url, timeout=timeout)
    # Without this check an HTML error page would be saved as the "PDF" and
    # then be treated as a valid cached download on every later run.
    response.raise_for_status()

    # Create the target directory on first use instead of failing in open().
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Write under a temporary name and rename afterwards, so an interrupted
    # write never leaves a truncated file that passes the size check above.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

# Walk through every metadata file (in a stable order) and fetch the PDF of
# each record; one record is one JSON object per line.
for mdfile in sorted(metadata_files):
    print(f"downloading PDFs in {mdfile}")
    # Read as UTF-8 explicitly: the records contain non-ASCII characters
    # (e.g. "Järvinen") and the platform default encoding may differ.
    with open(mdfile, encoding="utf-8") as infile:
        i = 0
        failed = 0
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            rec = json.loads(line)
            try:
                pdf_path = download(rec["url"], rec["id"])
            except requests.RequestException as err:
                # One unreachable PDF should not abort the whole batch;
                # report it and carry on with the next record.
                print(f"failed to download {rec['url']}: {err}")
                failed += 1
                continue
            print(pdf_path)
            i += 1
        print(f"downloaded {i} PDFs")
        if failed:
            print(f"failed to download {failed} PDFs")

downloading PDFs in ../metadata/article-en-test.jsonl
wrote https://www.doria.fi/bitstream/handle/10024/182782/107883-Article Text-213924-1-10-20211120 Ostling.pdf as ../pdfs/www.doria.fi_handle_10024_182782.pdf
../pdfs/www.doria.fi_handle_10024_182782.pdf
wrote https://taju.uniarts.fi/bitstream/handle/10024/6005/Järvinen_Great_Horizons_AM.pdf as ../pdfs/taju.uniarts.fi_handle_10024_6005.pdf
../pdfs/taju.uniarts.fi_handle_10024_6005.pdf
wrote https://osuva.uwasa.fi/bitstream/handle/10024/11652/Osuva_Tabatabaei_Setayesh Nazar_Shafie-khah_Catalão_2020.pdf as ../pdfs/osuva.uwasa.fi_handle_10024_11652.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11652.pdf
wrote https://osuva.uwasa.fi/bitstream/handle/10024/11231/Alander_ym_2013.pdf as ../pdfs/osuva.uwasa.fi_handle_10024_11231.pdf
../pdfs/osuva.uwasa.fi_handle_10024_11231.pdf
wrote https://www.theseus.fi/bitstream/handle/10024/780577/Trustworthy_AI_Covid-19.pdf as ../pdfs/www.theseus.fi_handle_10024_780577.pdf
../pdfs/www.theseus.fi_handle_10024