<a href="https://colab.research.google.com/github/jorisschellekens/borb-dev/blob/master/snippet_11_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install borb

Collecting borb
  Downloading borb-3.0.5-py3-none-any.whl.metadata (3.6 kB)
Downloading borb-3.0.5-py3-none-any.whl (3.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 32.3 MB/s eta 0:00:00
Installing collected packages: borb
Successfully installed borb-3.0.5


In [7]:
# snippet_11_07.ipynb
from borb.pdf import Document
from borb.pdf import GetKeywordsByPagewiseTFIDF
from borb.pdf import GetText
from borb.pdf import PDF
from borb.pdf import Pipeline
from borb.pdf import Source

import pathlib
import requests


def download_pdf(url: str, filename: pathlib.Path, *, timeout: float = 30.0) -> None:
    """
    Downloads a PDF file from the given URL and saves it to the specified filename.

    This function is best-effort: failures are reported on standard output and are
    NOT propagated to the caller. After a failure the target file may be missing
    or incomplete, so callers that need certainty should check the file afterwards.

    :param url: The URL of the PDF file to download.
    :param filename: The local file path where the downloaded PDF should be saved.
    :param timeout: Number of seconds to wait for the server before giving up.
                    Without a timeout a stalled server would block the call indefinitely.
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when writing the file fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes
            with open(filename, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
    except requests.exceptions.RequestException as e:
        # Covers network failures, timeouts, invalid URLs and bad HTTP statuses
        print(f"Error downloading PDF: {e}")
    except Exception as e:
        # Anything else, e.g. the local file could not be opened or written
        print(f"An unexpected error occurred: {e}")


# Fetch the sample PDF and store it next to the notebook
download_pdf(
    url="https://github.com/borb-pdf/borb-pdf-corpus/raw/refs/heads/master/pdf/0080.pdf",
    filename=pathlib.Path("input.pdf"),
)

# Load the downloaded file into a Document
d: Document = PDF.read("input.pdf")

# Assemble the keyword-extraction pipeline, then run it on the document
keyword_pipeline = Pipeline([Source(), GetKeywordsByPagewiseTFIDF()])
output = keyword_pipeline.process(d)

# Show the extracted keywords
print(output)



{'stairway': 4.852030263919617, 'fireresisting': 4.1588830833596715, 'window': 4.1588830833596715, 'houses': 3.4657359027997265, 'resistance': 2.772588722239781, 'separate': 2.772588722239781, 'systems': 2.0794415416798357, 'alarm': 2.0794415416798357, 'doors': 2.0794415416798357, 'rooflight': 2.0794415416798357}
