In [1]:
import os
import time
import typing
import requests
import pandas as pd
from pathlib import Path
from retry import retry
from datetime import datetime
from documentcloud import DocumentCloud
from documentcloud.exceptions import APIError

Create a list of dates, one per day, from the earliest PDF until today

In [29]:
# Date of the earliest PDF to look for
start_date = datetime(year=2015, month=9, day=1)

In [30]:
# Today's calendar date (local time), with the time-of-day dropped
end_date = datetime.now().date()

In [31]:
# One entry per day between start_date and end_date
date_list = pd.date_range(start=start_date, end=end_date).to_list()

In [14]:
def format_pdf_url(dt):
    """Build the PDF URL on our source site for the provided datetime.

    The URL places the file under a four-digit year folder and a two-digit
    month folder, with a file name of the form MMDDYY.pdf.
    """
    year_dir = dt.strftime("%Y")
    month_dir = dt.strftime("%m")
    file_name = dt.strftime("%m%d%y") + ".pdf"
    return "/".join(
        ["https://dps.usc.edu/wp-content/uploads", year_dir, month_dir, file_name]
    )

In [25]:
def download_url(url: str, output_path: Path, timeout: int = 180):
    """Download the provided URL to the provided path.

    Args:
        url: Address to fetch with a streaming GET request.
        output_path: Where the response body is saved.
        timeout: Timeout value passed through to ``requests.get``.

    Returns:
        None. Error responses (HTTP status 400 and above) are reported with a
        printed ``"<status>: <url>"`` line and nothing is written.

    Raises:
        Whatever ``requests`` or the filesystem raises while fetching or
        writing. Any partially written file is removed before re-raising.
    """
    output_path = Path(output_path)
    # Stream into a sibling ".part" file so an interrupted download never
    # leaves a truncated file at output_path, where a later existence check
    # would mistake it for a finished download.
    partial_path = output_path.with_name(output_path.name + ".part")
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Any error status (not only 404) would otherwise have its error page
        # saved as if it were the requested file.
        if r.status_code >= 400:
            print(f"{r.status_code}: {url}")
            return
        try:
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            partial_path.unlink(missing_ok=True)
            raise
    # Only a fully written file is moved into place.
    partial_path.replace(output_path)

In [2]:
# Directory (relative to the working directory) that holds the PDFs
output_dir = Path("./input")

In [3]:
# Make sure the directory exists; do nothing if it is already there
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Fetch the PDF for every date that has not been saved to output_dir yet
for dt in date_list:
    pdf_name = dt.strftime("%m%d%y") + ".pdf"
    pdf_path = output_dir / pdf_name

    # Already on disk from an earlier run: nothing to fetch
    if pdf_path.exists():
        print(f"{pdf_name} exists. Skipping.")
        continue

    print(f"Downloading {pdf_name}")
    pdf_url = format_pdf_url(dt)
    download_url(pdf_url, pdf_path)
    # Short pause between requests
    time.sleep(0.25)

In [14]:
def upload_pdf(pdf_name: str, client=None) -> tuple[typing.Optional[str], bool]:
    """Upload the provided object's PDF to DocumentCloud.

    Args:
        pdf_name: File name of the PDF inside ``output_dir``.
        client: Optional DocumentCloud client to reuse. When omitted, a new
            client is created from the ``DOCUMENTCLOUD_USER`` and
            ``DOCUMENTCLOUD_PASSWORD`` environment variables.

    Returns:
        A tuple with the URL to the document and a boolean that indicates if
        it was uploaded (True) or if it already existed (False). If the upload
        raises an ``APIError``, the error is printed and ``(None, False)`` is
        returned.

    Raises:
        AssertionError: If the PDF is not present in ``output_dir``.
    """
    # Get PDF path
    pdf_path = output_dir / pdf_name
    assert pdf_path.exists(), f"{pdf_path} does not exist"

    # Connect to DocumentCloud, unless the caller supplied a client to reuse
    if client is None:
        client = DocumentCloud(
            os.getenv("DOCUMENTCLOUD_USER"), os.getenv("DOCUMENTCLOUD_PASSWORD")
        )

    # Search to see if it's already up there
    query = f"+project:usc-department-of-public--210827 AND data_uid:{pdf_name}"
    search = client.documents.search(query)

    # If it is, we're done. Only the first hit is needed, so avoid
    # materializing the whole result set just to test for emptiness.
    existing = next(iter(search), None)
    if existing is not None:
        return existing.canonical_url, False

    # If it isn't, upload it now
    print(f"☁️ Uploading {pdf_path}")
    try:
        document = client.documents.upload(
            pdf_path,
            title=pdf_name.replace(".pdf", ""),
            project="210827",
            access="public",
            data={"uid": pdf_name},
        )
    except APIError as e:
        # Keep the best-effort behavior, but make the failure visible
        print(f"Upload failed for {pdf_path}: {e}")
        return None, False
    return document.canonical_url, True

In [15]:
# Try the upload on a single file; the returned tuple is shown as the cell output
upload_pdf(pdf_name="120222.pdf")

('https://www.documentcloud.org/documents/23346558-120222pdf', False)

In [None]:
# Send every PDF found in output_dir through upload_pdf, pausing briefly between calls
for p in output_dir.glob("*.pdf"):
    upload_pdf(p.stem + ".pdf")
    time.sleep(0.25)

☁️ Uploading input/061818.pdf
☁️ Uploading input/020921.pdf
☁️ Uploading input/041917.pdf
☁️ Uploading input/051321.pdf
☁️ Uploading input/031621.pdf
☁️ Uploading input/052021.pdf
☁️ Uploading input/042321.pdf
☁️ Uploading input/042618.pdf
☁️ Uploading input/022321.pdf
☁️ Uploading input/022520.pdf
☁️ Uploading input/082621.pdf
☁️ Uploading input/080720.pdf
☁️ Uploading input/072618.pdf
☁️ Uploading input/090216.pdf
☁️ Uploading input/070218.pdf
☁️ Uploading input/081916.pdf
☁️ Uploading input/082917.pdf
☁️ Uploading input/092419.pdf
☁️ Uploading input/051216.pdf
☁️ Uploading input/101118.pdf
☁️ Uploading input/072220.pdf
☁️ Uploading input/042716.pdf
☁️ Uploading input/083018.pdf
☁️ Uploading input/080321.pdf
☁️ Uploading input/091015.pdf
☁️ Uploading input/012821.pdf
☁️ Uploading input/090418.pdf
☁️ Uploading input/092216.pdf
☁️ Uploading input/080117.pdf
☁️ Uploading input/050319.pdf
☁️ Uploading input/112519.pdf
☁️ Uploading input/050916.pdf
☁️ Uploading input/081116.pdf
☁️ Uploadi