In [None]:
print("Installing and importing packages...")

# Uninstall and reinstall packages for a clean environment
!pip uninstall -q -y jhutils
!pip uninstall -q -y bs4
!pip install -q --disable-pip-version-check git+https://github.com/jdchart/jh-py-utils.git
!pip install -q --disable-pip-version-check beautifulsoup4

# Import packages
from jhutils.local_files import read_json, write_json, get_audio_info, get_video_info, get_image_info
import jhutils.online_files
from jhutils.misc import print_progress_bar
import os
import shutil
import requests
from bs4 import BeautifulSoup

print("👍 Ready!")

# Library of Congress Collection

In [21]:
COLLECTION_URL = "https://www.loc.gov/collections/fsa-owi-color-photographs"
TEMP_FOLDER = os.path.join(os.getcwd(), "_TEMP")
OUTPUT_PATH = os.path.join(os.getcwd(), "fsa-owi-color-photographs.json")

print(f"Processing {COLLECTION_URL}")

# Get urls
def process_page(url, page_index):
    response = requests.get(f"{url}/?fa=access-restricted:false&&st=list&c=150&sp={page_index}")
    if response.status_code != 200:
        return None
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        element = soup.find(class_ = "search-results")
        if not element:
            return None
        else:
            images = element.find_all('img', class_ = "iconic")
            img_srcs = [img['src'] for img in images if 'src' in img.attrs]
            if not img_srcs:
                return None
            return img_srcs

image_list = []
found_last_page = False
count = 1
while found_last_page == False:
    rep = process_page(COLLECTION_URL, count)
    if rep == None:
        found_last_page = True
        break
    else:
        image_list.extend(rep)
        count = count + 1

print(f"Found {len(image_list)} images...")

if os.path.isdir(TEMP_FOLDER) == False:
    os.makedirs(TEMP_FOLDER)

output_data = {"images" : []}

for i, item in enumerate(image_list):
    image_url_thumb = item.split("#")[0]
    image_url = image_url_thumb.replace("_150px", "v")

    if image_url != "https://tile.loc.gov/storage-services/service/pnp/fsac/1a33000/1a33800/1a33881v.jpg":

        print_progress_bar(i, len(image_list) - 1, f"Treating {os.path.basename(image_url)}")

        to_add = {"url" : image_url}
        jhutils.online_files.download(image_url, dir = TEMP_FOLDER)
        img_info = get_image_info(os.path.join(TEMP_FOLDER, os.path.basename(image_url)))
        to_add["data"] = img_info

        jhutils.online_files.download(image_url_thumb, dir = TEMP_FOLDER)
        thumb_info = get_image_info(os.path.join(TEMP_FOLDER, os.path.basename(image_url_thumb)))
        to_add["thumbnail"] = {"url" : image_url_thumb, "data" : thumb_info}

        output_data["images"].append(to_add)
        write_json(OUTPUT_PATH, output_data)

shutil.rmtree(TEMP_FOLDER)


Processing https://www.loc.gov/collections/fsa-owi-color-photographs
Found 1623 images...
|█████---------------------------------------------| 10.2% Complete Treating 1a34189v.jpg

ChunkedEncodingError: ('Connection broken: IncompleteRead(7847 bytes read, 364578 more expected)', IncompleteRead(7847 bytes read, 364578 more expected))