Getting the data: there is an API, but it limits queries to 10,000 results. In any case, it is straightforward to work with a snapshot instead, so we can use some lightly adapted code from https://developers.wellcomecollection.org/docs/examples/working-with-snapshots-of-the-api to acquire one.

In [None]:
from pathlib import Path
import requests
from tqdm.auto import tqdm
import gzip
import os
import io

# Download the gzipped works snapshot into ./data (unless it is already there)
# and unpack it next to the archive. Later cells can rely on `zipped_path` and
# `unzipped_path` pointing at complete files once this cell has finished.
snapshot_url = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz"

data_dir = Path("./data").resolve()
data_dir.mkdir(exist_ok=True)

file_name = Path(snapshot_url).parts[-1]
zipped_path = data_dir / file_name
unzipped_path = zipped_path.with_suffix("")

# Each stage writes to a ".part" file and renames it only once it is complete,
# so an interrupted run never leaves a truncated file that the `.exists()`
# checks below would mistake for a finished one.
partial_zipped_path = zipped_path.with_name(zipped_path.name + ".part")
partial_unzipped_path = unzipped_path.with_name(unzipped_path.name + ".part")

# read and write in chunks of 1MB
chunk_size = 1024 * 1024

# check whether the file already exists before doing any work
if not unzipped_path.exists():
  if not zipped_path.exists():

    # make a request to the snapshot URL and stream the response; the `with`
    # block releases the connection even if the download fails part-way, and
    # the timeout stops a stalled connection from hanging the cell forever
    with requests.get(snapshot_url, stream=True, timeout=60) as r:
      # fail here on an HTTP error instead of saving the error page as a .gz
      r.raise_for_status()

      # use the length of the response to create a progress bar for the
      # download; if the server does not send one, tqdm shows a plain counter
      content_length = r.headers.get("Content-Length")

      # write the streamed response to our file chunk by chunk
      with open(partial_zipped_path, "wb") as f, tqdm(
        unit="bytes",
        total=int(content_length) if content_length is not None else None,
        desc=f"Downloading {file_name}",
      ) as download_progress_bar:
        for chunk in r.iter_content(chunk_size=chunk_size):
          if chunk:
            f.write(chunk)
            download_progress_bar.update(len(chunk))

    # the download finished without error, so publish it under its real name
    partial_zipped_path.replace(zipped_path)

  # open the zipped file, and the unzipped file
  with gzip.open(zipped_path, "rb") as f_in:
    # measure the unzipped length of the zipped file using `.seek()`; seeking in
    # a gzip stream is emulated by decompressing it, so this reads the whole
    # archive once just to size the progress bar
    unzipped_length = f_in.seek(0, io.SEEK_END)

    # we used `.seek()` to move the cursor to the end of the file, so we need to
    # move it back to the start before we start reading
    f_in.seek(0)

    with open(partial_unzipped_path, "wb") as f_out, tqdm(
      unit="bytes",
      total=unzipped_length,
      desc=f"unzipping {file_name}",
    ) as unzip_progress_bar:
      for chunk in iter(lambda: f_in.read(chunk_size), b""):
        f_out.write(chunk)
        unzip_progress_bar.update(len(chunk))

  # the archive was fully unpacked, so publish it under its real name
  partial_unzipped_path.replace(unzipped_path)