# Eastern Oklahoma Collections

To start, we create a list of Library of Congress (loc.gov) search URLs to process.

In [2]:
# loc.gov search queries to harvest; each one filters on a single facet (`fa=`).
urls = [
    # location facet: muskogee (the colon is percent-encoded as %3A in this one)
    "https://www.loc.gov/search/?fa=location%3Amuskogee",
    # location facet: muskogee county
    "https://www.loc.gov/search/?fa=location:muskogee+county",
    # location facet: tahlequah
    "https://www.loc.gov/search/?fa=location:tahlequah",
    # contributor facet: cherokee nation, oklahoma
    "https://www.loc.gov/search/?fa=contributor:cherokee+nation,+oklahoma",
]

Next, let's download the results for every search URL and remove duplicates (a result is a duplicate when its `id` has already been seen).

In [3]:
import locgov_data as ld

# Page size handed to express_search for every query.
per_page = "50"
ids = []          # ids of the unique results, in first-seen order
all_results = []  # unique results across all searches, in first-seen order
# Same ids as `ids`, held in a set so the duplicate check is O(1) instead of
# a linear scan of the list for every result.
# Assumes result ids are hashable (e.g. strings) -- TODO confirm.
seen_ids = set()
for url in urls:
    results = ld.express_search(url, per_page)
    for result in results:
        result_id = result['id']
        # The same item can come back from more than one search;
        # keep only its first occurrence.
        if result_id in seen_ids:
            continue
        seen_ids.add(result_id)
        ids.append(result_id)
        all_results.append(result)
    # Report the raw (pre-deduplication) count for this search.
    print(f"Retrieved {len(results):,} from {url}")

print(f"Retrieved {len(all_results):,} unique results")

Retrieved 593 from https://www.loc.gov/search/?fa=location%3Amuskogee
Retrieved 702 from https://www.loc.gov/search/?fa=location:muskogee+county
Retrieved 35 from https://www.loc.gov/search/?fa=location:tahlequah
Retrieved 22 from https://www.loc.gov/search/?fa=contributor:cherokee+nation,+oklahoma
Retrieved 879 unique results


Next, let's do some light clean-up of metadata to produce a JSON data file for the UI.

In [15]:
import json
from urllib.parse import parse_qs

json_data = []
destination_file = "../ui/source/_data/eastern_ok.json"
collections_values = []


def _image_dimensions(image_url):
    """Return "<w>-<h>" read from the fragment of ``image_url``, or "".

    The part after the first "#" is parsed as a query string; when it holds
    both a ``w`` and an ``h`` value, their first values are joined with "-".
    An empty string is returned when there is no "#" or either value is
    missing.
    """
    if "#" not in image_url:
        return ""
    # Only the fragment is needed. Indexing the split (instead of unpacking
    # into `url, query_string`) avoids overwriting the notebook-level `url`
    # variable with a value that is never used.
    fragment = image_url.split("#", 1)[1]
    dims = parse_qs(fragment)
    if "w" in dims and "h" in dims:
        return f"{dims['w'][0]}-{dims['h'][0]}"
    return ""


for result in all_results:
    # Keep only results that carry at least two image URLs.
    # NOTE(review): the original comment read "skip items with no images",
    # but this check also drops results with exactly one image URL --
    # confirm that is intended.
    if not ("image_url" in result and len(result["image_url"]) > 1):
        continue
    item = {}
    item["title"] = result["title"]
    item["url"] = result["url"]
    # Optional fields are copied only when present and non-empty.
    if "partof" in result and len(result["partof"]) > 0:
        item["collections"] = result["partof"]
    if "contributor" in result and len(result["contributor"]) > 0:
        # Only the first listed contributor is kept.
        item["contributor"] = result["contributor"][0]
    # First image URL is used as the thumbnail, last one as the main image.
    item["thumb"] = result["image_url"][0]
    item["image"] = result["image_url"][-1]
    item["image_dims"] = _image_dimensions(item["image"])
    # Gather every collection name of the kept items for later counting.
    if "collections" in item:
        collections_values.extend(item["collections"])
    json_data.append(item)

with open(destination_file, "w", encoding="utf8") as f:
    json.dump(json_data, f, indent=4)
print(f"Wrote {len(json_data):,} items to {destination_file}")

Wrote 870 items to ../ui/source/_data/eastern_ok.json


Finally, display a table of the collections and the number of items in each, sorted from most to fewest items.

In [17]:
import collections
import pandas as pd
# Raise the column display width to 400 characters for the table below.
pd.set_option('max_colwidth', 400)
# Tally how often each collection name occurs in collections_values.
counter = collections.Counter(collections_values)
# most_common() yields (name, count) pairs ordered by count, highest first;
# equal counts stay in first-seen order.
counts = counter.most_common()
pd.DataFrame(counts, columns=['Name', 'Count'])

Unnamed: 0,Name,Count
0,prints and photographs division,505
1,catalog,498
2,american memory,490
3,prints & photographs online catalog (library of congress),461
4,farm security administration/office of war information black-and-white negatives,415
5,chronicling america,298
6,"the muskogee cimeter (muskogee, indian territory, okla.) 1901-19??",298
7,serial and government publications division,298
8,lot 523,84
9,lot 526,66
