In [None]:
!pip install matplotlib
!pip install seaborn


import requests
from tqdm.auto import tqdm

RESTARTING... LET'S TRY USING THE API
Statistical methods like topic modelling probably should really be used with large volumes of data.
To keep this example reasonably small (and therefore fast) we'll try to work with a smallish set of books that is large enough to work reasonably well.
To begin with, we'll use the catalogue API to search for "typhoid".

(see https://developers.wellcomecollection.org/docs/examples for much more about working with the API)

In [None]:
import requests

catalogue_base_url = 'https://api.wellcomecollection.org/catalogue/v2/'

# Ask the catalogue for the first page of works matching "typhoid", including
# the identifiers, subjects and production details of each work.
response = requests.get(
    catalogue_base_url + 'works',
    params={
        'include': 'identifiers,subjects,production',
        'pageSize': 100,
        'query': 'typhoid',
    },
)

# Stop with a descriptive error if the request failed, rather than carrying on
# and trying to read an error response as if it were search results.
if response.status_code != 200:
    raise RuntimeError(
        f'catalogue request failed with HTTP status {response.status_code}: {response.url}'
    )
response_data = response.json()

# Print everything in the response except the works themselves.
for k, v in response_data.items():
    if k == 'results': continue #there will be loads of this
    print(f'{k}: {v}')


When I ran this code, I got 1099 `totalResults`. Your results may differ, depending upon how Wellcome's collection has changed in the meantime. Anyway, this feels like a nice number of texts to start working with. Let's learn some more about them. We'll start by downloading the catalogue data for all of the pages of results.

In [None]:
from tqdm.auto import tqdm

#let's have a progress bar; the "with" block closes it once downloading ends
with tqdm(
  unit = 'pages',
  total = response_data['totalPages'],
  desc = 'downloading catalogue data',
) as catalogue_bar:

  #We already got the first page of results in the previous cell.
  #Copy the list, so that extending "works" leaves response_data untouched
  #and this cell gives the same answer if it is run a second time.
  works = list(response_data['results'])
  catalogue_bar.update(1)

  #Now we'll add all of the other pages of results to the list "works".
  #page_data follows the page we are on; response_data stays as the first page.
  page_data = response_data
  while 'nextPage' in page_data:
    page_response = requests.get(page_data['nextPage'])
    #Stop with a descriptive error instead of parsing a failed response
    if page_response.status_code != 200:
      raise RuntimeError(
        f'catalogue request failed with HTTP status {page_response.status_code}: {page_response.url}'
      )
    page_data = page_response.json()
    works.extend(page_data['results'])
    catalogue_bar.update(1)


Now that we have all of the catalogue data for our "typhoid" works, let's get a sense of what this covers. We'll just look at the contents of the first record.

In [None]:
# IPython's JSON display class, imported under the name json_display
from IPython.display import JSON as json_display
# Display the first catalogue record; expanded = True is passed on to the JSON display
json_display(works[0], expanded = True)

This is quite a lot of data! We are interested in text about typhoid, so let's focus on the type of work that this is (is it something written, or something else, like a drawing or a photograph?) and the subject matter. We can use JSONPath to look this up.

We'll start with the "type" of the work. The last entry in the above JSON is workType. The label and type look relevant. Let's examine the values that these can take across the whole collection.

.... might want to add a cell about filtering out non-Wellcome items

In [None]:
from jsonpath_ng.ext import parse as json_parse
from collections import Counter

def count(query, data_list):
  """Tally the values that a JSONPath query finds across a list of records.

  query     -- a JSONPath expression (string), compiled once with json_parse
  data_list -- an iterable of parsed JSON records to search

  Returns a tuple (empty, counter):
    empty   -- the number of records in which the query matched nothing
    counter -- a Counter mapping each matched value to the number of times it
               was found (one record can contribute several matches)

  Hashable values (strings, numbers, ...) are used directly as Counter keys.
  Unhashable values (dicts and lists) cannot be keys, so they are counted
  under their JSON text with sorted keys, so that equal structures share a key.
  """
  import json #only needed for the unhashable fallback below

  empty = 0
  counter = Counter()
  searcher = json_parse(query)
  for datum in data_list:
    results = searcher.find(datum)
    if not results:
      empty += 1
      continue
    for result in results: #we should have a list of DatumInContext
      value = result.value
      try:
        hash(value)
      except TypeError:
        #a dict or list: swap in a canonical string so that it can be counted
        value = json.dumps(value, sort_keys = True, default = str)
      counter[value] += 1
  return empty, counter

def dumpCount(query, data_list, min_proportion = 0):
  """Print a frequency table of the values that a JSONPath query finds.

  query          -- JSONPath expression, handed on to count()
  data_list      -- the records to search; its length is the denominator of
                    every fraction and percentage that is printed
  min_proportion -- a value whose tally divided by len(data_list) falls below
                    this is not listed, only included in a summary line

  Nothing is returned. Values are printed most common first, followed by a
  summary of hidden values (if any) and a line for the records in which the
  query found nothing (if any).
  """
  emptyCount, counter = count(query, data_list)
  total = len(data_list)
  below_min = 0
  for k, v in counter.most_common():
    if not (v/total >= min_proportion):
      #too rare to list individually
      below_min += 1
      continue
    print(f'{v:4}/{total} ({100 * v/total:3.0f}%) {k}')
  if below_min:
    print(f'{below_min} results hidden as below minimum proportion of {min_proportion * 100:.0f}%')
  if emptyCount:
    print(f'{emptyCount:4}/{total} ({100 * emptyCount/total:3.0f}%) have no value')

#Report how the works break down by workType: first by "type", then by "label"
workType_reports = (
  ('workType types:', '$.workType.type'),
  ('workType labels:', '$.workType.label'),
)
for position, (heading, query) in enumerate(workType_reports):
  if position > 0:
    print() #blank line between the two reports
  print(heading)
  dumpCount(query, works)


We can see that there is a range of types of works in our results. At the time of writing, 3/4 of the works are books and several others are of types that could reasonably have text (e.g. "Archives and manuscripts", "Student dissertations", "E-books", "Manuscripts", "Journals").

Given that text is provided by an OCR pipeline, it is only printed texts that are likely to have online text available. So we filter down (for now) to just books, e-books and journals.

In [None]:
#Keep only the works whose workType label marks them as printed material:
#books, e-books and journals. This is done in plain Python rather than JSONPath
#because JSONPath filters pick out members of a list inside a document, whereas
#each work reaches JSONPath as a single JSON object
#(re e.g. https://stackoverflow.com/a/43737750)

printed_works = [
  work for work in works
  if work['workType']['label'] in ('Books', 'E-books', 'Journals')
]

Working out the catalogue subject of a work is more complicated. Works in Wellcome Collection are classified according to a range of schemes. If we look in the above JSON, we can also see that the structure is fairly complex, involving a mixture of "Subjects" and "Concepts". Rather than unpick all this, we'll just look at a part of the structure to get a sense of how things are classified. We'll stick with the printed works here.

In [None]:
#label of every member of the subjects array which has a type of Subject
subject_query = '$.subjects[?(@.type=="Subject")].label'
#label of every node at any depth beneath subjects which has a type of concept
concept_query = '$.subjects..*[?(@.type=="Concept")].label'

print('Subjects')
dumpCount(subject_query, printed_works, min_proportion = 0.02)

print()
print('Concepts')
dumpCount(concept_query, printed_works, min_proportion = 0.02)

[This text applied to running the above cell on all works. If I stick with the printed_works version then it will need updating.]

Straight away, we can see that both Subjects and Concepts are not available for about 1/5 of the collection (e.g. 204/1100 ( 19%) have no value).

We can also see that there are a lot of possible values here -- so many that I've written the code to hide all results applying to less than 2% of the works on typhoid.

We can also see that the phrase "typhoid fever" (with varying capitalization) covers 50% of the Subjects and 63% of the Concepts. This suggests that these specific values will get pretty good results in a search. What we cannot tell from this is how many of the works covered by other concepts are actually relevant.

[This could be a good place to introduce the difference between Wellcome and non-Wellcome works and to see what effect filtering down to just Wellcome has.]

One difficulty may be that Wellcome's catalogue includes texts not held by Wellcome. These have the potential to be classified differently.

So let's assume that we are interested in searching works actually held by Wellcome itself and limit down to them.

The way that was suggested to me to do this was to look for works held on either open shelves or in closed stores. This seems to make sense, although perhaps it needs a tweak for purely digital works such as E-books.

For the purposes of this notebook we won't worry about purely digital works, so let's filter down. For this purpose, we'll work with all format types again (not just printed works).

In [None]:
print('All availability ids:')
dumpCount('$.availabilities[*].id', works)
print()

#Each searcher matches only works that list the given availability id
open_searcher   = json_parse("$.availabilities[?(@.id=='open-shelves')].id")
closed_searcher = json_parse("$.availabilities[?(@.id=='closed-stores')].id")

#A work counts as held by Wellcome if either searcher finds a match in it
wellcome_works = [
  work for work in works
  if open_searcher.find(work) or closed_searcher.find(work)
]
print(f'{len(wellcome_works)}/{len(works)} works are available in closed and/or open stores (therefore held by Wellcome itself)')

#Of those, keep the printed material: books, e-books and journals
wellcome_printed = [
  work for work in wellcome_works
  if work['workType']['label'] in ('Books', 'E-books', 'Journals')
]
print(f'{len(wellcome_printed)} of these are *printed* works that may have OCR text available. These break down as:')
dumpCount('$.workType.label', wellcome_printed)



Now that we have done this, we can look again at concepts and subjects, to see what the coverage is like for the particular works that we are interested in.

In [None]:
#Each report pairs a heading with the JSONPath query whose values are tallied:
#  Subjects -- label of every member of the subjects array which has a type of Subject
#  Concepts -- label of every node at any depth beneath subjects which has a type of concept
subject_reports = (
  ('Subjects', '$.subjects[?(@.type=="Subject")].label'),
  ('Concepts', '$.subjects..*[?(@.type=="Concept")].label'),
)
for position, (heading, query) in enumerate(subject_reports):
  if position > 0:
    print() #blank line between the two reports
  print(heading)
  dumpCount(query, wellcome_printed, 0.02)

Here we see that [analysis may change as I fiddle things around] the subjects "Typhoid Fever - epidemiology" and "Typhoid fever" cover 58% of our original search results as filtered down to printed texts held at Wellcome, or 76% for the concept "Typhoid Fever". Around 15% of works have neither subject nor concept, indicating that this is not explained only by a work not being held at Wellcome.

Let's also get a sense of when and where these works were published.

In [None]:
#Both date listings below use the same query: the label of every production date
date_query = '$.production[*].dates[*].label'

print("Dates (by frequency)")
dumpCount(date_query, wellcome_printed)
print()

print("Dates (roughly ordered)")
empty, counter = count(date_query, wellcome_printed)
#one entry per occurrence of each label, then sorted as text
date_labels = list(counter.elements())
date_labels.sort()
print(date_labels)
print()

print("Places")
dumpCount('$.production[*].places[*].label', wellcome_printed)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import math


print('Rough chart of cumulative dates')

#Just grab the dates that are easy to pick up
#This code will skip over an opening square bracket if there is one, then grab a 4 digit number if there is one
#Otherwise it will reject the date as unusable
#This means for example that '18' would be rejected
#While '1900-1920' would be turned into just '1900'
mismatch = 0
searcher = json_parse('$.production[*].dates[*].label')
matcher = re.compile(r'\s*\[?(\d{4})(?:\D|$)')
works_per_year = Counter()
for work in wellcome_printed:
  #collect the year from every date label of this work that the pattern accepts
  years = []
  for result in searcher.find(work):
    match = matcher.match(result.value)
    if match:
      years.append(int(match.group(1)))

  if years:
    #a work may have more than one date, we just take the earliest one
    works_per_year[min(years)] += 1
  else: #no date found
    mismatch += 1

#(year, number of works) pairs in ascending order of year
filtered_results = sorted(works_per_year.items())

#Running total of works up to and including each year.
#The loop variable must not be called "count": that would replace the count()
#function defined earlier and break every later call to count() or dumpCount().
total = 0
cumulative = {}
for year, works_in_year in filtered_results:
  total += works_in_year
  cumulative[year] = total

if cumulative:
  first_year = filtered_results[0][0]
  final_year = filtered_results[-1][0]
  #top of the y axis: the total rounded up to the next multiple of 100
  rounded_up_total = math.ceil(total / 100.0) * 100

  ax = seaborn.lineplot(data = cumulative)
  ax.set(
    xlabel = 'Year',
    ylabel = 'Total works',
    ylim = (0, rounded_up_total),
  )
  plt.show()
else:
  #nothing to index or plot when no work has a usable date
  print('No works have a usable date, so there is nothing to chart')
print(f'{mismatch} works have no usable date')

Now we have explored a little of what the catalogue can tell us, and have a rough sense of the range of texts that we might be able to work with. The next step is to find out which ones actually have digitised text available.