In [None]:
#This cell just installs dependencies -- setting up things that we need

!pip install matplotlib
!pip install seaborn

#You can see helper.py in the folder view on the left.
#It contains functions that this notebook uses.
#This lets us hide distracting details/complexity, but you can
#look inside it if you want to know how these things are working.
import helper

import requests
from tqdm.auto import tqdm
from jsonpath_ng.ext import parse as json_parse
from collections import Counter
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import math

from IPython.display import JSON as json_display
from IPython.core.display import Markdown

RESTARTING... LET'S TRY USING THE API
Statistical methods like topic modelling probably should really be used with large volumes of data.
To keep this example reasonably small (and therefore fast) we'll try to work with a smallish set of books that is large enough to work reasonably well.
To begin with, we'll use the catalogue API to search for "typhoid".

(see https://developers.wellcomecollection.org/docs/examples for much more about working with the API)

In [None]:
catalogue_base_url = 'https://api.wellcomecollection.org/catalogue/v2/'

#Ask the catalogue API for the first page (up to 100 works) matching "typhoid",
#including identifiers, subjects and production details in each record
response = requests.get(
    catalogue_base_url + 'works',
    params={
        'include': 'identifiers,subjects,production',
        'pageSize': 100,
        'query': 'typhoid',
    },
)
#Stop here with a clear HTTPError if the request failed, rather than carrying on
#and trying to read an error page as though it were a page of results
response.raise_for_status()
response_data = response.json()

#Print the paging information that came back with the results
for k, v in response_data.items():
    if k == 'results': continue #there will be loads of this
    print(f'{k}: {v}')


Last time I updated this cell I got 1099 `totalResults`. Your results may differ, depending upon how Wellcome's collection has changed in the meantime. Anyway, this feels like a nice number of texts to start working with. Let's learn some more about them. We'll start by downloading the catalogue data for all of the pages of results.

In [None]:
#let's have a progress bar
catalogue_bar = tqdm(
  unit = 'pages',
  total = response_data['totalPages'],
  desc = 'downloading catalogue data',
)

#We already got the first page of results in the previous cell
catalogue_bar.update(1)
#Copy the list so that extending works does not also change response_data
works = list(response_data['results'])

#Now we'll add all of the other pages of results to the list "works".
#The current page is tracked in page_data, leaving response_data as the first page,
#so this cell gives the same result if it is run a second time.
page_data = response_data
while 'nextPage' in page_data:
  response = requests.get(page_data['nextPage'])
  #Stop with a clear HTTPError if a page could not be downloaded
  response.raise_for_status()
  page_data = response.json()
  works.extend(page_data['results'])
  catalogue_bar.update(1)

#Tell the progress bar that we have finished
catalogue_bar.close()


Now that we have all of the catalogue data for our "typhoid" works, let's get a sense of what this covers. We'll just look at the contents of the first record.

In [None]:
#Show the first catalogue record using IPython's JSON display, with the tree expanded
json_display(works[0], expanded = True)

This is quite a lot of data! We are interested in text about typhoid, so let's focus on the type of work that this is (is it something written, or something else, like a drawing or a photograph?) and the subject matter. We can use JSONPath to look this up.

We'll start with the "type" of the work. The last entry in the above JSON is workType. The label and type look relevant. Let's examine the values that these can take across the whole collection.

.... might want to add a cell about filtering out non-Wellcome items

In [None]:
#dumpCount (from helper.py) takes a JSONPath query and a list of JSON objects,
#and prints text describing the query results.
#Here it is called once for each workType field that we want to examine.

worktype_reports = [
  ('workType types:', '$.workType.type'),
  ('workType labels:', '$.workType.label'),
]
for position, (heading, query) in enumerate(worktype_reports):
  if position > 0:
    print() #blank line between the two reports
  print(heading)
  helper.dumpCount(query, works)

We can see that there is a range of types of works in our results. At the time of writing, 3/4 of the works are books and several others are of types that could reasonably have text (e.g. "Archives and manuscripts", "Student dissertations", "E-books", "Manuscripts", "Journals").

Given that text is provided by an OCR pipeline, it is only printed texts that are likely to have online text available. So we filter down (for now) to just books, e-books and journals.

In [None]:
#Keep only the works whose workType label is Books, E-books or Journals.
#This is done in plain Python rather than JSONPath because JSONPath can only
#check values for the purpose of filtering lists, and each work appears to
#JSONPath code as a single JSON object (re e.g. https://stackoverflow.com/a/43737750)

printed_labels = ('Books', 'E-books', 'Journals')
printed_works = [work for work in works if work['workType']['label'] in printed_labels]

Working out the catalogue subject of a work is more complicated. Works in Wellcome Collection are classified according to a range of schemes. If we look in the above JSON, we can also see that the structure is fairly complex, involving a mixture of "Subjects" and "Concepts". Rather than unpick all this, we'll just look at a part of the structure to get a sense of how things are classified. We'll stick with the printed works here.

In [None]:
#Each report pairs a heading with the JSONPath query whose results are counted:
#  Subjects: label of every member of the subjects array which has a type of Subject
#  Concepts: label of every node at any depth beneath subjects which has a type of Concept
#The 0.02 is passed through to dumpCount unchanged for both reports.
subject_reports = [
  ('Subjects', '$.subjects[?(@.type=="Subject")].label'),
  ('Concepts', '$.subjects..*[?(@.type=="Concept")].label'),
]
for position, (heading, query) in enumerate(subject_reports):
  if position > 0:
    print() #blank line between the two reports
  print(heading)
  helper.dumpCount(query, printed_works, 0.02)

Notice that we now have two columns of numbers.

Before, we were dealing with formats. Each catalogue entry refers to a single physical object --- a book, a journal, a picture, etc etc --- and so it has only one format.

Now, we are dealing with subjects. Each catalogue entry may have more than one subject.

The "entries" column is counting catalogue entries. Because an entry can have multiple subjects, the sum of times that we see each subject, across all subjects in the corpus, is going to be higher than the number of entries in the corpus.

Let's unpack this with a small example. To begin, we can look at the subjects in a sample of ten works.

In [None]:
#The sample was originally just the first 10 works in printed_works. So as not to rely on
#that ordering staying the same in future, the same works are now picked out by identifier.
#The list has also been adjusted a little to make a better example.
sample_ids = [
  'bxa3fqrw', 'f56ccxnd', 'jf55amap', 'pw7sr9zn', 'q5pqqysq',
  'qzy6ufxp', 'rxyt9ncw', 'vqhzjwd5', 'ab2ncfmj', 'sqwwchy7',
]
sample = helper.works_by_ids(printed_works, sample_ids)

#dump_labels gives back lines of text, which are joined together and shown as Markdown
sample_lines = helper.dump_labels(sample, '$.subjects[?(@.type=="Subject")]', 'subjects', '^Typhoid [Ff]ever$')
display(Markdown('\n'.join(sample_lines)))


You may notice that there are two different ways of identifying the general subject of "typhoid fever" -- one spelling fever with a capital F and one with a lower case f. These two spellings also have distinct IDs. If we wanted to find all books with this general subject then we would have to use both spellings. Even then, we would have to watch out for cases like that copy of "On typhoid fever", which has both spellings.

You may also notice that there are two copies of William Thomson's "On typhoid fever". As it happens, one of these copies has two different "Typhoid fever" subjects, and one has only one of them.

But to return to our point about how to count things, and proportions of things. What we see here is a total of 10 printed works. These have varying numbers of subjects, totalling 18 (3 * 1 + 6 * 2 + 1 * 3 = 18).

Let's now run our dumpCount function over the same ten works, first to get the titles, then to get the subjects.

In [None]:
#Count how often each title occurs among the works in sample
helper.dumpCount('$.title', sample)

Running it on the titles shows us that "On typhoid fever" appears twice and all of the others appear once. "On typhoid fever" is therefore 20% of the sample.

The numbers here are out of 10, because there are 10 works, and we only get one list because the number of titles equals the number of works.

Now let's look at subjects.

In [None]:
#Count the label of every entry in each sample work's subjects array that has a type of Subject
helper.dumpCount('$.subjects[?(@.type=="Subject")].label', sample)

There are different numbers of works (10) and subjects (18), as we saw above. Because of this we get two lists: the "entries" list counts works and the "hits" list counts subjects.

The left-hand "entries" column is still counting "by work" --- each number is out of 10, the number of works.

4/10 works, or 40% of all works, have the subject "Typhoid Fever", and another 3/10, or 30% of all works, have the subject "Typhoid fever". 

The first work, "Typhoid fever and chronic typhoid carriers", has the subjects "Typhoid Fever - epidemiology" and "Typhoid Fever - transmission", so it effectively appears twice in the left-hand columns, once for each subject. All works will be counted once in this column for each subject that they have. Because of this, the total of entries in the left-hand column is greater than 10 --- in fact, it will be 18, the total number of subjects. If we count up the percentages in this column, they will come to 180% (18 entries out of 10 works).

The right-hand "hits" column is counting "by subject" --- each number is out of 18, the total number of subjects possessed by all of the books. Just as "On typhoid fever" appeared twice in our lists of titles, some subjects appear more than once when we list all of the subjects of all of the books.

Note also that the inconsistent nature of the data leads to some misrepresentation. If we normalize by case, the proportions will change a little --- let's try that.

In [None]:
#Rough-and-ready normalization: lower-case each Subject label and drop repeats within a work.
#Only the label is normalized, so other attributes (such as id) might still be inconsistent.
#This is good enough for present purposes.

normalized_works = [] #misnomer: we are only normalizing subject labels
for original_work in sample:
  work_copy = deepcopy(original_work) #work on a copy so that sample itself is left untouched
  subjects_by_label = {} #lower-cased label -> the first subject seen with that label, in order
  for entry in work_copy['subjects']:
    if entry['type'] == 'Subject': #anything that is not actually a subject is left out
      label_key = entry['label'].lower()
      if label_key not in subjects_by_label: #keep only the first subject with this label
        entry['label'] = label_key
        subjects_by_label[label_key] = entry
  work_copy['subjects'] = list(subjects_by_label.values())
  normalized_works.append(work_copy)

#NOTE(review): the labels were lower-cased above, so the capital-T pattern below can only
#match them if helper.dump_labels matches case-insensitively -- confirm in helper.py
normalized_lines = helper.dump_labels(normalized_works, '$.subjects[?(@.type=="Subject")]', 'subjects', '^Typhoid [Ff]ever$')
display(Markdown('\n'.join(normalized_lines)))


Our general "Typhoid Fever" subject is now consistently "typhoid fever".

We still have our same ten titles but now only 16 subjects because "Typhoid fever: a history" and the first copy of "On typhoid fever" no longer have the same subject label listed twice with different cases.

So let's perform the same analysis with this slightly cleaner data.

In [None]:
#Same subject-label count as before, this time over the works with normalized subject labels
helper.dumpCount('$.subjects[?(@.type=="Subject")].label', normalized_works)

We can now easily see that 60% of the works have the most general "typhoid fever" subject, which is also 38% of all of the subjects covered.

Which might be roughly what we would expect in a corpus based on a search for "typhoid".

[This text applied to running the above cell on all works. If I stick with the printed_works version then it will need updating.]

Straight away, we can see that both Subjects and Concepts are not available for about 1/5 of the collection (e.g. 204/1100 ( 19%) have no value).

We can also see that there are a lot of possible values here -- so many that I've written the code to hide all results applying to less than 2% of the works on typhoid.

We can also see that the phrase "typhoid fever" (with varying capitalization) covers 50% of the Subjects and 63% of the Concepts. This suggests that these specific values will get pretty good results in a search. What we cannot tell from this is how many of the works covered by other concepts are actually relevant.

[This could be a good place to introduce the difference between Wellcome and non-Wellcome works and to see what effect filtering down to just Wellcome has.]

One difficulty may be that Wellcome's catalogue includes texts not held by Wellcome. These have the potential to be classified differently.

So let's assume that we are interested in searching works actually held by Wellcome itself and limit down to them.

The way that was suggested to me to do this was to look for works held on either open shelves or in closed stores. This seems to make sense, although perhaps it needs a tweak for purely digital works such as E-books.

For purposes of this notebook we won't worry about purely digital works, so let's filter down. For this purpose, we'll work with all format types again (not just printed works).

In [None]:
print('All availability ids:')
helper.dumpCount('$.availabilities[*].id', works)
print()

#One JSONPath searcher for each availability id that we treat as "held by Wellcome"
open_searcher   = json_parse("$.availabilities[?(@.id=='open-shelves')].id")
closed_searcher = json_parse("$.availabilities[?(@.id=='closed-stores')].id")
held_searchers = (open_searcher, closed_searcher)

#A work is kept if at least one of the searchers finds a match in it
wellcome_works = [work for work in works if any(searcher.find(work) for searcher in held_searchers)]
print(f'{len(wellcome_works)}/{len(works)} works are available in closed and/or open stores (therefore held by Wellcome itself)')

#Then narrow down to the printed formats: books, e-books and journals
printed_formats = ('Books', 'E-books', 'Journals')
wellcome_printed = [work for work in wellcome_works if work['workType']['label'] in printed_formats]
print(f'{len(wellcome_printed)} of these are *printed* works that may have OCR text available. These break down as:')
helper.dumpCount('$.workType.label', wellcome_printed)



Now that we have done this, we can look again at concepts and subjects, to see what the coverage is like for the particular works that we are interested in.

In [None]:
#Each report pairs a heading with the JSONPath query whose results are counted:
#  Subjects: label of every member of the subjects array which has a type of Subject
#  Concepts: label of every node at any depth beneath subjects which has a type of Concept
#The 0.02 is passed through to dumpCount unchanged for both reports.
wellcome_reports = [
  ('Subjects', '$.subjects[?(@.type=="Subject")].label'),
  ('Concepts', '$.subjects..*[?(@.type=="Concept")].label'),
]
for position, (heading, query) in enumerate(wellcome_reports):
  if position > 0:
    print() #blank line between the two reports
  print(heading)
  helper.dumpCount(query, wellcome_printed, 0.02)

Here we see that [analysis may change as I fiddle things around] the subjects "Typhoid Fever - epidemiology" and "Typhoid fever" cover 58% of our original search results as filtered down to printed texts held at Wellcome, or 76% for the concept "Typhoid Fever". Around 15% of works have neither subject nor concept, indicating that this is not explained only by a work not being held at Wellcome.

Let's also get a sense of when and where these works were published.

In [None]:
#JSONPath queries for the label of every production date / place of a work
dates_query = '$.production[*].dates[*].label'
places_query = '$.production[*].places[*].label'

print("Dates (by frequency)")
helper.dumpCount(dates_query, wellcome_printed)
print()

print("Dates (roughly ordered)")
#helper.count gives back two values; only the second is used here
#(presumably a Counter of date labels -- see helper.py)
empty, counter = helper.count(dates_query, wellcome_printed)
date_labels = list(counter.elements())
date_labels.sort()
print(date_labels)
print()

print("Places")
helper.dumpCount(places_query, wellcome_printed)

In [None]:
print('Rough chart of cumulative dates')

#Just grab the dates that are easy to pick up.
#The pattern skips any leading whitespace and an opening square bracket if there is one,
#then grabs a 4 digit number, which must be followed by a non-digit or the end of the label.
#Otherwise the date is rejected as unusable.
#This means for example that '18' would be rejected
#While '1900-1920' would be turned into just '1900'
searcher = json_parse('$.production[*].dates[*].label')
matcher = re.compile(r'\s*\[?(\d{4})(?:\D|$)')

mismatch = 0 #number of works without any usable date
year_counts = Counter() #earliest usable year -> number of works
for work in wellcome_printed:
  usable_years = []
  for result in searcher.find(work):
    match = matcher.match(result.value)
    if match:
      usable_years.append(int(match.group(1)))

  #a work may have more than one date, we just take the earliest one
  if usable_years:
    year_counts[min(usable_years)] += 1
  else:
    mismatch += 1

#(year, count) pairs in chronological order
filtered_results = sorted(year_counts.items())
if filtered_results:
  first_year = filtered_results[0][0]
  final_year = filtered_results[-1][0]
else:
  #no work had a usable date, so there is no first or final year
  first_year = final_year = None

#running total of works, year by year
total = 0
cumulative = {}
for year, count in filtered_results:
  total += count
  cumulative[year] = total
rounded_up_total = math.ceil(total / 100.0) * 100 #top of the y axis: next multiple of 100

if cumulative:
  #x and y are passed by keyword so that the call does not depend on the order of
  #lineplot's positional parameters
  ax = seaborn.lineplot(x = list(cumulative.keys()), y = list(cumulative.values()))
  ax.set(
    title = 'Cumulative works by earliest usable date',
    xlabel = 'Year',
    ylabel = 'Total works',
    ylim = (0, rounded_up_total),
  )
  plt.show()
else:
  print('No works have a usable date, so there is nothing to plot')
print(f'{mismatch} works have no usable date')

Now we have explored what the catalogue can tell us a little, and got a rough sense of the range of texts that we might be able to work with. The next step is to find out which ones actually have digitised text available.