In [4]:
from __future__ import print_function, division, absolute_import
from bs4 import BeautifulSoup
from copy import deepcopy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import webdriver_manager
import os, time, traceback, json, requests, shutil, glob, uuid

# Base metadata record. Each collection cell takes a deepcopy of this dict,
# fills in collection-level values, and then copies it again per image.
# Every field starts out as an empty string.
base_meta = {
  'image': '',      # overwritten by save_record with the saved image's file name
  'collection': '',
  'title': '',
  'artist': '',
  'date': '',
  'text': '',
  'location': '',
  'permalink': '',
}

# helper that ensures the images and metadata dirs exist for a collection
def make_out_dirs(root_out_dir):
  '''Create (when missing) and return [images_dir, metadata_dir] under data/`root_out_dir`'''
  def ensure(kind):
    # build data/<root_out_dir>/<kind> and create it on first use
    path = os.path.join('data', root_out_dir, kind)
    if not os.path.exists(path):
      os.makedirs(path)
    return path
  return [ensure(kind) for kind in ('images', 'metadata')]

# helper that writes one image and its metadata record for a collection
def save_record(root_out_dir, img_data, meta, min_size=150):
  '''Write `img_data` and `meta` to the collection's images/ and metadata/ dirs.

  Payloads shorter than `min_size` are skipped (nothing is written and `meta`
  is left untouched). Otherwise a uuid1-based file name is generated, stored in
  meta['image'], and used for both the .jpg and the matching .json file.
  '''
  images_dir, meta_dir = make_out_dirs(root_out_dir)
  if len(img_data) >= min_size:
    record_id = str(uuid.uuid1())
    image_name = record_id + '.jpg'
    meta['image'] = image_name
    image_path = os.path.join(images_dir, image_name)
    meta_path = os.path.join(meta_dir, record_id + '.json')
    # image first, then its metadata
    with open(image_path, 'wb') as image_out:
      image_out.write(img_data)
    with open(meta_path, 'w') as meta_out:
      json.dump(meta, meta_out)

# Download Voynichese Images

The following downloads the images and metadata for a dataset of inputs to the Neural Neighbors web application.

Metadata for each image should ideally possess the following fields:

| Image | Collection | Title | Artist | Date | Text | Location | Permalink |

In [None]:
from utils.helpers import pages
from os.path import join
import requests
import os

def download_from_url(url, out_path):
  '''Download a file at location `url` and write to `out_path`

  Nothing is fetched when `out_path` already exists. When the server answers
  with an error status the body is NOT written, so an error page never ends up
  on disk under the target name and a later run can retry the download.
  '''
  if os.path.exists(out_path):
    return
  r = requests.get(url, allow_redirects=True)
  if not r.ok:
    print(' ! could not download', url, 'status', r.status_code)
    return
  # context manager guarantees the handle is flushed and closed
  with open(out_path, 'wb') as out:
    out.write(r.content)

def download_voynichese_data(page_ids):
  '''Fetch the word-coordinate script and the page image for each id in `page_ids`.

  Files land in utils/voynichese/coords/<id>.json and
  utils/voynichese/images/<id>.jpg; both directories are created when missing.
  '''
  root = os.path.join('utils', 'voynichese')
  for kind in ('coords', 'images'):
    target = os.path.join(root, kind)
    if not os.path.exists(target):
      os.makedirs(target)
  coords_base = 'http://www.voynichese.com/2/data/folio/script/'
  image_base = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/'
  for page_id in page_ids:
    # word coords are served as .js but stored with a .json extension
    download_from_url(coords_base + page_id + '.js', os.path.join(root, 'coords', page_id + '.json'))
    # page scan
    download_from_url(image_base + page_id + '.jpg', os.path.join(root, 'images', page_id + '.jpg'))

# announce the page ids (sorted for readability), then fetch them all
voynich_page_ids = pages.keys()
print(' * preparing to download', sorted(voynich_page_ids))
download_voynichese_data(voynich_page_ids)

# Download Biodiversity Heritage Library Images (#1)

In [None]:
from copy import deepcopy
from os.path import join
import flickr_api # use python 2 kernel
import os

# configure api keys -- read from the environment instead of hardcoding them
# in the notebook (set FLICKR_API_KEY and FLICKR_API_SECRET before running).
# NOTE(review): the key/secret that used to be committed in this cell should be
# treated as leaked and revoked.
credential_path = os.path.join('utils', 'flickr.credentials')
flickr_api.set_keys(
  api_key=os.environ['FLICKR_API_KEY'],
  api_secret=os.environ['FLICKR_API_SECRET'],
)
if os.path.exists(credential_path):
  # reuse the OAuth token saved by a previous run
  flickr_api.set_auth_handler(credential_path)
else:
  a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object
  perms = 'read' # set the required permissions
  url = a.get_authorization_url(perms)
  print(url) # open the printed url in a browser and agree
  # the verifier is issued per authorisation, so it has to be typed in for each
  # new handshake rather than stored in the notebook
  try:
    read_line = raw_input # python 2 kernel
  except NameError:
    read_line = input
  a.set_verifier(read_line(' * paste the verifier code: ').strip())
  flickr_api.set_auth_handler(a)
  a.save(credential_path)

# run initial query
user = flickr_api.Person.findByUserName('biodivlibrary')
errored = []
# this first query is what later cells read photos.info.pages from
photos = user.getPhotos(page=2, perpage=100)

In [None]:
# Download every photo (plus a metadata record) from the Biodiversity Heritage
# Library photostream. Relies on `user`, `photos` and `errored` from the Flickr
# setup cell.
images_dir, meta_dir = make_out_dirs('biodivlibrary')
collection_meta = deepcopy(base_meta)
collection_meta.update({
  'collection': 'Biodiversity Heritage Library',
  'permalink': 'https://www.flickr.com/photos/biodivlibrary/'
})

# First page to request. NOTE(review): 119 looks like a resume point left over
# from an interrupted run -- confirm, and set to 1 to fetch the whole stream.
page_num = 119
# result pages are 1-indexed, so the final page number equals
# photos.info.pages and has to be included (hence <=, not <)
while page_num <= photos.info.pages:
  print(' * page num', page_num)
  page_photos = user.getPhotos(page=page_num, perpage=100)
  for photo in page_photos:
    try:
      # save photo
      image_id_out = str(uuid.uuid1())
      out_path = os.path.join(images_dir, image_id_out)
      print(' * saving', out_path)
      photo.save(out_path)
      time.sleep(2) # pause between downloads
      # save meta
      info = photo.getInfo()
      url = info.get('urls', {}).get('url', [])
      url = url[0].get('text', collection_meta['permalink']) if url else collection_meta['permalink']
      meta = deepcopy(collection_meta)
      meta.update({
        'image': out_path,
        'permalink': url,
        'title': info.get('title', ''),
        'text': info.get('description', ''),
        # distinct name: under a python 2 kernel the comprehension variable
        # leaks and would otherwise overwrite the photo loop variable
        'tags': [tag.text for tag in info.get('tags', [])],
        'views': info.get('views', ''),
      })
      out_path = os.path.join(meta_dir, image_id_out + '.json')
      with open(out_path, 'w') as out:
        json.dump(meta, out)
    except Exception as exc:
      print(' * could not save', photo, '\n', traceback.format_exc().splitlines())
      errored.append(photo)
  page_num += 1

# Download Smithsonian Images (#11)

In [None]:
# Walk the paginated Smithsonian botany image listing; for each thumbnail fetch
# the full image and scrape title / caption / tags from its detail page.
collection_meta = deepcopy(base_meta)
collection_meta.update({
  'collection': 'Smithsonian Botany Collection',
  'permalink': 'https://library.si.edu/topic/botany/images',
})

# get images
page_number = 0
while page_number is not None:
  print(' * fetching page', page_number)
  listing = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))
  listing_html = listing.content.decode('utf8')
  listing_soup = BeautifulSoup(listing_html, 'html.parser')
  # listing and detail pages get separate names so that scraping a detail page
  # never clobbers the listing state used for pagination and error messages
  for idx, thumb in enumerate(listing_soup.select('.bg-hover-highlight')):
    print('   * fetching image idx', idx, 'on page')
    try:
      # download image
      src = thumb.find('img')['src']
      image_id = os.path.basename(src).split('id=')[1]
      img_data = requests.get('https://ids.si.edu/ids/deliveryService?id=' + image_id, allow_redirects=True).content
      # process metadata
      href = 'https://library.si.edu' + thumb.find('a')['href']
      detail_soup = BeautifulSoup(requests.get(href).text, 'html.parser')
      meta = deepcopy(collection_meta)
      try:
        title = detail_soup.select('#goi-title')[0].get_text()
      except Exception:
        print(' * title not available', href)
        title = ''
      try:
        caption = ''
        for selector in ['.views-field-field-original-caption', '.views-label-field-citation']:
          matches = detail_soup.select(selector)
          if matches:
            caption += matches[0].get_text() + ' '
      except Exception:
        print(' * text not available', href)
        caption = ''
      try:
        tags = [link.get_text() for group in detail_soup.select('.ig-icon-subjects') for link in group.select('a')]
      except Exception:
        print(' * tags not available', href)
        tags = []
      meta.update({
        # collapse runs of whitespace
        'title': ' '.join(title.split()).strip(),
        'text': ' '.join(caption.split()).strip(),
        'permalink': href,
        'tags': tags,
      })
      # save records
      save_record('smithsonian-botany', img_data, meta)

    except Exception as exc:
      print(' ! err', thumb, exc)
  # keep going only while the listing links to a following page
  if '/topic/botany/images?page=' + str(page_number+1) in listing_html:
    page_number += 1
  else:
    page_number = None

# Download Padova Images (#19)

In [None]:
import requests, os

# Fetch the recto and verso scans (numbered 001..027) of the Padova manuscript
images_dir, meta_dir = make_out_dirs('padova')
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'Padova'
collection_meta['permalink'] = 'https://medicaltraditions.org/padova/images'
collection_meta['title'] = 'Padova, Biblioteca del Seminario, 194'

for side in ['recto', 'verso']:
  for folio in range(1, 28):
    # image names are zero-padded to three digits, e.g. 007-recto.jpg
    img = '{0}-{1}.jpg'.format(str(folio).zfill(3), side)
    url = 'https://medicaltraditions.org/images/stories/manuscripts/demateriamedica/{0}'.format(img)
    img_data = requests.get(url).content
    # every image gets its own copy of the collection-level metadata
    save_record('padova', img_data, deepcopy(collection_meta))

# Download Penn Database Images (#36)

In [None]:
# Walk the Penn LJS browse listing and, for every manuscript that has a
# thumbnail, probe up to 500 'front' and 'body' page images.
collection_meta = deepcopy(base_meta)
collection_meta.update({
  'text': 'Illustrated Herbal',
  'collection': 'Penn Manuscripts',
})

for z in [1, 16, 31, 46, 61]:
  text = requests.get('http://sceti.library.upenn.edu/ljs/ljsbrowse.cfm?StartRow={0}#'.format(z)).text
  soup = BeautifulSoup(text, 'html.parser')
  for row in soup.select('#schoenlist')[0].select('tr.hioff'):
    cells = row.select('td')
    # rows that don't have the four expected columns can't be interpreted
    if len(cells) != 4: continue
    book_id, title, date = [w.get_text().strip() for w in cells[1:]]
    if not book_id: continue
    # only ms with images will have thumbnails in the first table column
    thumbs = cells[0].select('img')
    if not thumbs: continue
    if 'ljsthumbs' not in thumbs[0].get('src', ''): continue
    print(' * fetching', book_id)
    book_meta = deepcopy(collection_meta)
    book_meta.update({
      'title': title,
      'date': date,
      # NOTE(review): this replaces 'Penn Manuscripts' with the book id -- confirm intended
      'collection': book_id,
    })
    for j in range(500):
      jdx = str(j).zfill(4) # four-digit, zero-padded page index
      for k in ['front', 'body']:
        img = '{0}_{1}{2}'.format(book_id, k, jdx)
        url = 'http://images.library.upenn.edu/mrsidsceti/bin/image_jpeg.pl?coll=schoenberg&subcoll={0}&image={1}.sid&level=2'.format(book_id, img)
        try:
          # save images; responses under 1000 bytes are dropped by save_record
          img_data = requests.get(url).content
          # fresh copy per image so records never share a mutable dict
          meta = deepcopy(book_meta)
          meta['permalink'] = url
          save_record('penn-manuscripts', img_data, meta, min_size=1000)
        except Exception:
          print(' ! could not fetch page', url, traceback.format_exc().splitlines())
    # many ms don't have images, so clear those out. Only a per-book directory
    # that actually exists and holds no images is removed; this no longer
    # depends on an `images_dir` value left behind by another cell.
    book_dir = os.path.join('data', book_id)
    if os.path.isdir(book_dir) and not glob.glob(os.path.join(book_dir, 'images', '*')):
      shutil.rmtree(book_dir)

# Bax - Italian Herbal (#33)

In [None]:
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'UVM Italian Herbal'

# The site runs on Islandora. The thumbnail listing lives at
# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages
# and each thumbnail url, e.g.
# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/TN/view
# can be pointed at another DATASTREAM (see
# https://wiki.duraspace.org/display/ISLANDORA/APPENDIX+C+-+DATASTREAM+REFERENCE), e.g.
# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/JPG/view
for page in range(1, 12):
  listing_url = 'http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages?page={0}'.format(page)
  listing = BeautifulSoup(requests.get(listing_url).text, 'html.parser')
  grid = listing.select('.islandora-objects-grid')[0]
  for thumb in grid.select('img'):
    # swap the thumbnail datastream for the JPG one and fetch it
    src = thumb['src']
    url = src.replace('/TN/', '/JPG/')
    img_data = requests.get(url).content
    # record the thumbnail url as title and the JPG url as permalink
    meta = deepcopy(collection_meta)
    meta['title'] = src
    meta['permalink'] = url
    save_record('uvm-italian-herbal', img_data, meta, min_size=1000)

# Bax - General History of the Things of New Spain (#33)
Author: Sahagun

In [None]:
import os, requests

sahagun_name = 'Sahagun - General History of the Things of New Spain'
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = sahagun_name
collection_meta['title'] = sahagun_name

# probe image numbers 0..999; save_record drops responses below its size floor
for i in range(1000):
  url = 'https://content.wdl.org/10622/service/thumbnail/1403114302/1024x1024/1/{0}.jpg'.format(i)
  img_data = requests.get(url).content
  # each record points back at the image url it came from
  meta = deepcopy(collection_meta)
  meta.update({'permalink': url})
  save_record('sahagun', img_data, meta)

# Bax - Kitab (#33)
Manuscrit Kitāb 'ağā'ib al-maḫlūqāt wa ġarā'ib ... Qazwīnī, Zakariyyā ibn Muḥammad ibn Maḥmūd al- (1203-1283). Auteur du texte

In [None]:
import os, requests
from copy import deepcopy

# Fetch the page images of the Kitab manuscript from Gallica
kitab_creator = 'Qazwīnī, Zakariyyā ibn Muḥammad ibn'
collection_meta = deepcopy(base_meta)
collection_meta.update({
  'collection': 'Kitab',
  'title': 'Kitab',
  # 'artist' is the field defined in base_meta; 'author' is kept as well so
  # anything already reading that key keeps working
  'artist': kitab_creator,
  'author': kitab_creator,
  'text': "Ms. orné de figures coloriées, représentant les animaux et les plantes décrits dans le texte. Il est daté de l'an 1176 de l'hégire (1762-1763 de J. C.).",
})

# probe folio numbers 0..999; save_record drops responses below its size floor
for i in range(1000):
  url = 'https://gallica.bnf.fr/ark:/12148/btv1b8406160j/f{0}.medres'.format(i)
  img_data = requests.get(url).content
  meta = deepcopy(collection_meta)
  meta['permalink'] = url
  save_record('kitab', img_data, meta)

# MS. Canon. Misc. 408 & Bodleian Medieval Manuscripts (#34)

In [None]:
# pages: https://iiif.bodleian.ox.ac.uk/iiif/mirador/c444f7e2-ca30-48ae-87b5-54f93d6ed046
# manifest: https://iiif.bodleian.ox.ac.uk/iiif/manifest/db2fade4-61ee-4a11-a894-19361c551eed.json
# image: https://iiif.bodleian.ox.ac.uk/iiif/image/ca0cddf4-d66c-41a5-84b6-c72aee34ba65/full/600,/0/default.jpg
# info https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/info.json

# from the manifest, find the first value in the sequences key, then on that page each page is listed
# as the @id value in the canvases list

# download all bodleian medieval manuscripts
def _manifest_value(manifest, label):
  '''Concatenate the values of all manifest metadata entries carrying `label`'''
  return ''.join([k['value'] for k in manifest['metadata'] if k['label'] == label])

# download all bodleian medieval manuscripts
for i in range(1, 5): # search result page index
  url = 'https://medieval.bodleian.ox.ac.uk/?f_inclusive%5Bms_digitized_s%5D%5B%5D=Yes&ms_title_s=&name_s=&op=AND&page={0}&per_page=100&search_field=advanced&sort=score+desc%2C+sort_title+asc'.format(i)
  soup = BeautifulSoup(requests.get(url).text, 'html.parser')
  for jdx, j in enumerate(soup.select('#documents')[0].select('.document')): # manuscript index
    try:
      href = 'https://medieval.bodleian.ox.ac.uk' + j.select('a')[0]['href']
      page = BeautifulSoup(requests.get(href).text, 'html.parser')
      # find the link to the manuscript images
      href = page.select('.surrogates')[0].select('a')[0]['href']
      book_id = href.split('/')[-1]
      # fetch the IIIF manifest for this book, then the sequence listing its canvases
      url = 'https://iiif.bodleian.ox.ac.uk/iiif/manifest/{0}.json'.format(book_id)
      manifest = requests.get(url).json()
      sequence_url = manifest['sequences'][0]['@id']
      sequence = requests.get(sequence_url).json()
      # the image id is the last path segment of each canvas @id, minus any extension
      image_ids = [k['@id'].split('/')[-1].split('.')[0] for k in sequence['canvases']]
      book_meta = deepcopy(base_meta)
      book_meta.update({
        'title': _manifest_value(manifest, 'Title'),
        'text': _manifest_value(manifest, 'Additional Information'),
        'date': _manifest_value(manifest, 'Date Statement'),
        'collection': 'Bodleian Medieval Manuscripts',
      })
    except Exception as exc:
      print(' * could not process manifest', jdx, j, traceback.format_exc().splitlines())
      continue
    for image_id in image_ids:
      try:
        # download image
        image_url = 'https://iiif.bodleian.ox.ac.uk/iiif/image/{0}/full/600,/0/default.jpg'.format(image_id)
        img_data = requests.get(image_url).content
        # fresh copy per image; save_record fills in 'image' with the file name
        meta = deepcopy(book_meta)
        meta['permalink'] = image_url
        save_record('bodleian-medieval', img_data, meta)
      # Exception rather than a bare except, so Ctrl-C still stops the run
      except Exception:
        print(' ! could not download image', image_id, traceback.format_exc().splitlines())

# Download Bodleian Selected Images (#9)

In [None]:
# collection-level metadata shared by every Bodleian herbal record
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'Bodleian Herbals'
collection_meta['permalink'] = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'

def download_query_results():
  '''Open the module-level search `url` in Chrome and save every child image of each result.

  Each result is clicked and the page is re-parsed afterwards so the scraped
  title / shelfmark belong to that result. The browser is always closed when
  the function exits.
  '''
  def panel_text(soup, selector):
    # whitespace-collapsed text of the first match, or '' when it is absent
    found = soup.select(selector)
    return ' '.join(found[0].get_text().split()).strip() if found else ''

  driver = webdriver.Chrome()
  try:
    driver.get(url)
    time.sleep(3)
    collections = driver.find_elements_by_css_selector('.result')
    print(' * got', len(collections), 'collections')
    for i in collections: # i = manuscript
      # strange app
      i.click()
      time.sleep(2)
      # parse the page as it is now; parsing once before any click would give
      # every manuscript the same metadata.
      # NOTE(review): presumably the metadata panel is filled in by the click -- confirm
      soup = BeautifulSoup(driver.page_source, 'html.parser')
      title_nodes = soup.select('#metadata_title_text')
      meta = deepcopy(collection_meta)
      meta.update({
        'title': title_nodes[0].get_text() if title_nodes else '',
        'description': panel_text(soup, '#metadata_shelfmark_text'),
        'text': panel_text(soup, '#metadata_shelfmark'),
      })
      # data-children is a comma-separated list of image ids; tolerate it being absent
      children = [c for c in (i.get_attribute('data-children') or '').strip().split(',') if c]
      for j in children:
        print(' * fetching', j)
        try:
          img_url = 'https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(j)
          meta.update({'permalink': img_url})
          img_data = requests.get(img_url).content
          save_record('bodleian-herbals', img_data, meta)
        except Exception as exc:
          print(' ! err', j, exc)
  finally:
    # don't leave a browser process behind, even after an error
    driver.quit()

# get images: build the search url that download_query_results reads, then run it
root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'
query = 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'
url = ''.join([root_url, query])

download_query_results()

# Hunt Images (#10)

In [None]:
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'Hunt'
collection_meta['author'] = ''
collection_meta['text'] = ''

# walk search result pages 1..184 and save every thumbnail found on each
for i in range(1, 185):
  url = 'http://huntbot.org/artdb/art-collection-search?page={0}'.format(i)
  soup = BeautifulSoup(requests.get(url).text, 'html.parser')
  for j in soup.select('.views-field img'):
    try:
      url = j['src']
      img_data = requests.get(url).content
      # the title carries the result page number the image was found on
      meta = deepcopy(collection_meta)
      meta['title'] = 'Hunt-{0}'.format(i)
      meta['permalink'] = url
      save_record('hunt', img_data, meta)
    except Exception as exc:
      print(' ! could not download', url, traceback.format_exc().splitlines())

# Morgan Images (#25)

Good candidate for discussing data "scraping" with students

In [None]:
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'Morgan'

root_url = 'http://ica.themorgan.org/'
for i in range(4): # page idx
  list_url = 'https://www.themorgan.org/manuscripts/list?page={0}'.format(i)
  list_soup = BeautifulSoup(requests.get(list_url).text, 'html.parser')
  ms_links = list_soup.select('.views-field-field-collection-images-link a')
  for idx, j in enumerate(ms_links): # manuscript idx on page
    ms_soup = BeautifulSoup(requests.get(j['href']).text, 'html.parser')
    for kdx, k in enumerate(ms_soup.select('.starter-template img')): # image idx in manuscript
      # image paths are relative ('../...'), so re-root them on the image host
      url = root_url + k['src'].replace('../', '')
      try:
        img_data = requests.get(url).content
        meta = deepcopy(collection_meta)
        meta['permalink'] = url
        save_record('morgan', img_data, meta)
      except Exception as exc:
        print(' ! trouble with', idx, kdx, traceback.format_exc().splitlines())

# British Library

Has 700 medieval manuscripts in full color

In [None]:
# collection-level metadata for every British Library record
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'British Library'

# browser session shared by get_soup
driver = webdriver.Chrome()

def get_soup(url, timeout=5):
  '''Load `url` in the shared `driver` and return the page parsed as BeautifulSoup.

  When the server answers with its "429 Too Many Requests" page, wait
  `timeout` seconds and try again, adding 5 seconds to the wait after every
  attempt. If reading or parsing the page raises, the error is logged and an
  empty soup is returned, so callers can still call .select() / .find() on
  the result (they simply get no matches).
  '''
  while True:
    driver.get(url)
    time.sleep(2) # give the page a moment to render
    try:
      text = driver.page_source
      if '<title>429 Too Many Requests</title>' not in text:
        return BeautifulSoup(text, 'html.parser')
    except Exception:
      print(' ! error parsing html', traceback.format_exc().splitlines())
      return BeautifulSoup('', 'html.parser')
    # rate limited: back off, then retry with a longer wait
    print(' * nginx timed out')
    time.sleep(timeout)
    timeout += 5

def get_image(url, timeout=5):
  '''Fetch `url` and return the raw response body.

  A rate-limited response (HTTP status 429, or a body containing the server's
  "429 Too Many Requests" page title) is retried after `timeout` seconds, with
  the wait growing by 5 seconds on every attempt.
  '''
  marker = b'<title>429 Too Many Requests</title>'
  while True:
    r = requests.get(url)
    # look at the status code and the raw bytes; decoding a binary image to
    # text just to search for the marker is wasted work
    if r.status_code != 429 and marker not in r.content:
      return r.content
    time.sleep(timeout)
    timeout += 5
  
# Walk the British Library manuscript listing, open each item's image viewer,
# and save every thumbnail at 500px width together with the item's title/text.
for i in range(1, 79): # page idx
  url = 'https://www.bl.uk/collection-items?formats=manuscript&page={0}'.format(i)
  listing_soup = get_soup(url)
  # get_soup may hand back an empty value after an error; skip the page then
  if not listing_soup:
    continue
  for j in listing_soup.select('.pnl-title a'): # manuscript idx
    url = 'https://www.bl.uk' + j['href']
    item_soup = get_soup(url)
    try:
      images_url = 'https://www.bl.uk' + item_soup.select('#view-image-button')[0]['href']
    except Exception:
      print(' ! no image link', url)
      continue
    viewer_soup = get_soup(images_url)
    if not viewer_soup:
      continue
    try:
      title = viewer_soup.find(id='full-title').get_text() # such a wonky api
    except Exception as exc:
      title = ''
      # log the url and error only; printing the whole page floods the output
      print(' ! no title', url, exc)
    try:
      # the item text is taken from the second <p> on the viewer page
      text = viewer_soup.find_all('p')[1].get_text()
    except Exception as exc:
      text = ''
      print(' ! no text', url, exc)

    for k in viewer_soup.select('.img-viewer-thumbs img'):
      try:
        # drop the last query string (if any) and request the desired width
        tail = k['src'].rsplit('?', 1)[0] + '?w=500'
        url = 'https://www.bl.uk' + tail
        img_data = get_image(url)
        meta = deepcopy(collection_meta)
        meta.update({
          'title': title,
          'text': text,
          'permalink': url,
        })
        save_record('british-library', img_data, meta)
      except Exception as exc:
        print(' ! could not download image', url, traceback.format_exc().splitlines())

# ORKA pseudo-Apuleius

In [None]:
thumbs_root = 'https://orka.bibliothek.uni-kassel.de/viewer/!thumbs/1357143974502/'
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'Unikassel Versitat'
collection_meta['permalink'] = thumbs_root

def squash(value):
  '''Collapse runs of whitespace in `value` to single spaces'''
  return ' '.join(value.split()).strip()

# get images: thumbnail pages 0..81
for page_number in range(0, 82):
  print(' * fetching page', page_number)
  r = requests.get(thumbs_root + str(page_number) + '/')
  text = r.content.decode('utf8')
  thumbs = BeautifulSoup(text, 'html.parser').select('.view-thumbs__thumbnail')

  for idx, i in enumerate(thumbs):
    print('   * fetching image idx', idx, 'on page')
    try:
      # download image
      src = i.find('img')['src']
      img_data = requests.get(src, allow_redirects=True).content

      # process metadata -- parse 'bibliography data' page
      href = i.find('a')['href'].replace("!image", "!metadata") + '-/'
      print(href)
      detail = BeautifulSoup(requests.get(href).text, 'html.parser')
      data = detail.select('.metadata__element-value')
      # fields are picked by their position on the metadata page
      title = data[1].get_text()
      shelf = data[2].get_text()
      origin = data[5].get_text()
      location = data[16].get_text()
      meta = deepcopy(collection_meta)
      meta['title'] = squash(title)
      meta['collection'] = squash(shelf)
      meta['location'] = squash(location)
      meta['permalink'] = href
      print(meta)
      # save records
      save_record('unikassel-versitat', img_data, meta)

    except Exception as exc:
      print(' ! err', i, exc)

# British Library Herbal Collection

In [40]:
# collection-level metadata for the British Library herbal search results
collection_meta = deepcopy(base_meta)
collection_meta['collection'] = 'British Library Herbal Collection'

# def get_image(url, timeout=5):
#   r = requests.get(url)
#   if '<title>429 Too Many Requests</title>' in r.text:
#     time.sleep(timeout)
#     return get_image(url, timeout=timeout+5)
#   return r.content
  
# Enter "illustrated herbal" into search box and search
driver = webdriver.Chrome()
driver.get("http://www.bl.uk/manuscripts/BriefDisplay.aspx?size=10")
time.sleep(3)
search = driver.find_element_by_id("ctl00_uiKeyword")
search.send_keys("illustrated herbal")
search.send_keys(Keys.ENTER)
time.sleep(3) # let the first page of results load before reading it

# visit at most 31 result pages, printing every FullDisplay link on each
for i in range(0, 31):
    links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("//a[contains(@href,'FullDisplay')]")]
    # scrape
    for link in links:
        print(link)
    # go to next page; the plural lookup returns [] instead of raising when
    # there is no 'Next' element, which ends the loop cleanly
    next_buttons = driver.find_elements_by_xpath("//*[contains(text(), 'Next')]")
    if not next_buttons:
        break
    next_buttons[0].click()
    time.sleep(3) # wait for the next page so links aren't read from the old one
    
# for idx, i in enumerate(range(1,79,1)): # page idx
#   url = 'https://www.bl.uk/collection-items?formats=manuscript&page={0}'.format(i)
#   soup = get_soup(url)
#   for jdx, j in enumerate(soup.select('.pnl-title a')): # manuscript idx
#     url = 'https://www.bl.uk' + j['href']
#     soup = get_soup(url)
#     try:
#       images_url = 'https://www.bl.uk' + soup.select('#view-image-button')[0]['href']
#     except:
#       print(' ! no image link', url)
#       continue
#     soup = get_soup(images_url)
#     try:
#       title = soup.find(id='full-title').get_text() # such a wonky api
#     except Exception as exc:
#       title = ''
#       print(' ! no title', url, exc, soup)
#     try:
#       text = soup.find_all('p')[1].get_text()
#     except Exception as exc:
#       text = ''
#       print(' ! no text', url, exc, soup)
    
#     for kdx, k in enumerate(soup.select('.img-viewer-thumbs img')):
#       try:
#         tail = '?'.join(k['src'].split('?')[:-1]) + '?w=500' # get images at desired width
#         url = 'https://www.bl.uk' + tail
#         img_data = get_image(url)
#         meta = deepcopy(collection_meta)
#         meta.update({
#           'title': title,
#           'text': text,
#           'permalink': url,
#         })
#         save_record('british-library', img_data, meta)
#       except Exception as exc:
#         print(' ! could not download image', url, traceback.format_exc().splitlines())

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=0&ref=Add_MS_8101
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_8101&index=0
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=1&ref=Add_MS_8785
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_8785&index=1
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=2&ref=Add_MS_8881
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_8881&index=2
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=3&ref=Add_MS_8928
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_8928&index=3
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=4&ref=Add_MS_9398
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_9398&index=4
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=5&ref=Add_MS_9399
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_9399&index=5
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=6&ref=Add_MS_9401
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_9401&index=6
http://www.bl.uk/man

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=60&ref=Add_MS_30024
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_30024&index=60
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=61&ref=Add_MS_32006
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_32006&index=61
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=62&ref=Add_MS_33733
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_33733&index=62
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=63&ref=Add_MS_35166
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_35166&index=63
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=64&ref=Add_MS_37049
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_37049&index=64
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=65&ref=Add_MS_37832
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_37832&index=65
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=66&ref=Add_MS_38118
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Add_MS_38118&i

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=120&ref=Egerton_MS_608
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_608&index=120
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=121&ref=Egerton_MS_745
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_745&index=121
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=122&ref=Egerton_MS_747
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_747&index=122
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=123&ref=Egerton_MS_809
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_809&index=123
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=124&ref=Egerton_MS_821
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_821&index=124
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=125&ref=Egerton_MS_874
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Egerton_MS_874&index=125
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=126&ref=Egerton_MS_943
http://www.bl.uk/manuscri

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=180&ref=Harley_MS_4411
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4411&index=180
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=181&ref=Harley_MS_4431
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4431&index=181
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=182&ref=Harley_MS_4664
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4664&index=182
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=183&ref=Harley_MS_4751
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4751&index=183
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=184&ref=Harley_MS_4826
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4826&index=184
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=185&ref=Harley_MS_4866
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Harley_MS_4866&index=185
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=186&ref=Harley_MS_4940
http://www.bl.uk/manuscri

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=240&ref=Or_12570
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12570&index=240
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=241&ref=Or_12857
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12857&index=241
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=242&ref=Or_12885
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12885&index=242
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=243&ref=Or_12897
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12897&index=243
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=244&ref=Or_12909
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12909&index=244
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=245&ref=Or_12988
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_12988&index=245
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=246&ref=Or_13130
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Or_13130&index=246
http://www.bl.uk/manuscripts/FullD

http://www.bl.uk/manuscripts/FullDisplay.aspx?index=300&ref=Sloane_MS_4016
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Sloane_MS_4016&index=300
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=301&ref=Stowe_MS_594
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Stowe_MS_594&index=301
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=302&ref=Yates_Thompson_MS_36
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Yates_Thompson_MS_36&index=302
http://www.bl.uk/manuscripts/FullDisplay.aspx?index=303&ref=Yates_Thompson_MS_47
http://www.bl.uk/manuscripts/FullDisplay.aspx?ref=Yates_Thompson_MS_47&index=303
