In [None]:
from copy import deepcopy

# Template metadata record. Each collection takes a copy and fills in its own
# values, and each image then takes a copy of the collection-level record.
# Every field starts out as an empty string.
meta = dict.fromkeys(
  ['image', 'collection', 'title', 'artist', 'date', 'text', 'location', 'permalink'],
  '',
)

# Download Voynichese Images

The following downloads the images and metadata for a dataset of inputs to the Neural Neighbors web application.

Metadata for each image should ideally possess the following fields:

| Image | Collection | Title | Artist | Date | Text | Location | Permalink |

In [None]:
from os.path import join
import requests
import os

from helpers import pages

def download_from_url(url, out_path):
  '''Download the file at location `url` and write it to `out_path`.

  Nothing is fetched when `out_path` already exists, so an interrupted run can
  be resumed without repeating finished downloads. A response with an HTTP
  error status is reported and skipped instead of being saved: once an error
  page is on disk under the target name, the existence check above would
  prevent it from ever being retried.

  Args:
    url: address of the resource to fetch (redirects are followed).
    out_path: destination file path; its parent directory must already exist.
  '''
  if os.path.exists(out_path):
    return
  r = requests.get(url, allow_redirects=True)
  if not r.ok:
    print(' ! could not download', url, r.status_code)
    return
  # the context manager closes the handle even when the write fails
  with open(out_path, 'wb') as out:
    out.write(r.content)

def download_voynichese_coords(page_id):
  '''Fetch the coords script for page `page_id` from voynichese.com.

  The file is stored as voynichese/coords/<page_id>.js.
  '''
  filename = page_id + '.js'
  source = 'http://www.voynichese.com/2/data/folio/script/' + filename
  target = join('voynichese', 'coords', filename)
  download_from_url(source, target)

def download_voynichese_page(page_id):
  '''Fetch the page image for page `page_id` from voynichese.com.

  The file is stored as voynichese/images/<page_id>.jpg.
  '''
  filename = page_id + '.jpg'
  source = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + filename
  target = join('voynichese', 'images', filename)
  download_from_url(source, target)

def download_voynichese_data(page_ids):
  '''Fetch the word coords and the page image for every id in `page_ids`.

  The voynichese/coords and voynichese/images directories are created first
  when they are missing.
  '''
  for subdir in ('coords', 'images'):
    target_dir = join('voynichese', subdir)
    if os.path.exists(target_dir):
      continue
    os.makedirs(target_dir)
  for page_id in page_ids:
    download_voynichese_coords(page_id)
    download_voynichese_page(page_id)

# download the coords and image of every page id that is a key of helpers.pages
voynichese_page_ids = pages.keys()
print(' * preparing to download', sorted(voynichese_page_ids))
download_voynichese_data(voynichese_page_ids)

# Download Biodiversity Heritage Library Images

In [None]:
from copy import deepcopy
from os.path import join
import flickr_api # use python 2 kernel
import os

# make sure the image and metadata output folders exist
root_out_dir = 'biodivlibrary'
for subdir in ('images', 'metadata'):
  out_dir = os.path.join(root_out_dir, subdir)
  if os.path.exists(out_dir):
    continue
  os.makedirs(out_dir)

# collection-level values that every photo record in this collection starts from
collection_meta = deepcopy(meta)
collection_meta['collection'] = 'Biodiversity Heritage Library'
collection_meta['permalink'] = 'https://www.flickr.com/photos/biodivlibrary/'

# configure api keys
# The key and secret are read from the environment so that they are not
# committed together with the notebook. Export FLICKR_API_KEY and
# FLICKR_API_SECRET before starting the kernel; a missing variable raises
# KeyError here, before any request is made.
flickr_api.set_keys(
  api_key=os.environ['FLICKR_API_KEY'],
  api_secret=os.environ['FLICKR_API_SECRET'],
)
if os.path.exists('flickr.credentials'):
  # reuse the token saved by an earlier authorization
  flickr_api.set_auth_handler('flickr.credentials')
else:
  from getpass import getpass # local import keeps this cell self-contained
  a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object
  perms = 'read' # set the required permissions
  url = a.get_authorization_url(perms)
  print(url) # open the printed url in a browser and agree
  # The verifier code is issued for the authorization started just above, so it
  # is asked for interactively instead of being hardcoded in the notebook.
  a.set_verifier(getpass(' * paste the verifier code from the xml response: '))
  flickr_api.set_auth_handler(a)
  a.save('flickr.credentials')

# run initial query
user = flickr_api.Person.findByUserName('biodivlibrary')
errored = [] # photos that could not be saved are collected here
photos = user.getPhotos(page=2, perpage=100)

In [None]:
import time, traceback, json

# Flickr numbers result pages from 1, so walk pages 1..N inclusive. Counting
# from 0 up to N-1 never requests the last page.
for page_num in range(1, int(photos.info.pages) + 1):
  print(' * page num', page_num)
  page_photos = user.getPhotos(page=page_num, perpage=100)
  for photo in page_photos:
    try:
      # save photo
      out_path = join(root_out_dir, 'images', photo.id + '.jpg')
      print(' * saving', out_path)
      photo.save(out_path)
      time.sleep(2) # pause between photos
      # save meta
      info = photo.getInfo()
      # prefer the photo's own url, fall back to the collection permalink
      url = info.get('urls', {}).get('url', [])
      url = url[0].get('text', collection_meta['permalink']) if url else collection_meta['permalink']
      # A dedicated name keeps the module-level `meta` template intact, and a
      # dedicated comprehension variable keeps `photo` from being clobbered by
      # the last tag on Python 2, where comprehension variables leak.
      photo_meta = deepcopy(collection_meta)
      photo_meta.update({
        'permalink': url,
        'title': info.get('title', ''),
        'description': info.get('description', ''),
        'tags': [tag.text for tag in info.get('tags', [])],
        'views': info.get('views', ''),
      })
      out_path = join(root_out_dir, 'metadata', photo.id + '.json')
      with open(out_path, 'w') as out:
        json.dump(photo_meta, out)

    except Exception:
      # best effort: report the failure, remember the photo and carry on
      print(' * could not save', photo, '\n', traceback.format_exc().splitlines())
      errored.append(photo)

# Download Smithsonian Images

In [None]:
from bs4 import BeautifulSoup
import requests
import os

def get_page_images(page_number=0):
  '''Save each herbal image from listing page `page_number` onwards to disk.

  Listing pages are walked in a loop for as long as the current page contains
  a link to the following one. Iterating instead of recursing keeps the call
  stack flat however many pages there are.

  Args:
    page_number: index of the first listing page to fetch (default 0).
  '''
  while True:
    print(' * fetching page', page_number)
    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))
    text = r.content.decode('utf8')
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(text, 'html.parser')
    for i in soup.select('.dams-image'):
      try:
        src = i.find('img')['src']
        # the image id is whatever follows `id=` in the last path segment
        image_id = os.path.basename(src).split('id=')[1]
        download_image(image_id)
      except Exception as exc:
        # best effort: report the element that failed and move on
        print(' ! err', i, exc)
    if '/topic/botany/images?page=' + str(page_number + 1) not in text:
      break
    page_number += 1
  
def download_image(_id):
  '''Download an image by SI image id.

  The bytes are written to `<out_dir>/<_id>.jpg`, where `out_dir` is the
  module-level output directory.

  Args:
    _id: image id appended to the delivery service url.
  '''
  r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)
  # the context manager closes the handle even when the write fails
  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:
    out.write(r.content)

# create the output folder on first use; download_image writes into `out_dir`
out_dir = 'smithsonian-images'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

# crawl the listing, starting from the default first page
get_page_images()

In [None]:
# NOTE(review): bare expression that shows `collection_meta` as the cell output; looks like a leftover inspection cell -- confirm whether it is still needed
collection_meta

# Download Bodleian Images

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import requests
import os

def download_query_results():
  '''Download every child image of each search result found at `url`.

  The module-level `url` is opened in a Chrome browser, the rendered page is
  parsed, and each id listed in a result's `data-children` attribute is passed
  to `download_image`. A failing id is reported and skipped.
  '''
  driver = webdriver.Chrome()
  try:
    driver.get(url)
    time.sleep(2) # presumably gives the page time to render its results
    text = driver.page_source
  finally:
    # always close the browser, otherwise every run leaves a Chrome process behind
    driver.quit()
  # name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed
  soup = BeautifulSoup(text, 'html.parser')
  collections = soup.select('.result')
  print(' * got', len(collections), 'collections')
  for i in collections:
    # `data-children` holds a comma separated list of image ids
    children = i['data-children'].strip().split(',')
    for j in children:
      try:
        download_image(j)
      except Exception as exc:
        print(' ! err', j, exc)
    
def download_image(_id):
  '''Download the Bodleian image with id `_id` as a jpeg.

  The bytes are written to `<out_dir>/<_id>.jpg`, where `out_dir` is the
  module-level output directory.

  Args:
    _id: image id substituted into the resolver url.
  '''
  print(' * downloading', _id)
  r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))
  # the context manager closes the handle even when the write fails
  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:
    out.write(r.content)
  
# create the output folder on first use; download_image writes into `out_dir`
out_dir = 'bodleian-images'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

# search page address that download_query_results opens in the browser
root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'
query = 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'
url = root_url + query

download_query_results()

# Download Padova Images

In [None]:
import requests, os

# every image of this cell is written into the `padova` folder
out_dir = 'padova'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

for i in ['recto', 'verso']:
  for j in range(1, 28):
    # page numbers are zero padded to three digits, e.g. 001-recto.jpg
    img = '{0}-{1}.jpg'.format(str(j).zfill(3), i)
    url = 'https://medicaltraditions.org/images/stories/manuscripts/demateriamedica/{0}'.format(img)
    r = requests.get(url)
    if not r.ok:
      # do not store an error page under an image name
      print(' ! could not fetch', url, r.status_code)
      continue
    # the context manager closes the handle even when the write fails
    with open(os.path.join(out_dir, img), 'wb') as out:
      out.write(r.content)

# Download Schoenberg Images

In [None]:
import requests, os

for book_id in ['ljs419', 'ljs062']:

  # one output folder per book
  out_dir = os.path.join('schoenberg', book_id)
  if not os.path.exists(out_dir):
    os.makedirs(out_dir)

  for i in range(500):
    idx = str(i).zfill(4) # image numbers are zero padded to four digits
    for j in ['front', 'body']:
      # built outside the try block so `url` is always bound in the handler
      img = '{0}_{1}{2}'.format(book_id, j, idx)
      url = 'http://images.library.upenn.edu/mrsidsceti/bin/image_jpeg.pl?coll=schoenberg&subcoll={0}&image={1}.sid&level=2'.format(book_id, img)
      try:
        r = requests.get(url)
        print(url, len(r.content))
        # responses of 1000 bytes or fewer are not saved
        # (presumably placeholders for images that do not exist -- confirm)
        if len(r.content) > 1000:
          with open(os.path.join(out_dir, img + '.jpg'), 'wb') as out:
            out.write(r.content)
      except Exception as exc:
        # `Exception` rather than a bare except, so that interrupting the
        # kernel still stops this long loop
        print(' ! could not fetch page', url, exc)

# Bax - Italian Herbal

In [None]:
from bs4 import BeautifulSoup

import os, requests # this cell uses both, so it should not rely on earlier cells for them

out_dir = 'bax/italian-herbal/'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

# system uses Islandora and given a list of thumbnails like:
# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages
# one can transform each thumbnail url from
# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/TN/view
# to another DATASTREAM in Islandora https://wiki.duraspace.org/display/ISLANDORA/APPENDIX+C+-+DATASTREAM+REFERENCE
# e.g. http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/JPG/view
for page_idx, page in enumerate(range(1, 12, 1)):
  url = 'http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages?page={0}'.format(page)
  html = requests.get(url).text
  # name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed
  soup = BeautifulSoup(html, 'html.parser')
  for idx, i in enumerate(soup.select('.islandora-objects-grid')[0].select('img')):
    src = i['src']
    # swap the thumbnail datastream for the JPG one
    url = src.replace('/TN/', '/JPG/')
    r = requests.get(url)
    # file name is <listing page idx>-<image idx on that page>.jpg
    img = '{0}-{1}.jpg'.format(page_idx, idx)
    # the context manager closes the handle even when the write fails
    with open(os.path.join(out_dir, img), 'wb') as out:
      out.write(r.content)

# Bax - General History of the Things of New Spain
Sahagun

In [None]:
import os, requests

# every image of this cell is written into the `bax/sahagun/` folder
out_dir = 'bax/sahagun/'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

for i in range(1000):
  url = 'https://content.wdl.org/10622/service/thumbnail/1403114302/1024x1024/1/{0}.jpg'.format(i)
  r = requests.get(url)
  # responses of 500 bytes or fewer are not saved
  # (presumably error pages for image numbers that do not exist -- confirm)
  if len(r.content) > 500:
    img = str(i) + '.jpg'
    # the context manager closes the handle even when the write fails
    with open(os.path.join(out_dir, img), 'wb') as out:
      out.write(r.content)

# Bax - Kitab
Manuscrit: Kitāb 'ağā'ib al-maḫlūqāt wa ġarā'ib ... Qazwīnī, Zakariyyā ibn Muḥammad ibn Maḥmūd al- (1203-1283). Auteur du texte

In [None]:
import os, requests

# every image of this cell is written into the `bax/kitab/` folder
out_dir = 'bax/kitab/'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

for i in range(1000):
  url = 'https://gallica.bnf.fr/ark:/12148/btv1b8406160j/f{0}.medres'.format(i)
  r = requests.get(url)
  # responses of 500 bytes or fewer are not saved
  # (presumably error pages for folio numbers that do not exist -- confirm)
  if len(r.content) > 500:
    img = str(i) + '.jpg'
    # the context manager closes the handle even when the write fails
    with open(os.path.join(out_dir, img), 'wb') as out:
      out.write(r.content)

# MS. Canon. Misc. 408

In [None]:
# url https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/full/full/0/default.jpg
# info https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/info.json

# Hunt Images

In [None]:
from bs4 import BeautifulSoup
import os, requests

# every image of this cell is written into the `hunt` folder
out_dir = 'hunt'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

for i in range(1,185,1):
  url = 'http://huntbot.org/artdb/art-collection-search?page={0}'.format(i)
  html = requests.get(url).text
  # name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed
  soup = BeautifulSoup(html, 'html.parser')
  for idx, j in enumerate(soup.select('.views-field img')):
    url = j['src']
    r = requests.get(url)
    # responses of 500 bytes or fewer are not saved
    # (presumably broken or placeholder images -- confirm)
    if len(r.content) > 500:
      # file name is <search page number>-<image idx on that page>.jpg
      img = '{0}-{1}.jpg'.format(i, idx)
      # the context manager closes the handle even when the write fails
      with open(os.path.join(out_dir, img), 'wb') as out:
        out.write(r.content)

# Morgan Images

Good candidate for discussing data "scraping" with students

In [None]:
from bs4 import BeautifulSoup
import os, requests

# every image of this cell is written into the `morgan` folder
out_dir = 'morgan'
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

root_url = 'http://ica.themorgan.org/'
for i in range(0,4,1): # page idx
  url = 'https://www.themorgan.org/manuscripts/list?page={0}'.format(i)
  html = requests.get(url).text
  # name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed
  list_soup = BeautifulSoup(html, 'html.parser')
  for idx, j in enumerate(list_soup.select('.views-field-field-collection-images-link a')): # manuscript idx on page
    r = requests.get(j['href'])
    # a separate name, so the list page being iterated is not rebound mid-loop
    manuscript_soup = BeautifulSoup(r.text, 'html.parser')
    for kdx, k in enumerate(manuscript_soup.select('.starter-template img')): # image idx in manuscript
      # image sources are resolved against `root_url` after dropping any '../'
      url = root_url + k['src'].replace('../', '')
      try:
        r = requests.get(url)
        # responses of 500 bytes or fewer are not saved
        # (presumably broken or placeholder images -- confirm)
        if len(r.content) > 500:
          # The manuscript idx restarts at 0 on every list page, so the page
          # idx has to be part of the name; without it the images of a later
          # page overwrite those of an earlier one.
          img = '{0}-{1}-{2}.jpg'.format(i, idx, kdx)
          with open(os.path.join(out_dir, img), 'wb') as out:
            out.write(r.content)
      except Exception as exc:
        # best effort: report which image failed and why, then carry on
        print(' ! trouble with', i, idx, kdx, exc)