# Download Voynichese Images

In [None]:
from os.path import join
import requests
import os

def download_from_url(url, out_path, timeout=60):
  '''Download the file at location `url` and write it to `out_path`.

  Nothing is fetched when `out_path` already exists, so reruns only
  download what is missing.

  url: address of the resource to fetch (redirects are followed)
  out_path: destination file; its parent directory must already exist
  timeout: seconds to wait on the server before giving up

  Raises requests.HTTPError on a 4xx/5xx response, so that an error page is
  never written to disk and then mistaken for a finished download later.
  '''
  if os.path.exists(out_path):
    return
  r = requests.get(url, allow_redirects=True, timeout=timeout)
  r.raise_for_status()
  # context manager guarantees the handle is flushed and closed
  with open(out_path, 'wb') as out:
    out.write(r.content)

def download_voynichese_coords(page_id):
  '''Fetch the coords script for folio `page_id` from voynichese.com.

  The file is stored as voynichese/coords/<page_id>.js.
  '''
  filename = page_id + '.js'
  destination = join('voynichese', 'coords', filename)
  source = 'http://www.voynichese.com/2/data/folio/script/' + filename
  download_from_url(source, destination)

def download_voynichese_page(page_id):
  '''Fetch the large color image of folio `page_id` from voynichese.com.

  The file is stored as voynichese/images/<page_id>.jpg.
  '''
  filename = page_id + '.jpg'
  destination = join('voynichese', 'images', filename)
  source = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + filename
  download_from_url(source, destination)

def download_voynichese_data(page_ids):
  '''Download page images and word coords from voynichese.com.

  page_ids: iterable of folio ids; for each one the coords script is saved
    under voynichese/coords and the page image under voynichese/images.

  A page that fails to download is reported and skipped so that one bad id
  does not abort the whole batch.
  '''
  # these must match the directories the two per-page download helpers write to
  for i in ['coords', 'images']:
    os.makedirs(join('voynichese', i), exist_ok=True)
  for page_id in page_ids:
    try:
      download_voynichese_coords(page_id)
      download_voynichese_page(page_id)
    except Exception as exc:
      print(' ! err', page_id, exc)

# Kick off the Voynichese download for every key of `pages`.
# NOTE(review): `pages` is not defined in this cell; presumably a mapping keyed
# by folio id that an earlier cell creates -- confirm before running.
download_voynichese_data(pages.keys())

# Download Biodiversity Heritage Library Images

In [None]:
from os.path import join
import flickr_api
import os

# NOTE(security): the key/secret literals below are committed in plain text.
# Supply FLICKR_API_KEY / FLICKR_API_SECRET in the environment instead; the
# literals stay only as a backward-compatible fallback and should be rotated.
flickr_api.set_keys(
  api_key=os.environ.get('FLICKR_API_KEY') or 'a704ce9732b363a9caece2d65f7d041a',
  api_secret=os.environ.get('FLICKR_API_SECRET') or 'f3f5e1d5baaf4d38',
)
if os.path.exists('flickr.credentials'):
  # reuse the credentials file written by the verifier cell on an earlier run
  flickr_api.set_auth_handler('flickr.credentials')
else:
  a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object
  perms = 'read' # set the required permissions
  url = a.get_authorization_url(perms)
  print(url) # open the printed url in a browser and agree; paste verifier code in xml response below

In [None]:
# Second half of the authorization handshake; only runs when no credentials
# file exists yet, i.e. when the previous cell created the handler `a`.
if not os.path.exists('flickr.credentials'):
  # verifier code pasted from the response of the authorization url printed above;
  # NOTE(review): presumably single-use -- replace with a fresh code on each new authorization
  a.set_verifier('5b58510bb6f0641b')
  flickr_api.set_auth_handler(a)
  # persist the handler so later runs take the 'flickr.credentials' branch instead
  a.save('flickr.credentials')

In [None]:
# account whose photos are downloaded by the loop in the next cell
user = flickr_api.Person.findByUserName('biodivlibrary')
# photos that could not be saved are appended here by the download loop
errored = []
# one listing request; its `info` provides the page count read by the download loop
photos = user.getPhotos(page=2, perpage=100)
# bare expression: shown as the cell output when run in a notebook
photos.info

In [None]:
import time

# each photo is stored in this directory under its Flickr photo id
os.makedirs('biodivlibrary-images', exist_ok=True)

# NOTE(review): hard-coded first page -- presumably a resume point left over
# from an interrupted run; set to 1 to walk the whole photostream.
page_num = 6
# Flickr numbers result pages 1..pages, so the upper bound is inclusive;
# a strict `<` would never fetch the final page.
while page_num <= photos.info.pages:
  print(' * page num', page_num)
  page_photos = user.getPhotos(page=page_num, perpage=100)
  for i in page_photos:
    try:
      out_path = join('biodivlibrary-images', i.id)
      # NOTE(review): assumes `save` writes to exactly `out_path`; if the
      # library appends a file extension this skip never triggers -- confirm.
      if os.path.exists(out_path): continue
      i.save(out_path)
      time.sleep(2) # pause between downloads
    except Exception as exc:
      # `Exception` rather than a bare except, so Ctrl-C still stops the loop
      print(' * could not save', i, exc)
      errored.append(i)
  page_num += 1

# Download Smithsonian Images

In [None]:
from bs4 import BeautifulSoup
import requests
import os

def get_page_images(page_number=0):
  '''Save each herbal image from page `page_number` onward to disk.

  Starting at `page_number`, every listing page is fetched, each element
  matching `.dams-image` is resolved to an SI image id and handed to
  `download_image`, and the walk continues while the page text contains a
  link to the following page number.

  Pages are walked in a loop rather than by recursion so that a long listing
  cannot exhaust the interpreter's recursion limit.

  Raises requests.HTTPError if a listing page answers with 4xx/5xx. Failures
  on an individual image are printed and skipped.
  '''
  while True:
    print(' * fetching page', page_number)
    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number), timeout=60)
    r.raise_for_status()
    text = r.content.decode('utf8')
    # name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed
    soup = BeautifulSoup(text, 'html.parser')
    for i in soup.select('.dams-image'):
      try:
        src = i.find('img')['src']
        # the image id is whatever follows 'id=' in the last path segment of src
        image_id = os.path.basename(src).split('id=')[1]
        download_image(image_id)
      except Exception as exc:
        print(' ! err', i, exc)
    page_number += 1
    # stop once the current page no longer links to the next page number
    if '/topic/botany/images?page=' + str(page_number) not in text:
      break
  
def download_image(_id):
  '''Download an image by SI image id and save it as <out_dir>/<_id>.jpg.

  `out_dir` is the module-level output directory and must exist already.

  Raises requests.HTTPError on a 4xx/5xx response so that an error body is
  never stored under a .jpg name.
  '''
  r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True, timeout=60)
  r.raise_for_status()
  # context manager guarantees the handle is flushed and closed
  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:
    out.write(r.content)

# make output directory; `out_dir` is read as a global by download_image
out_dir = 'smithsonian-images'
if not os.path.exists(out_dir): os.makedirs(out_dir)

# fetch images, starting from listing page 0 (the default) and following next-page links
get_page_images()