In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen

#get data in parallel
from multiprocessing import Pool

#plot (for image verification)
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Project root: the parent of the directory the notebook is run from
ROOT_DIR = os.path.abspath("../")

# Workspace folder under the project root
path_ml = os.path.join(ROOT_DIR, 'SNAKES')

# Folder holding the HerpMapper CSV and the downloaded images
folder_path = os.path.join(
    path_ml,
    'datasets',
    'herpmapper_images',
)

# download data

In [3]:
# TODO: ask Andrew for a list of all species; that is better than relying on the folder contents.

In [4]:
#produce a list of species to search for:
#the snapp_images folder is laid out as <genus>/<species>, so every entry found
#two levels below it is treated as a species name
snapp_root = os.path.join(path_ml,'datasets','snapp_images')
li_species = [
    # basename instead of split('/') so this also works where the path separator is not '/'
    os.path.basename(species)
    for genus in glob.glob(os.path.join(snapp_root,'*'))
    for species in glob.glob(os.path.join(genus,'*'))
]
print('We have in total %d species'%len(li_species))

We have in total 3700 species


In [5]:
#load the HerpMapper export; its 'Vouchers' column holds the image urls to request
herpmapper_csv = os.path.join(folder_path, 'herpmapper.csv')
df_all = pd.read_csv(herpmapper_csv)
print(df_all.shape)
df_all.head(3)

(236944, 8)


Unnamed: 0,ID,Date,Time,Taxon,Country,Level 1,Level 2,Vouchers
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...
1,HM 2,2006-11-07,17:26:00,Lithobates pipiens,United States of America,Iowa,Benton,https://www.herpmapper.org/content/voucher/0/2...
2,HM 3,2006-11-05,11:55:00,Coluber constrictor flaviventris,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/4...


# create a csv file with info of images

In [6]:
#keep only the records whose taxon is one of the species we are looking for
df = df_all[df_all['Taxon'].isin(li_species)]
print(df.shape)
#drop records without any voucher; take an explicit copy so that adding a column
#below writes to an independent frame and not to a slice of df_all
df = df[~df['Vouchers'].isnull()].copy()
print(df.shape)
#'Vouchers' is a comma-separated string of urls: turn it into a list per record
df['url_list'] = df['Vouchers'].map(lambda x: x.split(','))
df.head(3)

(40635, 8)
(40632, 8)


Unnamed: 0,ID,Date,Time,Taxon,Country,Level 1,Level 2,Vouchers,url_list
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...,[https://www.herpmapper.org/content/voucher/0/...
4,HM 5,2006-11-04,12:55:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,[https://www.herpmapper.org/content/voucher/0/...
5,HM 6,2006-10-30,09:05:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,[https://www.herpmapper.org/content/voucher/0/...


In [7]:
#distinct taxa that still have at least one record after filtering
taxa_with_images = set(df['Taxon'])
li_species_herpmapper = list(taxa_with_images)
print('There is %d species with at least one image in HerpMapper'%len(li_species_herpmapper))

There is 946 species with at least one image in HerpMapper


In [8]:
#one line per image: spread each record's url_list over as many rows as it has urls
record_keys = ['Date', 'Level 1', 'Level 2', 'Time', 'Taxon','ID']
urls_long = df.set_index(record_keys)['url_list'].apply(pd.Series).stack()
df = urls_long.reset_index().rename(columns={0:'url'})


def _image_id_from_url(url):
    """Take the text between the last two dots of the url and join its last two '/'-separated parts with '-'."""
    stem = url.split('.')[-2]
    return '-'.join(stem.split('/')[-2:])


def _local_image_path(row):
    """Build the local path <herpmapper_images>/<Taxon>/<Taxon>_<id>.png for one row."""
    file_name = row['Taxon']+'_'+row['id']+".png"
    return os.path.join(path_ml,'datasets','herpmapper_images',row['Taxon'],file_name)


#image id derived from the parts of the HerpMapper url that are not supposed to change
#TODO: to be verified by Andrew that this id is indeed unique (HerpMapper should know)
df['id'] = df['url'].map(_image_id_from_url)
df['img_path'] = df.apply(_local_image_path, axis=1)
print('There is %d images to download from Herpmapper'%df.shape[0])
df.head(3)

There is 52831 images to download from Herpmapper


Unnamed: 0,Date,Level 1,Level 2,Time,Taxon,ID,level_6,url,id,img_path
0,2006-11-08,Iowa,Johnson,13:42:00,Storeria dekayi,HM 1,0,https://www.herpmapper.org/content/voucher/0/1...,0-1,/mount/SDB/camille-secure/SNAKES/datasets/herp...
1,2006-11-04,Iowa,Linn,12:55:00,Storeria dekayi,HM 5,0,https://www.herpmapper.org/content/voucher/0/1...,0-13,/mount/SDB/camille-secure/SNAKES/datasets/herp...
2,2006-11-04,Iowa,Linn,12:55:00,Storeria dekayi,HM 5,1,https://www.herpmapper.org/content/voucher/0/...,0-14,/mount/SDB/camille-secure/SNAKES/datasets/herp...


# download image from HerpMapper

In [9]:
#in case to erase all
#shutil.rmtree(os.path.join(path_ml,'datasets','herpmapper_images_old'))

In [10]:
#create one folder per species (no-op for folders that already exist)
for species in li_species_herpmapper:
    # exist_ok avoids the separate exists() check and the race between check and creation
    os.makedirs(os.path.join(folder_path, species), exist_ok=True)

In [11]:
# Seconds to wait on a single HTTP request before giving up on that image
DOWNLOAD_TIMEOUT_S = 60
# Number of worker processes used to download images in parallel
N_DOWNLOAD_WORKERS = 500


def download_herpmapper_image(i):
    """Download the image described by row ``i`` of ``df`` and save it as a PNG.

    The file is written to ``<folder_path>/<Taxon>/<Taxon>_<id>.png``.

    Parameters
    ----------
    i : int
        Positional index of the row in the module-level ``df``.

    Returns
    -------
    None
        Any ``Exception`` raised while fetching or saving is reported on
        stdout and swallowed, so one bad url does not abort the whole run.
        ``KeyboardInterrupt`` is not an ``Exception`` and still propagates.
    """
    # look the row up once instead of calling df.iloc[i] for every field
    row = df.iloc[i]
    species = row['Taxon']
    image_id = row['id']
    target_path = os.path.join(os.path.join(folder_path, species),
                               species+'_'+image_id+".png")
    try:
        # the context manager closes the HTTP response even when decoding fails;
        # the timeout keeps a stalled connection from blocking a worker forever
        with urlopen(row['url'], timeout=DOWNLOAD_TIMEOUT_S) as response:
            img = Image.open(response)
            img.save(target_path)
    except Exception as e:
        # single print call so the report of one failure is less likely to be
        # interleaved with output from the other worker processes
        print('Not able to DOWNLOAD herpmapper image for species %s with id %s : \n%s\n'
              %(species, image_id, e))

#compute time needed to gather all images
start = time.time()
with Pool(N_DOWNLOAD_WORKERS) as p:
    p.map(download_herpmapper_image, range(df.shape[0]))
end = time.time()
print ("Total running time: ", end-start) #in seconds

Not able to DOWNLOAD herpmapper image for species Crotalus atrox with id 6-60465 : 
cannot identify image file <_io.BytesIO object at 0x7fecddbec620>

Not able to DOWNLOAD herpmapper image for species Crotalus horridus with id 2-21098 : 
HTTP Error 504: Gateway Time-out

Not able to DOWNLOAD herpmapper image for species Opheodrys vernalis with id 6-63743 : 
HTTP Error 404: Not Found

Not able to DOWNLOAD herpmapper image for species Pantherophis vulpinus with id 26-260082 : 
HTTP Error 404: Not Found

Total running time:  1218.6887047290802


# Verify the download errors

def show_failed_download(taxon, image_id):
    """Retry one image that failed to download and show either the image or the error.

    Parameters
    ----------
    taxon : str
        Value of the 'Taxon' column of the image to retry.
    image_id : str
        Value of the 'id' column of the image to retry.

    Prints the url, then tries to fetch and decode it. On failure the error is
    printed instead of raised, so the notebook keeps running; on success the
    image is displayed.
    """
    url = df[(df['Taxon']==taxon)&(df['id']==image_id)]['url'].values[0]
    print(url)
    try:
        # close the HTTP response once the image has been fully decoded
        with urlopen(url) as response:
            img = Image.open(response)
            img.load()
    except Exception as e:
        print('Still not able to open the image: %s'%e)
        return
    # one figure per call so successive calls do not draw over each other
    plt.figure()
    plt.imshow(img)
    plt.show()


#indeed url can not be found
show_failed_download('Opheodrys vernalis', '6-63743')

#indeed: its a video
show_failed_download('Crotalus atrox', '6-60465')

#indeed url can not be found
show_failed_download('Pantherophis vulpinus', '26-260082')

In [14]:
#the "HTTP Error 504: Gateway Time-out" error may appear; in that case, re-run the download another day, it may succeed then

# Check amount of Herpmapper images we have

In [12]:
#make sure the HerpMapper folder is there before scanning it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
#go through every sub-folder of the HerpMapper folder and collect its files,
#leaving out the csv files
li_images_hm = []
for species_dir in glob.glob(os.path.join(folder_path,'*')):
    for file_path in glob.glob(os.path.join(species_dir,'*')):
        if file_path.endswith('.csv'):
            continue
        li_images_hm.append(file_path)
print('We have %d images collected from Herpmapper'%len(li_images_hm))

We have 52827 images collected from Herpmapper


In [13]:
#save metadata info (only collected images)
n_expected = df.shape[0]
#keep only the rows whose image file was actually found on disk
df = df[df['img_path'].isin(li_images_hm)]
#count the loss from the filtered frame: li_images_hm lists every file found on disk,
#including files that match no row of df, so its length is not a reliable basis
n_lost = n_expected - df.shape[0]
#guard against an empty frame so the report does not fail with a division by zero
pct_lost = n_lost/n_expected*100 if n_expected else 0.0
print('We lost %.2f percent of images (%d images)'%(pct_lost, n_lost))
df.to_csv(os.path.join(folder_path,'herpmapper_image_info.csv'), index=False, sep=';')

We lost 0.01 percent of images (4 images)
