In [18]:
#basic package
import ast
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen

#get data in parallel
from multiprocessing import Pool

#plot (for image verification)
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [19]:
# The project root is taken to be the directory the notebook is launched from;
# every HerpMapper input and output lives below path_data.
ROOT_DIR = os.getcwd()
path_data = os.path.join(ROOT_DIR, 'datasets/herpmapper')

In [20]:
# Make the project's UTILS package importable by adding a directory two levels up to sys.path.
PACKAGE_PARENT = '../..'
# NOTE: '__file__' is passed as a quoted string literal here, not the module variable, so the
# path is resolved against os.getcwd(); SCRIPT_DIR therefore depends on where the kernel runs.
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
# Appended (not inserted first), so entries already on sys.path take precedence.
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

# Project-local helper used by the download cell; must come after the sys.path change above.
from UTILS.utils import get_image

In [21]:
#Note: on my computer this notebook cannot be run at the same time as get_flickr_data, although it can on the cluster.
#Depending on your machine you may or may not be able to run both simultaneously; if you try, verify whether it is
#blocked.

# Download data

#### List of species and their synonyms

In [22]:
# Load the species/synonym table.
df_species_syn = pd.read_csv(os.path.join(ROOT_DIR, 'datasets/synonyms/df_species_syn_andrew.csv'), sep=',')
# 'li_synonyms_final' is stored in the CSV as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would execute
# arbitrary code found in a cell of the file.
df_species_syn['li_synonyms_final'] = df_species_syn['li_synonyms_final'].map(ast.literal_eval)
print(df_species_syn.shape)
df_species_syn.head(3)

(3730, 7)


Unnamed: 0,binomial,split2,split2_clean,li_synonyms,li_synonyms_clean,syn_equal_other_species,li_synonyms_final
0,Hydrophis peronii,"c(""Acalyptus Peronii"", ""? Acalyptus"", ""Acalypt...","['Acalyptophis peroni', 'Acalyptophis peronii'...","['Acalyptophis peroni', 'Acalyptophis peronii'...","['Acalyptophis peroni', 'Acalyptophis peronii'...",[],"[Acalyptophis peroni, Acalyptophis peronii, Ac..."
1,Acanthophis antarcticus,"c(""Boa antarctica"", ""Acanthophis cerastinus"", ...","['Vipera sorda', 'Acanthophis antarcticus', 'B...","['Vipera sorda', 'Boa ambigua', 'Boa antarctic...","['Vipera sorda', 'Boa ambigua', 'Boa antarctic...",[],"[Vipera sorda, Boa ambigua, Boa antarctica, Vi..."
2,Acanthophis hawkei,"c(""Acanthophis hawkei"", ""Acanthophis cummingi""...","['Acanthophis hawkei', 'Acanthophis cummingi']",['Acanthophis cummingi'],['Acanthophis cummingi'],[],[Acanthophis cummingi]


#### Create the list of species and their synonyms

In [23]:
# species name -> list of its synonyms
dico_species_syn = dict(zip(df_species_syn['binomial'], df_species_syn['li_synonyms_final']))
# Flat list of every word to query: each species name immediately followed by its synonyms.
li_species_word = [
    word
    for species, synonyms in dico_species_syn.items()
    for word in [species] + list(synonyms)
]
print('There is %d species-word in total to query for'%len(li_species_word))

There is 12425 species-word in total to query for


In [24]:
# Reverse lookup: species word (a synonym or the species name itself) -> species name.
# When the same word appears under several species, the species visited last wins.
dico_speciesword_species = {
    word: species
    for species, synonyms in dico_species_syn.items()
    for word in synonyms + [species]
}

#### herpmapper info

In [25]:
# Load the HerpMapper records; the 'Vouchers' column holds the URLs requested later on.
herpmapper_csv = os.path.join(path_data, 'herpmapper.csv')
df_all = pd.read_csv(herpmapper_csv)
print(df_all.shape)
df_all.head(3)

(236944, 8)


Unnamed: 0,ID,Date,Time,Taxon,Country,Level 1,Level 2,Vouchers
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...
1,HM 2,2006-11-07,17:26:00,Lithobates pipiens,United States of America,Iowa,Benton,https://www.herpmapper.org/content/voucher/0/2...
2,HM 3,2006-11-05,11:55:00,Coluber constrictor flaviventris,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/4...


# Create a csv file with info of images

In [26]:
# Keep only the records whose taxon is one of our species words (names or synonyms).
df = df_all[df_all['Taxon'].isin(li_species_word)]
print(df.shape)
# Drop the records without any voucher URL. The explicit .copy() detaches df from df_all,
# so the column assignments below act on an independent frame instead of a slice
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['Vouchers'].notna()].copy()
print(df.shape)
# Resolve each species word to its species name; every remaining Taxon is a key of the
# lookup because li_species_word and dico_speciesword_species hold the same words.
df['species_name'] = df['Taxon'].map(dico_speciesword_species)
# 'Vouchers' is a comma-separated string of URLs: turn it into a list of URLs.
df['url_list'] = df['Vouchers'].str.split(',')
df.head(3)

(41093, 8)
(41090, 8)


Unnamed: 0,ID,Date,Time,Taxon,Country,Level 1,Level 2,Vouchers,species_name,url_list
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...
4,HM 5,2006-11-04,12:55:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...
5,HM 6,2006-10-30,09:05:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...


In [27]:
# Distinct species names that still have at least one record (hence at least one voucher URL).
li_species_herpmapper = list(set(df['species_name']))
print('There is %d species-name with at least one image in HerpMapper'%len(li_species_herpmapper))

There is 993 species-name with at least one image in HerpMapper


In [28]:
# Reshape to one row per image: each element of 'url_list' becomes its own row, with the
# record-level columns repeated. The position within the list ends up in 'level_8'.
record_columns = ['Date', 'Level 1', 'Level 2', 'Time', 'Country', 'Taxon', 'ID', 'species_name']
df = df.set_index(record_columns)['url_list'].apply(pd.Series).stack()
df = df.reset_index()
df = df.rename(columns={0:'url'})
# Build an id per image from the last two path components of the URL, extension dropped
# (e.g. '.../voucher/0/1.jpg' -> '0-1'); this part of the URL is not supposed to change.
#TODO: to be verified by Andrew whether this id is indeed unique (herpmapper should know)
df['id'] = df['url'].map(lambda x: '-'.join(x.split('.')[-2].split('/')[-2:]))
# img_path must be exactly where the download cell writes the file, i.e.
# path_data/<species_name>/herpmapper_<species_name>_<id>.png, otherwise the later
# "keep only collected images" filter cannot match the files found on disk.
# (The previous version repeated 'datasets/herpmapper' after path_data and used the
# queried Taxon, which differs from species_name whenever the taxon is a synonym.)
df['img_path'] = df.apply(lambda x: os.path.join(path_data, x['species_name'],
                                                 'herpmapper_'+x['species_name']+'_'+x['id']+".png"), axis=1)
df['saved_img_id'] = df['img_path'].map(os.path.basename)
print('There is %d images to download from Herpmapper'%df.shape[0])
#using all synonyms gains 558 images, ending up with 53389 images
df.head(3)

There is 53389 images to download from Herpmapper


Unnamed: 0,Date,Level 1,Level 2,Time,Country,Taxon,ID,species_name,level_8,url,id,img_path,saved_img_id
0,2006-11-08,Iowa,Johnson,13:42:00,United States of America,Storeria dekayi,HM 1,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-1,/home/camille/vm_exchange/Lab/snakes/datasets/...,herpmapper_Storeria dekayi_0-1.png
1,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-13,/home/camille/vm_exchange/Lab/snakes/datasets/...,herpmapper_Storeria dekayi_0-13.png
2,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,1,https://www.herpmapper.org/content/voucher/0/...,0-14,/home/camille/vm_exchange/Lab/snakes/datasets/...,herpmapper_Storeria dekayi_0-14.png


In [29]:
# Image ids are used in the saved file names, so stop here if any id occurs twice.
image_ids = df['id'].tolist()
has_duplicate_ids = len(set(image_ids)) < len(image_ids)
if has_duplicate_ids:
    print('ERROR: not unique ids of herpmapper images')
    sys.exit()

# Download Herpmapper images

#Run only to erase all downloaded data (the .csv files, e.g. herpmapper.csv given by Andrew, are kept).
#DESTRUCTIVE: recursively deletes every sub-folder of path_data.
for species in glob.glob(os.path.join(path_data,'*')):
    # shutil.rmtree only works on directories: restricting the deletion to directories keeps
    # a stray regular file (anything that is not a .csv) from raising and aborting the cleanup midway.
    if os.path.isdir(species) and not species.endswith('.csv'):
        shutil.rmtree(species)

In [30]:
#create appropriate folders if not existing
for species in li_species_herpmapper:
    folder_path_s = os.path.join(path_data,species)
    if not os.path.exists(folder_path_s):
        os.makedirs(folder_path_s)      

In [31]:
def download_herpmapper_image(i):
    """Download the image described by row ``i`` of the global ``df``.

    The target file is ``path_data/<species_name>/herpmapper_<species_name>_<id>.png``.
    The URL, target path and species name are handed to ``get_image`` (imported from
    UTILS.utils), which is presumably what fetches and writes the file. TODO confirm.

    Parameters
    ----------
    i : int
        Positional row index into ``df``.
    """
    row = df.iloc[i]
    species = row['species_name']
    file_name = 'herpmapper_' + species + '_' + row['id'] + ".png"
    get_image(url=row['url'], path=os.path.join(path_data, species, file_name), name=species)

In [32]:
# Download every image in parallel (one task per row of df, 80 worker processes)
# and measure how long the whole run takes.
n_images = df.shape[0]
start = time.time()
with Pool(80) as p:
    p.map(download_herpmapper_image, range(n_images))
end = time.time()
elapsed_minutes = (end-start)/60
print ("Total running time: ", elapsed_minutes) #in minutes

Not able to SAVE image for species Crotalus atrox and url  https://www.herpmapper.org/content/voucher/6/60465.mp3,                     due to: cannot identify image file <_io.BytesIO object at 0x7f2cc57f8f68> ,  lets STOP
200


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



# Verify error in download

def show_voucher_image(taxon, image_id):
    """Print the URL stored in ``df`` for (taxon, image_id) and try to display it as an image.

    Used to reproduce download failures by hand: any error raised while fetching the URL
    or decoding it as an image is deliberately left to propagate, since it is the
    diagnostic we are looking for.

    Parameters
    ----------
    taxon : str
        Value of the 'Taxon' column of the row to check.
    image_id : str
        Value of the 'id' column of the row to check.

    Raises
    ------
    IndexError
        If no row of ``df`` matches (taxon, image_id).
    """
    url = df[(df['Taxon']==taxon)&(df['id']==image_id)]['url'].values[0]
    print(url)
    # The context manager closes the HTTP response once the image has been drawn
    # (the original snippets never closed it).
    with urlopen(url) as response:
        img = Image.open(response)
        plt.imshow(img)

#indeed url can not be found
show_voucher_image('Opheodrys vernalis', '6-63743');

#indeed: its a video
show_voucher_image('Crotalus atrox', '6-60465');

#indeed url can not be found
show_voucher_image('Pantherophis vulpinus', '26-260082');

# Check amount of Herpmapper images we have

In [33]:
# List every .png file found exactly one level below path_data (one sub-folder per species).
li_images_hm = []
for species_dir in glob.glob(os.path.join(path_data,'*')):
    for candidate in glob.glob(os.path.join(species_dir,'*')):
        if candidate.endswith('.png'):
            li_images_hm.append(candidate)
print('We have %d images collected from Herpmapper'%len(li_images_hm))

We have 36320 images collected from Herpmapper


# save metadata info (only collected images)

In [34]:
# Keep only the metadata rows whose image file was actually found on disk, then save them.
x1 = df.shape[0]
df = df[df['img_path'].isin(li_images_hm)]
# Report the loss from the rows really kept (and written below). The number of files on
# disk, used previously, can differ from it, e.g. when a file on disk has no row in df.
n_lost = x1 - df.shape[0]
# Guard against an empty frame so the report cannot fail with a division by zero.
pct_lost = n_lost/x1*100 if x1 else 0.0
print('We lost %.2f percent of images (%d images)'%(pct_lost, n_lost))
df.to_csv(os.path.join(path_data,'herpmapper_image_info.csv'), index=False, sep=';')

We lost 31.97 percent of images (17069 images)
