In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
from PIL import Image
from collections import Counter
import pickle
from datetime import datetime

#url open to get image
import urllib.request
from urllib.request import urlopen

#get data in parallel
from multiprocessing import Pool

#plot (for image verification)
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Project root: the parent of the directory the notebook is run from.
ROOT_DIR = os.path.abspath(os.pardir)
# Folder that holds herpmapper.csv and receives the downloaded images.
path_data = os.path.join(ROOT_DIR, 'datasets/herpmapper')

In [3]:
# Make the project's UTILS package importable from this notebook.
# PACKAGE_PARENT is the relative path (three levels up) that gets added to sys.path.
PACKAGE_PARENT = '../../..'
# NOTE: '__file__' is a quoted string here, not the __file__ variable, so this joins
# the current working directory with a file literally named '__file__', resolves it and
# takes its dirname -- i.e. SCRIPT_DIR is the resolved current working directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
# The sys.path entry must be appended before the import below can succeed.
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

# presumably UTILS lives in the directory added above -- TODO confirm
from UTILS.utils import get_image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
#Note: on my computer this notebook cannot be run at the same time as get_flickr_data, although it can on the cluster.
#Hence, depending on your computer, you may or may not be able to run both at the same time; if you try, check whether
#one of them gets blocked

# Download data

### List of species and their synonyms

In [5]:
#load dictionary with keys=species, value=list of its synonyms
#NOTE(review): pickle.load can execute arbitrary code -- only load this file if it comes from a trusted source
synonyms_pkl_path = os.path.join(ROOT_DIR,'datasets','synonyms','dico_species_lisyn.pkl')
#the context manager guarantees the file handle is closed once the dictionary is loaded
with open(synonyms_pkl_path, 'rb') as synonyms_file:
    dico_syn = pickle.load(synonyms_file)

#### Create list of species and their synonyms

In [6]:
# Flatten the synonym dictionary into a single list of query terms:
# each species name is immediately followed by its own synonyms.
li_species_word = [
    word
    for species_name, synonyms in dico_syn.items()
    for word in [species_name, *synonyms]
]
print('There is %d species-word in total to query for'%len(li_species_word))

There is 18289 species-word in total to query for


In [7]:
#dico with keys = species word and value its associated species name
dico_speciesword_species = {}
for k,v in dico_syn.items():
    for i in v+[k]:
            dico_speciesword_species[i] = k

### herpmapper info

In [8]:
# Load the HerpMapper export; its 'Vouchers' column holds the url(s) to request from HerpMapper.
herpmapper_csv = os.path.join(path_data, 'herpmapper.csv')
df_all = pd.read_csv(herpmapper_csv)
print(df_all.shape)
df_all

(236944, 8)


Unnamed: 0,ID,Date,Time,Taxon,Country,Level 1,Level 2,Vouchers
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...
1,HM 2,2006-11-07,17:26:00,Lithobates pipiens,United States of America,Iowa,Benton,https://www.herpmapper.org/content/voucher/0/2...
2,HM 3,2006-11-05,11:55:00,Coluber constrictor flaviventris,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/4...
3,HM 4,2006-11-05,12:58:00,Acris blanchardi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...
4,HM 5,2006-11-04,12:55:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...
5,HM 6,2006-10-30,09:05:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...
6,HM 7,2006-11-08,13:50:00,Thamnophis sirtalis sirtalis,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...
7,HM 8,2006-10-29,12:35:00,Pantherophis vulpinus,United States of America,Illinois,Whiteside,https://www.herpmapper.org/content/voucher/0/1...
8,HM 9,2006-10-29,13:17:00,Terrapene ornata ornata,United States of America,Illinois,Carroll,https://www.herpmapper.org/content/voucher/0/2...
9,HM 10,2006-10-29,11:00:00,Thamnophis radix,United States of America,Illinois,Henry,https://www.herpmapper.org/content/voucher/0/2...


#### preprocessing

In [9]:
#removing if not in our species-word list
#.copy() makes df an independent frame, so the column changes below neither touch
#df_all nor trigger pandas' SettingWithCopyWarning
df = df_all[df_all['Taxon'].isin(li_species_word)].copy()
print(df.shape)
#removing if no url
df = df[df['Vouchers'].notnull()].copy()
print(df.shape)
#add associated species name of the species word
df = df.rename(columns={'Taxon':'species_word'})
df['species'] = df['species_word'].map(dico_speciesword_species)
#creating a list of urls; each url is stripped so that a 'url1, url2' style
#separator does not leave leading/trailing whitespace in the stored urls
df['url_list'] = df['Vouchers'].map(lambda vouchers: [url.strip() for url in vouchers.split(',')])
df.head(3)

(45643, 8)
(45640, 8)


Unnamed: 0,ID,Date,Time,species_word,Country,Level 1,Level 2,Vouchers,species,url_list
0,HM 1,2006-11-08,13:42:00,Storeria dekayi,United States of America,Iowa,Johnson,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...
4,HM 5,2006-11-04,12:55:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...
5,HM 6,2006-10-30,09:05:00,Storeria dekayi,United States of America,Iowa,Linn,https://www.herpmapper.org/content/voucher/0/1...,Storeria dekayi,[https://www.herpmapper.org/content/voucher/0/...


In [10]:
# Distinct species that still have at least one record after the preprocessing above.
species_with_images = set(df['species'].tolist())
li_species_herpmapper = list(species_with_images)
print('There is %d species with at least one image in HerpMapper'%len(li_species_herpmapper))

There is 998 species with at least one image in HerpMapper


In [11]:
# Reshape to one row per image: spread every url_list over columns, then stack the
# columns back into rows (the extra index level keeps the url's position in its list).
observation_cols = ['Date', 'Level 1', 'Level 2', 'Time', 'Country', 'species_word', 'ID', 'species']
urls_wide = df.set_index(observation_cols)['url_list'].apply(pd.Series)
df = urls_wide.stack().reset_index().rename(columns={0: 'url'})
# Image id: the last two '/'-separated components in front of the file extension, joined by '-'.
# It is built only from HerpMapper info that is not supposed to change.
#TODO: be verified by Andrew if this id is indeed unique (herpmapper should know)
df['id'] = df['url'].map(lambda url: '-'.join(url.split('.')[-2].split('/')[-2:]))
# File name and full target path of every image on disk.
df['saved_img_id'] = ['herpmapper_'+species_name+'_'+img_id+".png"
                      for species_name, img_id in zip(df['species'], df['id'])]
df['img_path'] = [os.path.join(path_data, species_name, file_name)
                  for species_name, file_name in zip(df['species'], df['saved_img_id'])]
print('There is %d images to download from Herpmapper'%df.shape[0])
#we gain 558 images using all synonymes ending up to 53389 images. then we ended up to 59478 using all language
df.head(3)

There is 59478 images to download from Herpmapper


Unnamed: 0,Date,Level 1,Level 2,Time,Country,species_word,ID,species,level_8,url,id,saved_img_id,img_path
0,2006-11-08,Iowa,Johnson,13:42:00,United States of America,Storeria dekayi,HM 1,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-1,herpmapper_Storeria dekayi_0-1.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-13,herpmapper_Storeria dekayi_0-13.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,1,https://www.herpmapper.org/content/voucher/0/...,0-14,herpmapper_Storeria dekayi_0-14.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [12]:
def _clean_date(date_str):
    """Return date_str unchanged, or the placeholder '1000-01-01' when its day, month or year is zeroed.

    date_str is expected to look like 'YYYY-MM-DD'; a day or month of '00' or a
    year of '0000' marks a wrong date.
    """
    parts = date_str.split('-')
    if parts[-1] == '00' or parts[1] == '00' or parts[0] == '0000':
        return '1000-01-01'
    return date_str


def _to_datetaken(date_str, time_str):
    """Combine a 'YYYY-MM-DD' date and an 'HH:MM:SS' time into one datetime.

    The date is parsed a single time; the time fields are converted with int().
    """
    day = datetime.strptime(date_str, "%Y-%m-%d")
    hour, minute, second = (int(part) for part in time_str.split(':')[:3])
    return datetime(day.year, day.month, day.day, hour, minute, second)


#make a good date puting to 1000-01-01 all wrong dates (single pass over the column)
df['Date'] = df['Date'].map(_clean_date)
#observations without a time are set to midnight
df['Time'] = df['Time'].fillna('00:00:00')
df['datetaken'] = df.apply(lambda row: _to_datetaken(row['Date'], row['Time']), axis=1)
#small verification
#df[['observed_on','time_observed_at','observed_on_string','datetaken']]

In [14]:
# Stop here if two images share the same id: the id is part of each saved file name.
image_ids = df['id'].tolist()
has_duplicate_ids = len(set(image_ids)) != len(image_ids)
if has_duplicate_ids:
    print('ERROR: not unique ids of herpmapper images')
    sys.exit()

# Download Herpmapper images

#in case to erase all (not herpmapper.csv data given by Andrew)
#WARNING: destructive -- this deletes every folder found directly under path_data
for entry in glob.glob(os.path.join(path_data,'*')):
    if entry.endswith('.csv'):
        continue
    #shutil.rmtree only works on real directories: it raises on regular files and on
    #symbolic links, so any such stray entry is left in place instead of aborting the loop
    if os.path.isdir(entry) and not os.path.islink(entry):
        shutil.rmtree(entry)

In [15]:
#create appropriate folders if not existing
for species in li_species_herpmapper:
    folder_path_s = os.path.join(path_data,species)
    if not os.path.exists(folder_path_s):
        os.makedirs(folder_path_s)      

In [16]:
def download_image(i):
    """Download the image described by row ``i`` of ``df`` and save it to disk.

    Parameters
    ----------
    i : int
        Positional index (``df.iloc``) of the image row to download.

    The row is read once. The target path is taken from the row's 'img_path'
    column, so the file is written exactly where the later metadata filter
    looks for it. The actual download and saving is delegated to ``get_image``.
    """
    row = df.iloc[i]
    get_image(url=row['url'], path=row['img_path'], name=row['species_word'])

In [17]:
#compute time needed to gather all images
start = time.time()
with Pool(3) as p:
    p.map(download_image, range(df.shape[0]))
end = time.time()
print ("Total running time: ", (end-start)/60) #in minutes

Not able to SAVE image for species Crotalus atrox and url  https://www.herpmapper.org/content/voucher/6/60465.mp3, lets STOP due to: 
 cannot identify image file <_io.BytesIO object at 0x7fbc973fd1a8>
200
Not able to SAVE image for species Opheodrys vernalis and url https://www.herpmapper.org/content/voucher/6/63743.jpg, lets STOP due to: 
 HTTP Error 404: Not Found
Not able to SAVE image for species Pantherophis vulpinus and url https://www.herpmapper.org/content/voucher/26/260082.jpg, lets STOP due to: 
 HTTP Error 404: Not Found
Total running time:  2.643633194764455


# Verify error in download

def show_voucher(species, image_id):
    """Print the url of one voucher and try to display it as an image.

    Parameters
    ----------
    species : str
        Value of the 'species' column of the voucher.
    image_id : str
        Value of the 'id' column of the voucher.

    A failure to fetch or decode the url is printed instead of raised, so the
    notebook keeps running while still showing why the download failed.
    """
    url = df[(df['species']==species)&(df['id']==image_id)]['url'].values[0]
    print(url)
    try:
        #the context manager closes the http response once the image is fully loaded
        with urlopen(url) as response:
            img = Image.open(response)
            img.load()
    except OSError as err:
        #HTTP/URL errors and PIL's "cannot identify image file" are all OSError subclasses
        print('Not able to display this url: %s' % err)
        return
    plt.imshow(img);

#indeed url can not be found
show_voucher('Opheodrys vernalis', '6-63743')

#indeed: it is not an image (the url points to an .mp3 file)
show_voucher('Crotalus atrox', '6-60465')

#indeed url can not be found
show_voucher('Pantherophis vulpinus', '26-260082')

# Check amount of Herpmapper images we have

In [18]:
# Collect the path of every .png file stored one level below path_data
# (i.e. inside each entry found directly under path_data).
li_images_hm = [
    file_path
    for species_dir in glob.glob(os.path.join(path_data,'*'))
    for file_path in glob.glob(os.path.join(species_dir,'*'))
    if file_path.endswith('.png')
]
print('We have %d images collected from Herpmapper'%len(li_images_hm))

We have 59475 images collected from Herpmapper


# save metadata info (only collected images)

In [19]:
#safety check: the same image id must not be saved under several species folders
#(the original note says this should not happen in inaturalist -- NOTE(review): probably meant herpmapper, confirm)
#the id is the part of the file name after the last '_' and before the extension;
#os.path.basename is used instead of splitting on '/' so it works with any path separator
li_test = [os.path.basename(img_path).split('_')[-1].split('.')[0] for img_path in li_images_hm]
#keep only the ids that occur more than once
c = {img_id: count for img_id, count in Counter(li_test).items() if count>1}
id_to_be_removed = list(c)
if len(id_to_be_removed)>0:
    print('ERROR: image sin different species!! probably 1 species-word was associated to several species over time')
    sys.exit()
#delete them after analysing why
#for i in li_images_hm:
#    if os.path.basename(i).split('_')[-1].split('.')[0] in id_to_be_removed:
#        #delete file
#        os.remove(i)

In [20]:
#number of images we tried to download
x1 = df.shape[0]
#keep only the rows whose image file is really on disk (a set makes the membership test cheap)
df = df[df['img_path'].isin(set(li_images_hm))]
#the loss is measured on the filtered frame itself: li_images_hm may also contain files
#that are not listed in df (e.g. left over from an earlier run), which would skew the count
n_lost = x1 - df.shape[0]
pct_lost = n_lost / x1 * 100 if x1 else 0.0
print('We lost %.2f percent of images (%d images)'%(pct_lost, n_lost))
#metadata of the collected images only, ';'-separated and without the index column
df.to_csv(os.path.join(path_data,'herpmapper_image_info.csv'), index=False, sep=';')
print(df.shape)
df.head(3)

We lost 0.01 percent of images (3 images)
(59475, 14)


Unnamed: 0,Date,Level 1,Level 2,Time,Country,species_word,ID,species,level_8,url,id,saved_img_id,img_path,datetaken
0,2006-11-08,Iowa,Johnson,13:42:00,United States of America,Storeria dekayi,HM 1,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-1,herpmapper_Storeria dekayi_0-1.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-08 13:42:00
1,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-13,herpmapper_Storeria dekayi_0-13.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-04 12:55:00
2,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,1,https://www.herpmapper.org/content/voucher/0/...,0-14,herpmapper_Storeria dekayi_0-14.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-04 12:55:00
