In [1]:
#basic package
import json
import tqdm
import os
import numpy as np
import glob
import pandas as pd
import sys
import shutil
import time
from PIL import Image
import pickle
import collections
from collections import Counter

#url open to get image
import urllib.request
from urllib.request import urlopen

#get data in parallel
from multiprocessing import Pool

#plot (for image verification)
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Root directory of the project
ROOT_DIR = os.path.abspath("../")
path_data = os.path.join(ROOT_DIR,'datasets/inaturalist')

In [3]:
PACKAGE_PARENT = '../../..'
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from UTILS.utils import get_image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
#Note this notebook can not be run in the same time as get_flickr_data on my computer, but it could on the cluster.
#Hence, depending on your computer you might or not be able to run both in the same time, but if you try verify its 
#blocked

# Download data

### list of species and its synonymes

In [5]:
#dowload dictionary with keys=species, value=list of its synonyms
dico_syn = pickle.load(open(os.path.join(ROOT_DIR,'datasets','synonyms','dico_species_lisyn.pkl'), 'rb'))

#### create list of species and their synonymes

In [6]:
li_species_word = []
for k,v in dico_syn.items():
    li_species_word.append(k)
    li_species_word.extend(v)
print('There is %d species-word in total to query for'%len(li_species_word))

There is 18289 species-word in total to query for


In [7]:
#dico ith keys a species word and value its associated species name
dico_speciesword_species = {}
for k,v in dico_syn.items():
    for i in v+[k]:
            dico_speciesword_species[i] = k

### inaturalist info

In [8]:
#to get url 
df_all= pd.read_csv(os.path.join(path_data, 'observations-39577.csv'), sep=',', parse_dates=['observed_on',
                                                                                          'time_observed_at'])
print(df_all.shape)
df_all.head(3)

(145481, 35)


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,out_of_range,user_id,user_login,created_at,updated_at,...,positional_accuracy,geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
0,122,"September 10, 2007 12:00",2007-09-10,2007-09-10 19:00:00,Pacific Time (US & Canada),,15,gdurkee,2008-04-09 11:57:16 -0500,2017-07-31 03:48:31 -0500,...,,,False,,,Mountain Garter Snake,Thamnophis elegans elegans,Mountain Garter Snake,Reptilia,28400
1,369,5/26/07,2007-05-26,NaT,Eastern Time (US & Canada),,72,stellaoleary,2008-06-05 14:04:53 -0500,2014-10-19 22:06:17 -0500,...,,,False,,,black rat snake,Pantherophis alleghaniensis,Eastern Rat Snake,Reptilia,59644
2,502,07/12/08,2008-07-12,NaT,Eastern Time (US & Canada),,19,justinscioli,2008-07-12 12:18:20 -0500,2015-11-07 00:16:04 -0600,...,,,False,,,Northern Water Snake,Nerodia sipedon sipedon,Northern Water Snake,Reptilia,29306


#### small preprocessing

In [9]:
df_all.rename(columns={'user_id':'photographer', 'url':'observation_url','scientific_name':'species_word'},inplace=True)
df_all.rename(columns={'image_url':'url'},inplace=True)

In [10]:
#make one good date
df_all['datetaken'] = np.where(df_all['time_observed_at'].isnull(), df_all['observed_on'],df_all['time_observed_at'])
#small verification
#df[['observed_on','time_observed_at','observed_on_string','datetaken']]

In [11]:
#potential new synonyms
dico_more_syn = dict(zip(df_all['species_word'], df_all['common_name']))
#verification: dico_more_syn['Malpolon monspessulanus']
#ask andrew if this might help (i..e secure thing from inaturalist of given by crowd)
pickle.dump(dico_more_syn, open(os.path.join(path_data,'dico_more_syn_common_name_inaturalist.pkl'), 'wb'))

In [12]:
#removing if not in our species-word list
df = df_all[df_all['species_word'].isin(li_species_word)]
print(df.shape)
#removing if no url
df = df[~df['url'].isnull()]
print(df.shape)
#add associated species name of the species word 
df['species'] = df['species_word'].map(lambda x: dico_speciesword_species[x])
df.head(3)

(113969, 36)
(113967, 36)


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,out_of_range,photographer,user_login,created_at,updated_at,...,coordinates_obscured,positioning_method,positioning_device,species_guess,species_word,common_name,iconic_taxon_name,taxon_id,datetaken,species
1,369,5/26/07,2007-05-26,NaT,Eastern Time (US & Canada),,72,stellaoleary,2008-06-05 14:04:53 -0500,2014-10-19 22:06:17 -0500,...,False,,,black rat snake,Pantherophis alleghaniensis,Eastern Rat Snake,Reptilia,59644,2007-05-26 00:00:00,Pantherophis alleghaniensis
5,1227,"May 20, 2006 15:49",2006-05-20,2006-05-20 22:49:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2016-03-01 11:27:39 -0600,...,False,,,Western Terrestrial Garter Snake (Thamnophis e...,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-20 22:49:00,Thamnophis elegans
6,1228,"May 13, 2006 19:36",2006-05-13,2006-05-14 02:36:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2017-05-02 13:03:37 -0500,...,False,,,Western Terrestrial Garter Snake,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-14 02:36:00,Thamnophis elegans


In [13]:
df['saved_img_id'] = df.apply(lambda x: 'inaturalist_'+x['species']+'_'+str(x['id'])+'.png', axis=1)
df['img_path'] = df.apply(lambda x: os.path.join(path_data, x['species'], x['saved_img_id']), axis=1)

In [14]:
li_species_flickr = list(set(df['species'].tolist()))
print('There is %d species-name with at least one image in Inaturalist'%len(li_species_flickr))
print('There is %d images to download from Inaturalist'%df.shape[0])
df.head(2)

There is 1591 species-name with at least one image in Inaturalist
There is 113967 images to download from Inaturalist


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,out_of_range,photographer,user_login,created_at,updated_at,...,positioning_device,species_guess,species_word,common_name,iconic_taxon_name,taxon_id,datetaken,species,saved_img_id,img_path
1,369,5/26/07,2007-05-26,NaT,Eastern Time (US & Canada),,72,stellaoleary,2008-06-05 14:04:53 -0500,2014-10-19 22:06:17 -0500,...,,black rat snake,Pantherophis alleghaniensis,Eastern Rat Snake,Reptilia,59644,2007-05-26 00:00:00,Pantherophis alleghaniensis,inaturalist_Pantherophis alleghaniensis_369.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
5,1227,"May 20, 2006 15:49",2006-05-20,2006-05-20 22:49:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2016-03-01 11:27:39 -0600,...,,Western Terrestrial Garter Snake (Thamnophis e...,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-20 22:49:00,Thamnophis elegans,inaturalist_Thamnophis elegans_1227.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [15]:
if len(df['id'].tolist())!=len(set(df['id'].tolist())):
    print('ERROR: not unique ids of Inaturalist images')
    sys.exit()

# Download Inaturalist images

#in case to erase all (not csv data given by Andrew)
for species in glob.glob(os.path.join(path_data,'*')):
    if not species.endswith('.csv'):
        shutil.rmtree(species)

In [16]:
#create appropriate folders if not existing
for species in li_species_flickr:
    folder_path_s = os.path.join(path_data,species)
    if not os.path.exists(folder_path_s):
        os.makedirs(folder_path_s)      

In [17]:
def download_image(i):
    #download image and save it
    species = df.iloc[i]['species']
    f = os.path.join(path_data, species,'inaturalist_'+species+'_'+str(df.iloc[i]['id'])+".png")
    get_image(url=df.iloc[i]['url'], path=f, name=df.iloc[i]['species_word'])

In [18]:
#compute time needed to gather all images
start = time.time()
with Pool(50) as p:
    p.map(download_image, range(df.shape[0]))
end = time.time()
print ("Total running time: ", (end-start)/60) #in minutes

Not able to SAVE image for species Thamnophis hammondii and url http://sphotos-b.xx.fbcdn.net/hphotos-ash4/196550_4330579617765_564575578_n.jpg, lets STOP due to: 
 HTTP Error 400: Bad Request
Not able to SAVE image for species Crotalus armstrongi and url https://fbcdn-sphotos-g-a.akamaihd.net/hphotos-ak-xpa1/t1.0-9/p600x600/10625131_4626806324681_5460785350666786844_n.jpg, lets STOP due to: 
 HTTP Error 400: Bad Request
Not able to SAVE image for species Pituophis deppei and url https://scontent-b.xx.fbcdn.net/hphotos-xpa1/t1.0-9/p600x600/10417681_10203808709836444_675001840006000879_n.jpg, lets STOP due to: 
 HTTP Error 403: Access Denied
Not able to SAVE image for species Lampropeltis californiae and url https://fbcdn-sphotos-d-a.akamaihd.net/hphotos-ak-xfa1/v/t1.0-9/p480x480/164955_10151647182584612_1173872859_n.jpg?oh=a6e9fd5e20f5b321494826940314525c&oe=54FBF18B&__gda__=1425672637_ce899e66de7ff08d173d8af6a2fde102, lets STOP due to: 
 HTTP Error 400: Bad Request
Not able to SAVE im

 HTTP Error 400: Bad Request
Not able to SAVE image for species Crotalus molossus and url https://scontent-b.xx.fbcdn.net/hphotos-xpa1/v/t1.0-9/p480x480/226792_168611039864942_4004217_n.jpg?oh=a0df7bb89c4ce60cfba7bec4fc81235b&oe=550DE1D2, lets STOP due to: 
 HTTP Error 403: Forbidden
Not able to SAVE image for species Contia tenuis and url https://scontent-b.xx.fbcdn.net/hphotos-frc3/t1.0-9/p480x480/392691_238152756252468_1171277407_n.jpg, lets STOP due to: 
 HTTP Error 403: Access Denied
Not able to SAVE image for species Thamnophis elegans and url https://sphotos-a.xx.fbcdn.net/hphotos-snc7/2493_1061593381704_6463937_n.jpg, lets STOP due to: 
 HTTP Error 400: Bad Request
Not able to SAVE image for species Thamnophis proximus and url https://sphotos-a.xx.fbcdn.net/hphotos-snc6/269265_4385827856466_1008832304_n.jpg, lets STOP due to: 
 HTTP Error 400: Bad Request
Not able to SAVE image for species Natrix natrix and url https://fbcdn-photos-a.akamaihd.net/hphotos-ak-ash2/36344_148710634

# Check amount of inaturalist images we have

In [19]:
#choose appropriate genus for search
li_images_in = []
for species in glob.glob(os.path.join(path_data,'*')):
    li_images_in.extend([x for x in glob.glob(os.path.join(species,'*')) if x.endswith('.png')]) 
print('We have %d images collected from inaturalist'%len(li_images_in))
#we gain 5'000 with with languages: 108'712 to 114'584

We have 113892 images collected from inaturalist


# save metadata info (only collected images)

In [20]:
#in case, remove images in several species (should not happen in inaturalist)
li_test = []
for i in li_images_in:
    li_test.append(i.split('/')[-1].split('_')[-1].split('.')[0])
c = Counter(li_test)
c = {k:v for k,v in c.items() if v>1}
id_to_be_removed = list(set(c.keys()))
if len(id_to_be_removed)>0:
    print('ERROR: image sin different species!! probably 1 species-word was associated to several species over time')
    sys.exit()
#delete them after analysing why
#for i in li_images_in:
#    if i.split('/')[-1].split('_')[-1].split('.')[0] in id_to_be_removed:
#        #delete file
#        os.remove(i)

"for i in li_images_in:\n    if i.split('/')[-1].split('_')[-1].split('.')[0] in id_to_be_removed:\n        #delete file\n        os.remove(i)"

In [21]:
x1 = df.shape[0]
df = df[df['img_path'].isin(li_images_in)]
print('We lost %.2f percent of images (%d images) due to propably invalid url'%((x1-len(li_images_in))/x1*100, 
                                                                                x1-len(li_images_in)))
df.to_csv(os.path.join(path_data,'inaturalist_image_info.csv'), index=False, sep=';')

We lost 0.07 percent of images (75 images) due to propably invalid url


In [22]:
print(df.shape)
df.head(3)

(113892, 39)


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,out_of_range,photographer,user_login,created_at,updated_at,...,positioning_device,species_guess,species_word,common_name,iconic_taxon_name,taxon_id,datetaken,species,saved_img_id,img_path
1,369,5/26/07,2007-05-26,NaT,Eastern Time (US & Canada),,72,stellaoleary,2008-06-05 14:04:53 -0500,2014-10-19 22:06:17 -0500,...,,black rat snake,Pantherophis alleghaniensis,Eastern Rat Snake,Reptilia,59644,2007-05-26 00:00:00,Pantherophis alleghaniensis,inaturalist_Pantherophis alleghaniensis_369.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
5,1227,"May 20, 2006 15:49",2006-05-20,2006-05-20 22:49:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2016-03-01 11:27:39 -0600,...,,Western Terrestrial Garter Snake (Thamnophis e...,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-20 22:49:00,Thamnophis elegans,inaturalist_Thamnophis elegans_1227.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
6,1228,"May 13, 2006 19:36",2006-05-13,2006-05-14 02:36:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2017-05-02 13:03:37 -0500,...,,Western Terrestrial Garter Snake,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-14 02:36:00,Thamnophis elegans,inaturalist_Thamnophis elegans_1228.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [23]:
li_test

['10673210',
 '1077169',
 '10861777',
 '1111226',
 '13138004',
 '13176756',
 '13556005',
 '13589641',
 '14420377',
 '14891940',
 '15085550',
 '16695788',
 '16696826',
 '16788246',
 '16983693',
 '17094215',
 '17101063',
 '17465323',
 '17467494',
 '2757765',
 '3514069',
 '3781909',
 '3952185',
 '4971689',
 '6194545',
 '7223641',
 '9539421',
 '12962117',
 '15631414',
 '52915',
 '71076',
 '16514067',
 '16559647',
 '2393053',
 '2884364',
 '29586',
 '3534447',
 '3534472',
 '4278906',
 '6913961',
 '7718647',
 '7718648',
 '16237111',
 '16384752',
 '16384842',
 '16385107',
 '16385129',
 '16526300',
 '16622260',
 '16622286',
 '16622295',
 '16622344',
 '16622395',
 '17066180',
 '17066217',
 '17066337',
 '17100296',
 '17100618',
 '17100831',
 '17100861',
 '17100983',
 '17101050',
 '17315801',
 '17318432',
 '17318469',
 '17680008',
 '17680140',
 '2661137',
 '6667869',
 '5212890',
 '5418489',
 '5583241',
 '57029',
 '5882865',
 '5936194',
 '5936428',
 '6038219',
 '6222371',
 '6370418',
 '6439491',
 '