In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
from PIL import Image
import pickle
import cv2
import urllib.request

#plot (for image verification)
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Root directory of the project
ROOT_DIR = os.path.abspath("../")
path_data = os.path.join(ROOT_DIR,'datasets')

In [3]:
# the idea is to produce a dataframe with the following images information: 
#'saved_img_id','id','datetaken','latitude','longitude','photographer','license','url','img_path','species',
#'species_word', 'species_word_source', 'datasource'
li_info = ['saved_img_id','id','datetaken','latitude','longitude','ind_unit','license','url','img_path',
           'species','species_word', 'datasource']

# Download data

## flickr

In [27]:
#save metadata info (might not be of same size of number of collected images)
df_flickr = pd.read_csv(os.path.join(path_data,'flickr','flickr_image_info.csv'),sep=';',header=0,index_col=False)
print(df_flickr.shape)
df_flickr.head(3)

(46686, 33)


Unnamed: 0,id,species_word,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,...,secret,server,species,tags,title,url_c,width_c,woeid,saved_img_id,img_path
0,6796830748,['Cacophis squamulosus'],0.0,0.0,2012-02-29 19:14:05,0.0,0.0,"{'_content': 'Lamington National Park, Qld'}",8.0,,...,3b0faf2da2,7204.0,Cacophis squamulosus,,Cacophis squamulosus (Golden-crowned Snake),https://farm8.staticflickr.com/7204/6796830748...,800.0,,flickr_Cacophis squamulosus_6796830748.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,6799927566,['Masticophis bilineatus'],16.0,0.0,2011-06-11 21:52:24,0.0,0.0,"{'_content': 'Sonoran Whipsnake, Masticophis b...",8.0,0.0,...,6928029cc3,7207.0,Masticophis bilineatus,,IMG_3510,https://farm8.staticflickr.com/7207/6799927566...,800.0,28747586.0,flickr_Masticophis bilineatus_6799927566.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,6800854734,['Natrix maura'],0.0,0.0,2012-02-28 16:26:30,0.0,0.0,{'_content': ''},8.0,,...,11343517a8,7177.0,Natrix maura,,Natrix maura,https://farm8.staticflickr.com/7177/6800854734...,800.0,,flickr_Natrix maura_6800854734.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


### remove potentially bad images

In [28]:
#TODO: with tags: place, landscape eggs etc
#df_flickr['has_egg'] = df_flickr['description'].map(lambda x: 'eggs' in x)

In [29]:
#wrong! this is a snake!
#Image.open( urllib.request.urlopen(url))

### preprocessing

In [30]:
#add independant unit variable (not using the date as its better not to trust it... qulity seemed quite bad)
df_flickr['ind_unit'] = df_flickr.apply(lambda x: 'flickr_'+x['owner']+'_'+x['species'], axis=1)
df_flickr['datasource'] = 'flickr'
df_flickr = df_flickr.rename(columns={'url_c':'url'})
df_flickr = df_flickr.filter(li_info)

In [31]:
print(df_flickr.shape)
df_flickr.head(3)

(46686, 12)


Unnamed: 0,saved_img_id,id,datetaken,latitude,longitude,ind_unit,license,url,img_path,species,species_word,datasource
0,flickr_Cacophis squamulosus_6796830748.png,6796830748,2012-02-29 19:14:05,0.0,0.0,flickr_37993248@N06_Cacophis squamulosus,0.0,https://farm8.staticflickr.com/7204/6796830748...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Cacophis squamulosus,['Cacophis squamulosus'],flickr
1,flickr_Masticophis bilineatus_6799927566.png,6799927566,2011-06-11 21:52:24,34.185333,-112.134334,flickr_33102730@N02_Masticophis bilineatus,0.0,https://farm8.staticflickr.com/7207/6799927566...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Masticophis bilineatus,['Masticophis bilineatus'],flickr
2,flickr_Natrix maura_6800854734.png,6800854734,2012-02-28 16:26:30,0.0,0.0,flickr_74385542@N04_Natrix maura,0.0,https://farm8.staticflickr.com/7177/6800854734...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Natrix maura,['Natrix maura'],flickr


## herpmapper

In [32]:
#save metadata info (might not be of same size of number of collected images)
df_hm = pd.read_csv(os.path.join(path_data,'herpmapper','herpmapper_image_info.csv'),sep=';')
print(df_hm.shape)
#missing: latitude, longitude, license
df_hm.head(3)

(59475, 14)


Unnamed: 0,Date,Level 1,Level 2,Time,Country,species_word,ID,species,level_8,url,id,saved_img_id,img_path,datetaken
0,2006-11-08,Iowa,Johnson,13:42:00,United States of America,Storeria dekayi,HM 1,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-1,herpmapper_Storeria dekayi_0-1.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-08 13:42:00
1,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,0,https://www.herpmapper.org/content/voucher/0/1...,0-13,herpmapper_Storeria dekayi_0-13.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-04 12:55:00
2,2006-11-04,Iowa,Linn,12:55:00,United States of America,Storeria dekayi,HM 5,Storeria dekayi,1,https://www.herpmapper.org/content/voucher/0/...,0-14,herpmapper_Storeria dekayi_0-14.png,/home/camille/vm_exchange/Lab/snakes/datasets/...,2006-11-04 12:55:00


#### preprocessing

In [33]:
df_hm['ind_unit'] = df_hm.apply(lambda x: 'herpmapper_'+x['ID']+'_'+x['species'], axis=1)
df_hm['datasource'] = 'herpmapper'

In [34]:
#latitude, longitude not in herpmapper
df_hm = df_hm.filter(li_info)

In [35]:
df_hm.head(3)

Unnamed: 0,saved_img_id,id,datetaken,ind_unit,url,img_path,species,species_word,datasource
0,herpmapper_Storeria dekayi_0-1.png,0-1,2006-11-08 13:42:00,herpmapper_HM 1_Storeria dekayi,https://www.herpmapper.org/content/voucher/0/1...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Storeria dekayi,Storeria dekayi,herpmapper
1,herpmapper_Storeria dekayi_0-13.png,0-13,2006-11-04 12:55:00,herpmapper_HM 5_Storeria dekayi,https://www.herpmapper.org/content/voucher/0/1...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Storeria dekayi,Storeria dekayi,herpmapper
2,herpmapper_Storeria dekayi_0-14.png,0-14,2006-11-04 12:55:00,herpmapper_HM 5_Storeria dekayi,https://www.herpmapper.org/content/voucher/0/...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Storeria dekayi,Storeria dekayi,herpmapper


## inaturalist

In [16]:
#save metadata info (might not be of same size of number of collected images)
df_in = pd.read_csv(os.path.join(path_data,'inaturalist','inaturalist_image_info.csv'), sep=';')
print(df_in.shape)
df_in.head(3)

(113892, 39)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,out_of_range,photographer,user_login,created_at,updated_at,...,positioning_device,species_guess,species_word,common_name,iconic_taxon_name,taxon_id,datetaken,species,saved_img_id,img_path
0,369,5/26/07,2007-05-26 00:00:00,,Eastern Time (US & Canada),,72,stellaoleary,2008-06-05 14:04:53 -0500,2014-10-19 22:06:17 -0500,...,,black rat snake,Pantherophis alleghaniensis,Eastern Rat Snake,Reptilia,59644,2007-05-26 00:00:00,Pantherophis alleghaniensis,inaturalist_Pantherophis alleghaniensis_369.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,1227,"May 20, 2006 15:49",2006-05-20 00:00:00,2006-05-20 22:49:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2016-03-01 11:27:39 -0600,...,,Western Terrestrial Garter Snake (Thamnophis e...,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-20 22:49:00,Thamnophis elegans,inaturalist_Thamnophis elegans_1227.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,1228,"May 13, 2006 19:36",2006-05-13 00:00:00,2006-05-14 02:36:00,Pacific Time (US & Canada),False,1,kueda,2009-01-13 02:29:54 -0600,2017-05-02 13:03:37 -0500,...,,Western Terrestrial Garter Snake,Thamnophis elegans,Western Terrestrial Garter Snake,Reptilia,28398,2006-05-14 02:36:00,Thamnophis elegans,inaturalist_Thamnophis elegans_1228.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


#### preprocessing

In [25]:
df_in['ind_unit'] = df_in.apply(lambda x: 'inaturalist_'+str(x['photographer'])+'_'+x['species'], axis=1)
df_in['datasource'] = 'inaturalist'
df_in = df_in.filter(li_info)

In [26]:
print(df_in.shape)
df_in.head(3)

(113892, 12)


Unnamed: 0,saved_img_id,id,datetaken,latitude,longitude,ind_unit,license,url,img_path,species,species_word,datasource
0,inaturalist_Pantherophis alleghaniensis_369.png,369,2007-05-26 00:00:00,41.316601,-72.558899,inaturalist_72_Pantherophis alleghaniensis,,https://static.inaturalist.org/photos/401/medi...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Pantherophis alleghaniensis,Pantherophis alleghaniensis,inaturalist
1,inaturalist_Thamnophis elegans_1227.png,1227,2006-05-20 22:49:00,37.614403,-122.488457,inaturalist_1_Thamnophis elegans,CC0,https://static.inaturalist.org/photos/1349/med...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Thamnophis elegans,Thamnophis elegans,inaturalist
2,inaturalist_Thamnophis elegans_1228.png,1228,2006-05-14 02:36:00,37.664253,-122.055359,inaturalist_1_Thamnophis elegans,CC0,https://static.inaturalist.org/photos/1350/med...,/home/camille/vm_exchange/Lab/snakes/datasets/...,Thamnophis elegans,Thamnophis elegans,inaturalist


### SNAPP - Andrew images

# Aggregate

# Add common info

In [None]:
#TODO: species_word_source

#keeping meta data of only the images we truely have 
#look at the actual image we really have
li_flickr_images = []
for species in glob.glob(os.path.join(path_data,'*')):
    li_flickr_images.extend([x for x in glob.glob(os.path.join(species,'*')) if x.endswith('.png')])
len(li_flickr_images)
print('We have %d images collected from Flickr'%len(li_flickr_images))