In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
import pickle
from collections import Counter


#to set connection with Flickr API
from flickrapi import FlickrAPI

#image
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen

#date
import datetime as dt
from datetime import datetime

#plot
import cv2
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
#Make the directory three levels above the resolved working directory importable, so that the project-local
#UTILS package imported below can be found.
#relative path climbed from SCRIPT_DIR
PACKAGE_PARENT = '../../..'
#NOTE(review): '__file__' is passed as a string literal (not the __file__ variable), so the path is built from
#os.getcwd() joined with the literal name '__file__' and then reduced to its directory; presumably a workaround
#for notebooks not defining __file__ - confirm
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
#appended (not prepended): modules already importable under the same name keep priority
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

#project-local helper, called in the download loop as get_image(url=..., path=..., name=...)
from UTILS.utils import get_image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Project root: the parent of the directory the notebook is run from
ROOT_DIR = os.path.abspath("../")
# Folder receiving everything collected from Flickr (one sub-folder per species plus the metadata files)
path_data = os.path.join(
    ROOT_DIR,
    'datasets/flickr',
)

# API parameters

In [4]:
#to get these data follow the direction of: http://joequery.me/code/flickr-api-image-search-python/
#SECURITY: credentials should not be stored in the notebook. They are read from the environment variables
#FLICKR_PUBLIC / FLICKR_SECRET when these are set; the literals below are only kept as a backward-compatible
#fallback and should be rotated and then removed from the file.
FLICKR_PUBLIC = os.environ.get('FLICKR_PUBLIC', 'dd0cb0ced4e83452f8d49cb3d534707d')
FLICKR_SECRET = os.environ.get('FLICKR_SECRET', '4c568d2002b5506e')

In [5]:
#extra fields requested for every photo returned by a search (sent to the API as one comma-delimited string)
extras = ','.join(['description', 'geo', 'tags', 'url_c', 'owner_name', 'date_taken', 'license'])
#API client; with format='parsed-json' the responses are indexed like dictionaries further down
flickr = FlickrAPI(
    FLICKR_PUBLIC,
    FLICKR_SECRET,
    format='parsed-json',
)

In [6]:
#create appropriate folder if not existing
#exist_ok=True replaces the separate existence test: no error if the folder is already there, and no failure if
#it gets created between a check and the creation
os.makedirs(path_data, exist_ok=True)

# Download species and their synonyms

In [28]:
#load the dictionary with keys=species, value=list of its synonyms
#NOTE: unpickling can execute arbitrary code, only load this file if it comes from a trusted source
#the context manager makes sure the file handle is closed once the dictionary is loaded
with open(os.path.join(ROOT_DIR,'datasets','synonyms','dico_species_lisyn.pkl'), 'rb') as fh:
    dico_syn = pickle.load(fh)

In [31]:
#in case to erase all
#shutil.rmtree(os.path.join(path_data))

In [32]:
#INFO:
#for more parameter options: https://www.flickr.com/services/api/flickr.photos.search.html
#tags (Optional): A comma-delimited list of tags. Photos with one or more of the tags listed will be returned (or
#with all of them, by changing tags_mode). You can exclude results that match a term by prepending it with a - character.
#http://joequery.me/code/flickr-api-image-search-python/
#license info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#geolocation should not be used as a filter: a picture might, for example, have been taken in a museum. Geolocation
#should instead be added based on 'biology' knowledge

In [29]:
#build the list of species names from the keys of the synonym dictionary
li_species = list(dico_syn)
#safety net: stop here if a species name appears more than once
has_duplicates = len(set(li_species)) != len(li_species)
if has_duplicates:
    print('EREUR non unique species name')
    sys.exit()
print('There is %d species'%len(li_species))

There is 3730 species


In [30]:
#create appropriate folder if needed (one folder per species, named after the species)
for species in li_species:
    folder_path_s = os.path.join(path_data,species)
    #exist_ok=True: nothing happens when the folder already exists, so the cell can safely be rerun
    os.makedirs(folder_path_s, exist_ok=True)

# Download image from flickr

In [31]:
#idea: collect all the images from the beginning date onwards, until a query brings no new image. This way one can
#rerun at any time to grab only the new images

In [13]:
#choose starting date and we will take species that was not taken at this starting date
date = '30_11_2018' #in string otherwise might change if we run over two days
#progress file: pickled list of the species already requested for this starting date
species_done_file = os.path.join(path_data,'li_species_done_'+date+'.pkl')
if os.path.exists(species_done_file):
    #NOTE: unpickling can execute arbitrary code, this file must only ever be the one written by this cell
    with open(species_done_file, 'rb') as fh:
        li_species_done = pickle.load(fh)
else:
    li_species_done = []
print('%.2f percent of the species were already requested until date %s'%(len(li_species_done)/len(li_species)*100,
                                                                        date))

#download images from flickr
#Go in each species folder, and download all the photos with a taken date greater than or equal to the maximum one
#recorded in the species-metadata file if it exists, otherwise download from the beginning ("0000-00-00 00:00:00")
#A failing flickr request is retried up to 5 times (5 seconds apart); if it still fails the current species-word is
#skipped. Simply rerun the cell later (perhaps a few minutes later) to complete what is missing.
#The last recorded species is queried again on purpose: it is recorded as done BEFORE being processed, so the
#previous run may have been interrupted in the middle of it
already_done = set(li_species_done[0:-1])
li_species_to_do = [x for x in li_species if x not in already_done]
print('Hence, we have %d species left to query for'%len(li_species_to_do))

for species in tqdm.tqdm(li_species_to_do):

    #save all previous species as done until that specific date
    #the species being resumed is already the last entry: appending it a second time would make a later rerun
    #(which skips everything but the last entry) wrongly consider it as complete
    if species not in li_species_done:
        li_species_done.append(species)
    with open(species_done_file, 'wb') as fh:
        pickle.dump(li_species_done, fh)

    #list of synonyms for the species
    li_syn = dico_syn[species]

    #initialize folder path and metadata file path for this species
    folder_path_s = os.path.join(path_data, species)
    old_meta_data_file = os.path.join(folder_path_s,'flickr_df_'+species+'.csv')

    #iterate through each species synonyms
    for species_word in li_syn:

        #make sure li_syn is a list (iterating over a plain string would give single characters)
        if len(species_word)==1:
            print('ERROR:',species_word, species)
            sys.exit()

        #initialization
        t = "0000-00-00 00:00:00"
        df_old = pd.DataFrame()

        #if the new collection of images is empty, then stop it, otherwise continue with the last taken date
        while True:

            #open the existing metadata file if any, and use the max taken date to grab data from that point instead
            if os.path.exists(old_meta_data_file):
                #date will be: Timestamp('2017-04-30 17:12:52')
                df_old = pd.read_csv(old_meta_data_file, parse_dates=['datetaken'], index_col=False, sep=';')
                #we need to take minus one day as min_taken_date is apparently working at day level, and before saving
                #we'll need to remove possible duplicates (might happen if several pictures were taken the same day but
                #we stopped at a "middle picture of the day"). Also we convert to the format expected by the flickr query
                df_ = df_old[df_old['species_word']==species_word].copy()
                if df_.shape[0]>0:
                    t = (max(df_['datetaken'].tolist()) - dt.timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")

            #try at most 5 times if there is an error (e.g. a connection error)
            k = 0
            image_data = None
            while k<5:
                try:
                    image_data = flickr.photos.search(text='\"'+species_word+'\"', content_type=1, media="photos", 
                                                      per_page=500, extras=extras, min_taken_date=t)
                    k = 5
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    k = k+1
                    print('Not able to DOWNLOAD flickr img for species %s, due to error: %s, lets SLEEP'%(species,e))
                    # sleep for 5 seconds
                    image_data=None
                    time.sleep(5)

            #every attempt failed: give up on this species-word and go on with the next one
            if image_data is None:
                break

            #download every returned photo that has a 'url_c' entry
            for photo in image_data['photos']['photo']: #besides photos there is only a 'stat' key
                if 'url_c' in photo:
                    url = photo['url_c']
                    img_file = os.path.join(folder_path_s,'flickr_'+species+'_'+photo['id']+".png")
                    get_image(url=url, path=img_file, name=species)

            #create new metadata file with all the images (old and new)
            df_new = pd.DataFrame(image_data['photos']['photo'])
            df_new['species_word'] = species_word
            df = pd.concat([df_old, df_new], ignore_index=True)

            #save and remove duplicates (before: uniform the id type, as when we save and open the str get converted
            #to int)
            if df.shape[0]>0:
                df['id'] = df['id'].astype(int) 
                #drop duplicates due to dates that must overlap when re-querying data for the second time
                #we keep trace of each image even if it was already found for another synonym: in this way we directly
                #know which image responds to which species-word, and which last-taken date belongs to which
                #species-word
                df = df.drop_duplicates(subset=['id', 'species_word'], keep='first', inplace=False)
                #save metadata for each image of this species
                df.to_csv(old_meta_data_file, index=False, sep=';')

            #stop when this round brought no new row. Testing df_new.shape[0]==0 would be wrong: because of the one
            #day overlap a query can return images that are already recorded
            if df.shape[0]==df_old.shape[0]:
                del image_data
                break
                

  0%|          | 0/1619 [00:00<?, ?it/s]

7.97 percent of the species were already requested until date 29_11_2018
Hence, we have 1619 species left to query for


 12%|█▏        | 199/1619 [1:03:25<7:32:38, 19.13s/it]do_request: Status code 502 received, content:
    <html>
<head><title>502 Bad Gateway</title></head>
<body bgcolor="white">
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx/1.7.6</center>
</body>
</html>



Not able to DOWNLOAD flickr img for species Chironius multiventris, due to error: do_request: Status code 502 received, lets SLEEP


 15%|█▍        | 235/1619 [1:16:36<7:31:09, 19.56s/it]do_request: Status code 500 received, content:
    


Not able to DOWNLOAD flickr img for species Contia tenuis, due to error: do_request: Status code 500 received, lets SLEEP


 15%|█▍        | 238/1619 [1:18:28<7:35:18, 19.78s/it]do_request: Status code 502 received, content:
    <!DOCTYPE html>
<html lang="en-us"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
    <meta charset="utf-8">
    <title>Yahoo</title>
    <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <style>
html {
    height: 100%;
}
body {
    background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;
    background-size: cover;
    height: 100%;
    text-align: center;
    font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;
}
table {
    height: 100%;
    width: 100%;
    table-layout: fixed;
    border-collapse: collapse;
    border-spacing: 0;
    border: none;
}
h1 {
    font-size: 42px;
    font-weight: 400;
    color: #400090;
}
p {
    color: #1A1A1A;
}
#message-1 {
    font-weight: bold;
    margin: 0;
}

Not able to DOWNLOAD flickr img for species Corallus cookii, due to error: do_request: Status code 502 received, lets SLEEP


 17%|█▋        | 274/1619 [1:40:03<8:11:08, 21.91s/it]

Not able to DOWNLOAD flickr img for species Crotaphopeltis hippocrepis, due to error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)), lets SLEEP


 25%|██▍       | 401/1619 [2:23:34<7:16:04, 21.48s/it]

Not able to DOWNLOAD flickr img for species Epicrates cenchria, due to error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)), lets SLEEP


 26%|██▋       | 427/1619 [2:33:06<7:07:23, 21.51s/it]do_request: Status code 502 received, content:
    <html>
<head><title>502 Bad Gateway</title></head>
<body bgcolor="white">
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx/1.7.6</center>
</body>
</html>



Not able to DOWNLOAD flickr img for species Eunectes notaeus, due to error: do_request: Status code 502 received, lets SLEEP


 32%|███▏      | 519/1619 [3:03:46<6:29:29, 21.25s/it]do_request: Status code 502 received, content:
    <!DOCTYPE html>
<html lang="en-us"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
    <meta charset="utf-8">
    <title>Yahoo</title>
    <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <style>
html {
    height: 100%;
}
body {
    background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;
    background-size: cover;
    height: 100%;
    text-align: center;
    font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;
}
table {
    height: 100%;
    width: 100%;
    table-layout: fixed;
    border-collapse: collapse;
    border-spacing: 0;
    border: none;
}
h1 {
    font-size: 42px;
    font-weight: 400;
    color: #400090;
}
p {
    color: #1A1A1A;
}
#message-1 {
    font-weight: bold;
    margin: 0;
}

Not able to DOWNLOAD flickr img for species Lampropeltis getula, due to error: do_request: Status code 502 received, lets SLEEP


 43%|████▎     | 695/1619 [3:53:25<5:10:20, 20.15s/it]do_request: Status code 502 received, content:
    <html>
<head><title>502 Bad Gateway</title></head>
<body bgcolor="white">
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx/1.7.6</center>
</body>
</html>



Not able to DOWNLOAD flickr img for species Daboia mauritanica, due to error: do_request: Status code 502 received, lets SLEEP


 62%|██████▏   | 1006/1619 [5:38:18<3:26:09, 20.18s/it]do_request: Status code 502 received, content:
    <html>
<head><title>502 Bad Gateway</title></head>
<body bgcolor="white">
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx/1.7.6</center>
</body>
</html>



Not able to DOWNLOAD flickr img for species Rhadinaea flavilata, due to error: do_request: Status code 502 received, lets SLEEP


 71%|███████   | 1150/1619 [6:22:32<2:36:00, 19.96s/it]

Not able to DOWNLOAD flickr img for species Thamnophis sirtalis, due to error: Unterminated string starting at: line 1 column 231461 (char 231460), lets SLEEP


 89%|████████▉ | 1443/1619 [7:38:22<55:54, 19.06s/it]  do_request: Status code 502 received, content:
    <html>
<head><title>502 Bad Gateway</title></head>
<body bgcolor="white">
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx/1.7.6</center>
</body>
</html>



Not able to DOWNLOAD flickr img for species Trimeresurus insularis, due to error: do_request: Status code 502 received, lets SLEEP


100%|██████████| 1619/1619 [8:26:49<00:00, 18.78s/it]


# Check amount of flickr images

In [4]:
#list the image files that are really present on disk: every '.png' found directly inside each entry of the
#flickr data folder
li_flickr_images = [
    img_file
    for species_dir in glob.glob(os.path.join(path_data,'*'))
    for img_file in glob.glob(os.path.join(species_dir,'*'))
    if img_file.endswith('.png')
]
n_images = len(li_flickr_images)
print('We have %d images collected from Flickr'%n_images)

We have 46686 images collected from Flickr


## create one csv file with all metadata info from each species

In [5]:
#remove images in several species
def _flickr_id_from_path(img_path):
    """Return the flickr photo id encoded in an image file name 'flickr_<species>_<id>.png'."""
    return os.path.basename(img_path).split('_')[-1].split('.')[0]

#ids found in more than one file, i.e. the same photo saved under several species
id_counts = Counter(_flickr_id_from_path(img_path) for img_path in li_flickr_images)
id_to_be_removed = [photo_id for photo_id, n_files in id_counts.items() if n_files>1]
if len(id_to_be_removed)>0:
    print('ERROR: images in different species! probably 1 species-word was associated to several species over time')

#delete every copy of these ambiguous images from disk and keep only the other paths in the list
#(the previous version called .remove('i') with the string 'i' instead of the path, which raised a ValueError as
#soon as one image had to be removed)
ids_to_remove = set(id_to_be_removed)
li_flickr_images_ = []
for img_path in li_flickr_images:
    if _flickr_id_from_path(img_path) in ids_to_remove:
        #delete file
        os.remove(img_path)
    else:
        li_flickr_images_.append(img_path)
li_flickr_images = li_flickr_images_

In [6]:
#gather the metadata of every species: one frame per entry of the data folder holding exactly one csv file,
#tagged with the name of that entry as 'species'
li_df = []
for species_dir in tqdm.tqdm(glob.glob(os.path.join(path_data,'*'))):
    csv_f = glob.glob(os.path.join(species_dir,'*.csv'))
    if len(csv_f)!=1:
        continue
    li_df.append(
        pd.read_csv(csv_f[0], sep=';', index_col=False).assign(species=species_dir.split('/')[-1])
    )
#stack everything in a single frame with a fresh index
df_all = pd.concat(li_df, ignore_index=True)
print(df_all.shape)
df_all.head(3)

100%|██████████| 3734/3734 [00:23<00:00, 156.71it/s]


(1860111, 31)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,place_id,secret,server,species,species_word,tags,title,url_c,width_c,woeid
0,0.0,0.0,2018-09-21 10:26:53,0.0,0.0,{'_content': ''},5.0,,,,...,,bfa1390f97,4888.0,Vipera seoanei,Pelias seoanei,,Vipera seoanei,https://farm5.staticflickr.com/4888/4584590182...,800.0,
1,0.0,0.0,2018-07-21 18:24:43,0.0,0.0,{'_content': ''},2.0,,,,...,,13793495eb,1902.0,Vipera seoanei,Pelias seoanei,,Vipera seoanei,https://farm2.staticflickr.com/1902/4518115072...,800.0,
2,0.0,0.0,2018-09-08 10:10:51,0.0,0.0,{'_content': ''},2.0,,,,...,,33fcf41e9a,1946.0,Vipera seoanei,Pelias seoanei,,Vipera seoanei,https://farm2.staticflickr.com/1946/3024247907...,800.0,


In [8]:
#add var: name under which each image was saved, and its full path on disk
df_all['saved_img_id'] = [
    'flickr_'+species_name+'_'+str(photo_id)+".png"
    for species_name, photo_id in zip(df_all['species'], df_all['id'])
]
df_all['img_path'] = [
    os.path.join(path_data, species_name, file_name)
    for species_name, file_name in zip(df_all['species'], df_all['saved_img_id'])
]
print(df_all.shape)
#keeping only the rows whose image is really present on disk
is_on_disk = df_all['img_path'].isin(li_flickr_images)
df_all = df_all[is_on_disk]
print(df_all.shape)
df_all.head(3)

(1860111, 33)
(1333499, 33)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,server,species,species_word,tags,title,url_c,width_c,woeid,saved_img_id,img_path
0,0.0,0.0,2018-09-21 10:26:53,0.0,0.0,{'_content': ''},5.0,,,,...,4888.0,Vipera seoanei,Vipera seoanei,,Vipera seoanei,https://farm5.staticflickr.com/4888/4584590182...,800.0,,flickr_Vipera seoanei_45845901821.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,0.0,0.0,2018-07-21 18:24:43,0.0,0.0,{'_content': ''},2.0,,,,...,1902.0,Vipera seoanei,Vipera seoanei,,Vipera seoanei,https://farm2.staticflickr.com/1902/4518115072...,800.0,,flickr_Vipera seoanei_45181150721.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,0.0,0.0,2018-09-08 10:10:51,0.0,0.0,{'_content': ''},2.0,,,,...,1946.0,Vipera seoanei,Vipera seoanei,,Vipera seoanei,https://farm2.staticflickr.com/1946/3024247907...,800.0,,flickr_Vipera seoanei_30242479077.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [9]:
#one row per image id. The species words that returned the image are gathered in a list, so that we know which
#image reacts to which species word (all from the same species, images shared by several species were removed)
df_all1 = (
    df_all
    .groupby('id')['species_word']
    .agg(lambda words: list(set(words)))
    .reset_index()
    .rename(columns={0:'species_word'})
)
print(df_all1.shape)
#remaining columns: first row of each (id, species, img_path) combination
df_all2 = (
    df_all
    .drop(['species_word'], axis=1)
    .drop_duplicates(subset=['id','species','img_path'], keep='first')
)
print(df_all2.shape)
#put the list of species words back next to the other columns
df_all_final = df_all1.merge(df_all2, how='outer', on='id')
print(df_all_final.shape)
df_all_final.head(3)

(46686, 2)
(46686, 32)
(46686, 33)


Unnamed: 0,id,species_word,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,...,secret,server,species,tags,title,url_c,width_c,woeid,saved_img_id,img_path
0,6796830748,[Cacophis squamulosus],0.0,0.0,2012-02-29 19:14:05,0.0,0.0,"{'_content': 'Lamington National Park, Qld'}",8.0,,...,3b0faf2da2,7204.0,Cacophis squamulosus,,Cacophis squamulosus (Golden-crowned Snake),https://farm8.staticflickr.com/7204/6796830748...,800.0,,flickr_Cacophis squamulosus_6796830748.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,6799927566,[Masticophis bilineatus],16.0,0.0,2011-06-11 21:52:24,0.0,0.0,"{'_content': 'Sonoran Whipsnake, Masticophis b...",8.0,0.0,...,6928029cc3,7207.0,Masticophis bilineatus,,IMG_3510,https://farm8.staticflickr.com/7207/6799927566...,800.0,28747586.0,flickr_Masticophis bilineatus_6799927566.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,6800854734,[Natrix maura],0.0,0.0,2012-02-28 16:26:30,0.0,0.0,{'_content': ''},8.0,,...,11343517a8,7177.0,Natrix maura,,Natrix maura,https://farm8.staticflickr.com/7177/6800854734...,800.0,,flickr_Natrix maura_6800854734.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [10]:
#species + andrew syn: 45'520
#species + andrew syn + language: 47101 + english: 47396

In [11]:
#see if for each image we have its info in df
d = len(li_flickr_images)-df_all_final.shape[0]
if d!=0:
    print('ERROR: we have %d images that does not appear in the df'%d)
    sys.exit()
#see if needed
#li_= [i for i in li_flickr_images if i not in df_all['img_path'].tolist()]
#len(li_)

#verify that each image appear only once:
#is_unique also works on an empty frame (max() of an empty sequence raised), and the message is printed as is: it
#holds no placeholder, so formatting it with %d raised a TypeError instead of reporting the problem
if not df_all_final['id'].is_unique:
    print('ERROR: we have images that appear twice in teh df')
    sys.exit()
   

In [12]:
#write the final metadata table (one row per image id) at the root of the flickr data folder
#(might not be of same size of number of collected images)
flickr_info_file = os.path.join(path_data,'flickr_image_info.csv')
df_all_final.to_csv(flickr_info_file, sep=';', index=False)