In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
import pickle
from collections import Counter

#to set connection with Flickr API
from flickrapi import FlickrAPI

#image
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen

#date
import datetime as dt
from datetime import datetime

#plot
import cv2
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
#Make the project packages importable (the `UTILS` import right below relies on this).
#'__file__' is a literal string here, not the module attribute, so the path is built from the
#current working directory; PACKAGE_PARENT is then resolved relative to it.
PACKAGE_PARENT = '../../..'
_notebook_file = os.path.join(os.getcwd(), os.path.expanduser('__file__'))
SCRIPT_DIR = os.path.dirname(os.path.realpath(_notebook_file))
_package_root = os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))
sys.path.append(_package_root)

from UTILS.utils import get_image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Root directory of the project: the parent of the directory the notebook is run from
ROOT_DIR = os.path.abspath("../")
# Folder that receives everything downloaded from Flickr (one sub-folder per species)
_flickr_subdir = 'datasets/flickr'
path_data = os.path.join(ROOT_DIR, _flickr_subdir)

In [4]:
#license info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#license:0 = 'All rights reserved'

# API parameters

In [5]:
#Flickr API credentials.
#To obtain a key/secret pair follow the directions of: http://joequery.me/code/flickr-api-image-search-python/
#They are read from the environment variables FLICKR_PUBLIC and FLICKR_SECRET instead of being written
#in the notebook, because anything stored in the notebook is readable by everyone who can see the file.
#NOTE(review): a key pair that was ever committed in this cell should be considered leaked and regenerated.
FLICKR_PUBLIC = os.environ.get('FLICKR_PUBLIC')
FLICKR_SECRET = os.environ.get('FLICKR_SECRET')
if not FLICKR_PUBLIC or not FLICKR_SECRET:
    #fail here with a clear message rather than later inside the first API call
    raise RuntimeError('Flickr credentials are missing: set the FLICKR_PUBLIC and FLICKR_SECRET '
                       'environment variables before running this notebook')

In [6]:
#additional fields requested for every photo returned by a search
extras = 'description,geo,tags,url_c,owner_name,date_taken,license'
#API client; with format='parsed-json' the responses are indexed like dictionaries further down
#(image_data['photos']['photo'])
flickr = FlickrAPI(FLICKR_PUBLIC, FLICKR_SECRET, format='parsed-json')

In [7]:
#create the Flickr data folder if it does not exist yet
#exist_ok=True makes the call a no-op on re-runs and removes the gap between "check" and "create"
os.makedirs(path_data, exist_ok=True)

# Download species and their synonyms

In [7]:
#load the dictionary with keys=species, value=list of its synonyms
#the file is opened in a `with` block so that the handle is closed as soon as the pickle is read
#NOTE(review): pickle.load can execute arbitrary code - only load a pickle file you produced yourself
with open(os.path.join(ROOT_DIR,'datasets','synonyms','dico_species_lisyn.pkl'), 'rb') as synonyms_file:
    dico_syn = pickle.load(synonyms_file)

In [8]:
#in case to erase all
#shutil.rmtree(os.path.join(path_data))

In [9]:
#INFO:
#for more parameter options: https://www.flickr.com/services/api/flickr.photos.search.html
#tags (Optional): A comma-delimited list of tags. Photos with one or more (or all tags by changing tags_mode)of the 
#tags listed will be returned. You can exclude results that match a term by prepending it with a - character.
#http://joequery.me/code/flickr-api-image-search-python/
#license info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#geolocation should not be used, as a picture might, for example, have been taken in a museum. We should add geolocation 
#based on 'biology' knowledge

In [10]:
#the species are the keys of the synonyms dictionary
li_species = [*dico_syn.keys()]
n_species = len(li_species)
#stop if the same species name shows up more than once
if len(set(li_species)) != n_species:
    print('EREUR non unique species name')
    sys.exit()
print('There is %d species'%n_species)

There is 3730 species


In [11]:
#create one folder per species if needed
#exist_ok=True makes the call a no-op for folders that already exist, so the cell can be re-run safely
for species in li_species:
    folder_path_s = os.path.join(path_data,species)
    os.makedirs(folder_path_s, exist_ok=True)

# Download image from flickr

In [12]:
#idea: collect all the images from the beginning date onwards, until a query brings back no new image. In this way one can 
#rerun at any time to grab only the new images

In [13]:
def _search_flickr_photos(species, species_word, min_taken_date, max_tries=5, wait_seconds=5):
    """Run one flickr.photos.search query for a species word, retrying when the call fails.

    species: species the word belongs to (only used in the log message)
    species_word: species name or synonym; it is wrapped in double quotes in the query text
    min_taken_date: value passed as min_taken_date, formatted as "%Y-%m-%d %H:%M:%S"
    max_tries: total number of attempts before giving up
    wait_seconds: pause between two attempts

    Returns the response of flickr.photos.search (per_page=500 and no page argument, so presumably
    only the first page of results - TODO confirm against the Flickr API documentation).
    Raises RuntimeError, chained to the last error, once max_tries attempts have failed.
    KeyboardInterrupt is not an Exception subclass, so it is never swallowed by the retry.
    """
    for attempt in range(1, max_tries+1):
        try:
            return flickr.photos.search(text='\"'+species_word+'\"', content_type=1, media="photos",
                                        per_page=500, extras=extras, min_taken_date=min_taken_date)
        except Exception as e:
            print('Not able to DOWNLOAD flickr img for species %s, due to error: %s, lets SLEEP'%(species,e))
            if attempt==max_tries:
                #give up with the real cause attached (might be a connection error: rerun a few minutes later)
                raise RuntimeError('flickr search failed %d times for species %s'%(max_tries, species)) from e
            time.sleep(wait_seconds)


#choose starting date and we will take species that was not taken at this starting date
date = '01_12_2018' #in string otherwise might change if we run over two days
species_done_file = os.path.join(path_data,'li_species_done_'+date+'.pkl')
#os.path.isfile instead of glob: a path is not a pattern, glob would misread characters such as '[' or '*'
if os.path.isfile(species_done_file):
    with open(species_done_file, 'rb') as done_file:
        li_species_done = pickle.load(done_file)
else:
    li_species_done = []
#max(..., 1) only protects the division when the species list is empty
print('%.2f percent of the species were already requested until date %s'%(
    len(li_species_done)/max(len(li_species), 1)*100, date))

#download images from flickr
#Go in each species folder, and download all the photos with a taken date greater than or equal to the maximum one
#recorded in the species-metadata file if it exists, otherwise download from the beginning ("0000-00-00 00:00:00")
#While downloading, if flickr keeps failing the code stops (might be a connection error). Then you simply need to
#rerun it, perhaps a few minutes later
#A species is recorded as done BEFORE it is downloaded, so the last recorded one may have been interrupted:
#it is dropped from the record, which makes it part of the species to query and avoids recording it twice
li_species_done = li_species_done[:-1]
already_done = set(li_species_done) #set: constant-time membership test
li_species_to_do = [x for x in li_species if x not in already_done]
print('Hence, we have %d species left to query for'%len(li_species_to_do))

for species in tqdm.tqdm(li_species_to_do):

    #save all previous species as done until that specific date
    li_species_done.append(species)
    with open(species_done_file, 'wb') as done_file:
        pickle.dump(li_species_done, done_file)

    #list of synonyms for the species, plus the species name itself
    li_syn = dico_syn[species] + [species]

    #folder and metadata file of this species (one metadata file shared by all its species words)
    folder_path_s = os.path.join(path_data, species)
    meta_data_file = os.path.join(folder_path_s,'flickr_df_'+species+'.csv')

    #iterate through each species synonyms
    for species_word in li_syn:

        #a one-character word is not a usable search term and hints at a malformed synonyms entry
        if len(species_word)==1:
            raise ValueError('ERROR: one-character species word %r for species %r'%(species_word, species))

        #initialization
        t = "0000-00-00 00:00:00"
        df_old = pd.DataFrame()

        #query until a query brings no new row, each time restarting from the last taken date
        while True:

            #open the existing metadata file if any, and use the max taken date to grab data from that point instead
            if os.path.isfile(meta_data_file):
                #date will be: Timestamp('2017-04-30 17:12:52')
                df_old = pd.read_csv(meta_data_file, parse_dates=['datetaken'], index_col=False, sep=';')

                #we take minus one day as min_taken_date is apparently working at day level; the duplicates this
                #creates are removed before saving. Also we convert to the format used for the flickr query
                df_ = df_old[df_old['species_word']==species_word]
                if df_.shape[0]>0:
                    t = (df_['datetaken'].max() - dt.timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")

            image_data = _search_flickr_photos(species, species_word, t)
            photos = image_data['photos']['photo'] #besides photos there is only a 'stat' key

            #download every returned image that comes with a url_c link
            for photo in photos:
                if 'url_c' in photo:
                    img_path = os.path.join(folder_path_s,'flickr_'+species+'_'+photo['id']+".png")
                    get_image(url=photo['url_c'], path=img_path, name=species)

            #create new metadata with all the images (old and new)
            df_new = pd.DataFrame(photos)
            df_new['species_word'] = species_word
            df = pd.concat([df_old, df_new], ignore_index=True)

            #save and remove duplicates
            if df.shape[0]>0:
                #uniform the id type first (ids written as str are read back from the csv as int)
                df['id'] = df['id'].astype(int)
                #drop duplicates due to dates that must overlap when re-querying; one row is kept per
                #(image, species word) so that we know which image answered to which species word, and
                #which last taken date belongs to which species word
                df = df.drop_duplicates(subset=['id', 'species_word'], keep='first')
                df.to_csv(meta_data_file, index=False, sep=';')

            #stop when the last query brought no new row. Testing df_new.shape[0]==0 would be wrong: because of
            #the one-day overlap a query can return images that are already recorded
            if df.shape[0]==df_old.shape[0]:
                break
                

  0%|          | 0/730 [00:00<?, ?it/s]

82.65 percent of the species were already requested until date 01_12_2018
Hence, we have 730 species left to query for


100%|██████████| 730/730 [38:20<00:00,  3.15s/it] 


# Check amount of flickr images

In [8]:
#look at the images we really have on disk: every .png file found one level below path_data
li_flickr_images = [
    img_file
    for species_dir in glob.glob(os.path.join(path_data,'*'))
    for img_file in glob.glob(os.path.join(species_dir,'*'))
    if img_file.endswith('.png')
]
print('We have %d images collected from Flickr'%len(li_flickr_images))

We have 57828 images collected from Flickr


## create one csv file with all metadata info from each species

In [8]:
def _flickr_id_from_path(img_path):
    """Return the id part of an image path named '.../flickr_<species>_<id>.png' (text after the last '_')."""
    #os.path.basename instead of split('/') so that the platform path separator is honoured
    return os.path.basename(img_path).split('_')[-1].split('.')[0]


#remove images in several species (can happen if an image has two species-word in different species)
id_counts = Counter(_flickr_id_from_path(img_path) for img_path in li_flickr_images)
id_to_be_removed = [flickr_id for flickr_id, count in id_counts.items() if count>1]
if len(id_to_be_removed)>0:
    print('%d images appears in different species, we will remove them'%len(id_to_be_removed))

#set lookup + building the kept list directly: the previous list membership / list.remove was quadratic
ids_to_drop = set(id_to_be_removed)
li_flickr_images_ = []
for img_path in li_flickr_images:
    if _flickr_id_from_path(img_path) in ids_to_drop:
        #delete file (and do not keep it in the list)
        os.remove(img_path)
    else:
        li_flickr_images_.append(img_path)
li_flickr_images = li_flickr_images_

In [9]:
#gather the metadata of every species into one dataframe, adding the species (= folder name) as a column
li_df = []
for species_dir in tqdm.tqdm(glob.glob(os.path.join(path_data,'*'))):
    csv_f = glob.glob(os.path.join(species_dir,'*.csv'))
    #entries of path_data that do not contain exactly one csv file are skipped
    if len(csv_f)==1:
        df = pd.read_csv(csv_f[0], sep=';', index_col=False)
        #os.path.basename instead of split('/') so that the platform path separator is honoured
        df['species'] = os.path.basename(species_dir)
        li_df.append(df)
if not li_df:
    #pd.concat refuses an empty list; say where we looked instead
    raise ValueError('no species metadata csv file found under %s'%path_data)
df_all = pd.concat(li_df, ignore_index=True)
print(df_all.shape)
df_all.head(3)

100%|██████████| 3736/3736 [02:01<00:00, 30.75it/s]


(1885146, 31)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,place_id,secret,server,species,species_word,tags,title,url_c,width_c,woeid
0,16.0,0.0,2018-05-16 09:48:21,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 11.5 km W...",2.0,0.0,0.0,0.0,...,6x9R3.VYUrzZVccn_Q,1fcbb40f2d,1743.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm2.staticflickr.com/1743/2761364125...,800.0,90612105.0
1,16.0,0.0,2018-05-16 18:24:55,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 6 km NW o...",1.0,0.0,0.0,0.0,...,6x9R3.VYUrzZVccn_Q,6354620178,886.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm1.staticflickr.com/886/27613634797...,800.0,90612105.0
2,16.0,0.0,2018-05-16 18:27:31,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 6 km NW o...",2.0,0.0,0.0,0.0,...,6x9R3.VYUrzZVccn_Q,7c9a11b846,1725.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm2.staticflickr.com/1725/4248392897...,800.0,90612105.0


In [10]:
#add the file name and the full path under which each image was saved by the download step
#built column-wise: a row-wise apply(axis=1) over the whole frame is far slower for the same result
df_all['saved_img_id'] = 'flickr_' + df_all['species'] + '_' + df_all['id'].astype(str) + ".png"
df_all['img_path'] = [os.path.join(path_data, species_name, img_name)
                      for species_name, img_name in zip(df_all['species'], df_all['saved_img_id'])]
print(df_all.shape)
#keeping only the images we truly have on disk
df_all = df_all[df_all['img_path'].isin(li_flickr_images)]
print(df_all.shape)
df_all.head(3)

(1885146, 33)
(1289598, 33)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,server,species,species_word,tags,title,url_c,width_c,woeid,saved_img_id,img_path
0,16.0,0.0,2018-05-16 09:48:21,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 11.5 km W...",2.0,0.0,0.0,0.0,...,1743.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm2.staticflickr.com/1743/2761364125...,800.0,90612105.0,flickr_Vipera renardi_27613641257.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,16.0,0.0,2018-05-16 18:24:55,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 6 km NW o...",1.0,0.0,0.0,0.0,...,886.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm1.staticflickr.com/886/27613634797...,800.0,90612105.0,flickr_Vipera renardi_27613634797.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,16.0,0.0,2018-05-16 18:27:31,0.0,0.0,"{'_content': 'Russia: Rostov Oblast, 6 km NW o...",2.0,0.0,0.0,0.0,...,1725.0,Vipera renardi,Vipera [renardi],viperidae vipera ursinii renardi steppeviper r...,Viperidae: Vipera renardi renardi (Steppe Vipe...,https://farm2.staticflickr.com/1725/4248392897...,800.0,90612105.0,flickr_Vipera renardi_42483928971.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [11]:
#one row per image id: the species words that returned the image are gathered in a list, so that we
#know which image answered to which species word
species_words_per_id = df_all.groupby('id')['species_word'].agg(lambda words: list(set(words)))
df_all1 = species_words_per_id.reset_index().rename(columns={0:'species_word'})
print(df_all1.shape)
#remaining columns, one row per (id, species, img_path)
df_all2 = (
    df_all
    .drop(columns=['species_word'])
    .copy()
    .drop_duplicates(subset=['id','species','img_path'], keep='first')
)
print(df_all2.shape)
#put both parts back together on the image id
df_all_final = df_all1.merge(df_all2, how='outer', on='id')
print(df_all_final.shape)
df_all_final.head(3)

(57828, 2)
(57828, 32)
(57828, 33)


Unnamed: 0,id,species_word,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,...,secret,server,species,tags,title,url_c,width_c,woeid,saved_img_id,img_path
0,6796830748,"[l, Pseudoelaps atropolios, Aspidomorphus squa...",0.0,0.0,2012-02-29 19:14:05,0.0,0.0,"{'_content': 'Lamington National Park, Qld'}",8.0,,...,3b0faf2da2,7204.0,Cacophis squamulosus,,Cacophis squamulosus (Golden-crowned Snake),https://farm8.staticflickr.com/7204/6796830748...,800.0,,flickr_Cacophis squamulosus_6796830748.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
1,6797027978,[California kingsnake],0.0,0.0,2011-04-22 14:11:35,0.0,0.0,{'_content': 'Santa Clara County 4/22/2012'},8.0,,...,b5d546df2e,7203.0,Lampropeltis californiae,,California Kingsnake,https://farm8.staticflickr.com/7203/6797027978...,800.0,,flickr_Lampropeltis californiae_6797027978.png,/home/camille/vm_exchange/Lab/snakes/datasets/...
2,6799008978,[Emerald tree boa],0.0,0.0,2012-03-01 16:24:59,0.0,1.0,{'_content': 'emerald tree boa wpz'},8.0,,...,c93e0025bd,7181.0,Corallus caninus,park tree woodland zoo snake boa emerald seatl...,emerald tree boa wpz 100_0301R3,https://farm8.staticflickr.com/7181/6799008978...,800.0,,flickr_Corallus caninus_6799008978.png,/home/camille/vm_exchange/Lab/snakes/datasets/...


In [13]:
#see if for each image we have its info in df
d = len(li_flickr_images)-df_all_final.shape[0]
if d!=0:
    print('ERROR: we have %d images that does not appear in the df, we will remove them'%d)

#lets remove them: delete from disk, and from the list, every image whose path has no metadata row
#the known paths are put in a set once; rebuilding the column as a list for every image made this
#step quadratic
known_img_paths = set(df_all_final['img_path'])
print(len(li_flickr_images))
li_flickr_images_ = []
for img_path in li_flickr_images:
    if img_path in known_img_paths:
        li_flickr_images_.append(img_path)
    else:
        #delete file (and do not keep it in the list)
        os.remove(img_path)
li_flickr_images = li_flickr_images_
print(len(li_flickr_images))

ERROR: we have 48 images that does not appear in the df


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [29]:
#verify that each image appears only once
#duplicated() marks every repetition after the first one, so the sum is the number of extra rows
n_duplicated = int(df_all_final['id'].duplicated().sum())
if n_duplicated>0:
    #raising (instead of print + sys.exit) stops the notebook with a readable error
    raise ValueError('ERROR: we have %d images that appear more than once in the df'%n_duplicated)

In [None]:
#verify that we have the exact same amount of images and of metadata rows
if df_all_final.shape[0]!=len(li_flickr_images):
    #raising (instead of print + sys.exit) stops the notebook with a readable error
    raise ValueError('ERROR: still not same amount of saved flickr images (%d) and info of flickr images (%d)'%(
        len(li_flickr_images), df_all_final.shape[0]))

In [33]:
#write the merged metadata (one row per image) next to the species folders, as a ';'-separated csv
print(df_all_final.shape)
flickr_info_file = os.path.join(path_data,'flickr_image_info.csv')
df_all_final.to_csv(flickr_info_file, index=False, sep=';')

(57828, 33)


In [68]:
#Note on old data:
#4'637 of the 16'210 images of the old collection were not collected again this time; they can not be
#added back as their url, longitude and latitude info is missing
n_not_collected_again, n_old_images = 4637, 16210
x = n_not_collected_again/n_old_images*100
print('%d percent of flickr image were not taken this second time due perhaps to people removing their images'%x)

28 percent of flickr image were not taken this second time due perhaps to people removing their images
