In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
import pickle
import math

#to set connection with Flickr API
from flickrapi import FlickrAPI

#image
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen
import ssl

#date
import datetime as dt
from datetime import datetime

#plot (for image verification)
import cv2
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
ROOT_DIR = os.getcwd()

In [3]:
#INFO on flickr
#for more parameter options: https://www.flickr.com/services/api/flickr.photos.search.html
#tags (Optional): A comma-delimited list of tags. Photos with one or more (or all tags by changing tags_mode)of the 
#tags listed will be returned. You can exclude results that match a term by prepending it with a - character.
#http://joequery.me/code/flickr-api-image-search-python/
#lisence info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#geolocalisation should not be used, as for example picture might be taken from a museum. we should add geolocalisation 
#based on 'biology' knowledge

# Parameter to change

#### type of images

In [4]:
type_ = 'oiseau_du_rhone'

#### list of species and synonymes

In [5]:
#The dataframe must at least have 'Species' columns for the name of each species, and a 'li_synonyms_final' columns with list of
#their synonyms without including itself

In [6]:
df_species = pd.read_csv(os.path.join(ROOT_DIR, 'oiseau_du_rhone','ListeOiseauxRhone27-05-2016_simpler.csv'), sep=';')
print(df_species.shape)
df_species.head(3)

(354, 2)


Unnamed: 0,Species,scientific_name
0,Cygne tuberculé,Cygnus olor
1,Cygne de Bewick,Cygnus columbianus
2,Cygne chanteur,Cygnus cygnus


In [7]:
#create a columns with their synonyms
df_species['li_synonyms_final'] = df_species['scientific_name'].map(lambda x: [x] if str(x)!='nan' else [])
#dont forget to do map(lambda x: eval(x)) if not saw as a list
df_species.head(3)

Unnamed: 0,Species,scientific_name,li_synonyms_final
0,Cygne tuberculé,Cygnus olor,[Cygnus olor]
1,Cygne de Bewick,Cygnus columbianus,[Cygnus columbianus]
2,Cygne chanteur,Cygnus cygnus,[Cygnus cygnus]


In [8]:
#verify the dataframe is in adequate format
if not 'Species' in df_species.columns:
    print('ERROR: you must have Species as a colomn name with the representative name for each species')
if not 'li_synonyms_final' in df_species.columns:
    print('ERROR: you must have li_synonyms_final as a colomn name with a list fo synonym for each species')

#### API parameters

In [9]:
#to get these data fllow the direction of: http://joequery.me/code/flickr-api-image-search-python/
FLICKR_PUBLIC = 'dd0cb0ced4e83452f8d49cb3d534707d'
FLICKR_SECRET = '4c568d2002b5506e'

In [10]:
flickr = FlickrAPI(FLICKR_PUBLIC, FLICKR_SECRET, format='parsed-json')
extras = 'description,geo,tags,url_c,owner_name,date_taken,license'

# Create fodlers

In [11]:
#in case to erase all
#shutil.rmtree(os.path.join(path_data,type_))

In [12]:
path_data = os.path.join(ROOT_DIR,'datasets',type_)
#create a director if not existing for images
if not os.path.exists(path_data):
    os.makedirs(path_data)

In [13]:
#create a list of species
li_species = df_species['Species'].tolist()
li_species = [x for x in li_species if str(x) != 'nan']
#verify unicity of species name
if len(li_species)!=len(set(li_species)):
    #search for the duplicate species
    df_ = df_species['Species'].value_counts().reset_index()
    li_duplicate = df_[df_['Species']!=1]['Species'].tolist()
    print('ERROR: non unique species name (%d species in total)'%len(li_duplicate))
    print('the following first 10 species appear more than ones: %s'%' \-\ '.join(li_duplicate[0:10]))
    sys.exit()
print('There is %d species'%len(li_species))

There is 350 species


In [14]:
#create a director if not existing for images
if type_ not in [x.split('\\')[-1] for x in glob.glob(os.path.join(ROOT_DIR,'*'))]:
    print('ERROR: your type images doe snot exist as a fodler in the Flickr folder')
    sys.exit()
p = os.path.join(ROOT_DIR,'datasets',type_)
if not os.path.exists(p):
    os.makedirs(p)

In [15]:
#create one folder per species folder if not existing
for species in li_species:
    folder_path_s = os.path.join(path_data,species)
    if not os.path.exists(folder_path_s):
        os.makedirs(folder_path_s)

# Check amount of flickr images

In [16]:
#keeping meta data of only the images we truely have 
#look at the actual image we really have
li_flickr_images = []
for species in glob.glob(os.path.join(path_data,'*')):
    li_flickr_images.extend([x for x in glob.glob(os.path.join(species,'*')) if x.endswith('.png')])
len(li_flickr_images)
print('We have %d images collected from Flickr'%len(li_flickr_images))

We have 495 images collected from Flickr


# Download image from flickr

In [17]:
#idea: collect all the image from the begining date, and until no more new image are outcome. In this way one can 
#rerun at anytime to grab only the new images

In [18]:
#choose starting date and we will take species that was not taken at this starting date
date = '17_01_2019' #in string otherwise might change if we run over two days
f = os.path.join(path_data,'li_species_done_'+date+'.pkl')
if len(glob.glob(f))>0:
    li_species_done = pickle.load(open(f, 'rb'))
else:
    li_species_done = []
print('%d percent of the species were already requested until date %s'%(len(li_species_done)/len(li_species)*100,
                                                                        date))

0 percent of the species were already requested until date 17_01_2019


In [19]:
#to overcome: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)>, form urlopen(url)
gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)

In [20]:
#download images form flickr
#Go in each species folder, and downlaod all the photos with a taken date greater than or equal to the maximum one 
#recorded in the species-metadata file if it exist, otherwise download it from the begining ("0000-00-00 00:00:00")
#While downloading an image, if there is an error from flickr stop the code (might be connection error). Then you 
#simply need to rerun it perhaps few minutes later
li_species_to_do = [x for x in li_species if x not in li_species_done]
print('We have %d species left to query for'%len(li_species_to_do))

for nbr, species in tqdm.tqdm(enumerate(li_species_to_do)):
    
    #save all previous species as done until that specific date
    li_species_done = li_species[0:nbr]
    pickle.dump(li_species_done, open(os.path.join(path_data,'li_species_done_'+date+'.pkl'), 'wb'))
    
    #list of synonyms for the species
    li_syn = df_species[df_species['Species']==species]['li_synonyms_final'].values[0] + [species]
    
    #initialize folder path for this species
    folder_path_s = os.path.join(path_data, species)
    
    #iterate through each species synonyms
    for species_word in li_syn:
        print(species_word)
        
        #initialization
        t = "0000-00-00 00:00:00"
        df_old = pd.DataFrame()
    
        #if the new collection of images is empty, then stop it, otherwise continue with the last taken date
        while True:

            #open the existing metadata file if any, and use the max taken date to grab data from that point instead
            old_meta_data_file = os.path.join(folder_path_s,'flickr_df_'+species+'.csv')
            if len(glob.glob(old_meta_data_file))>0:
                #date will be: Timestamp('2017-04-30 17:12:52')
                df_old = pd.read_csv(old_meta_data_file, parse_dates=['datetaken'], index_col=False, sep=';', 
                                     engine='python') #engine='python': to avoid OSError: Initializing from file failed
                #we need to take minus one day as min_taken_date is apparently working at day level, and before saving
                #we'll need to remove possibly duplicates (might happen if several picture taken the same day but we stop
                #at a "middle picture of the day"). Also we convert to good format for flickr query
                df_ = df_old[df_old['species_word']==species_word].copy()
                if df_.shape[0]>0:
                    t = (max(df_['datetaken'].tolist()) - dt.timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
                    #print('we will use as starting date %s'%str(t))
            
            #take at most 10 times if their is a connection error connection error )
            k = 0
            while k<10:
                try:
                    image_data = flickr.photos.search(text='\"'+species+'\"', content_type=1, media="photos", 
                                                      per_page=500, extras=extras, min_taken_date=t)
                    k = 10
                except KeyboardInterrupt:
                    raise
                except ConnectionError as e:
                    k = k+1
                    print('DOWNLOAD ISSUE for species %s, due to error: %s, lets SLEEP'%(species,e))
                    # sleep for 5 seconds
                    time.sleep(5)
                    
                except Exception as e:
                    print('DOWNLOAD ISSUE for species %s, due to error: %s, lets STOP'%(species, e))
                    image_data = None
                    k = 10
                    
            if image_data==None:
                print('image is none get out of loop')
                break
                
            #download image with its url and save it if its a new one
            for i, photo in enumerate(image_data['photos']['photo']): #besides photos there is only a 'stat' key
                if 'url_c' in photo:
                    url = photo['url_c']
                    try:
                        if len(glob.glob(os.path.join(folder_path_s,species+'_'+photo['id']+".png")))==0:
                            img = Image.open(urlopen(url, context=gcontext))
                            img.save(os.path.join(folder_path_s,species+'_'+photo['id']+".png"))
                            del img
                    except KeyboardInterrupt:
                        raise
                    except Exception as e:
                        print(e)
                        print('SAVE ISSUE for species %s and url %s'%(species,str(url)))

            #create new metadata file with all the images (old and new)
            df_new = pd.DataFrame(image_data['photos']['photo'])
            df_new['species_word'] = species_word
            df = pd.concat([df_old, df_new], ignore_index=True)

            #save and remove duplicates (first, uniform the id type(as when we save and open the str get converted 
            #to int))
            if df.shape[0]>0:
                df['id'] = df['id'].map(lambda x: int(x))
                #drop duplicates due to dates that must overlap when re-query data for the second time
                #we keep trace of each image evn if its already find for another syn, in this way we would directly know
                #which image respond ti which species-word, and also which last-taken date correspond to which species
                df = df.drop_duplicates(subset=['id', 'species_word'], keep='first', inplace=False)
                #save metadata for each images of this species (note: might be empty if no images was collected)
                df.to_csv(os.path.join(folder_path_s,'flickr_df_'+species+'.csv'), index=False, sep=';')

            #print(df_old.shape,df_new.shape,df.shape) #to debug
            #if there was already data collected and the new one brought some more data (not tru now that we have several
            #names per species)
            #if (df_old.shape[0]>0) and (df.shape[0]>df_old.shape[0]):
            #    print('species %s needed two collected data'%species)
            #if no more data was bring last time
            if df.shape[0]==df_old.shape[0]: #wrong: df_new.shape[0]==0: indeed we can gather images that are already (-1d)
                del image_data
                break

#do_request: Status code 502 received
#('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
#tatus code 500 received : problem is with the website itself. 

We have 350 species left to query for


0it [00:00, ?it/s]

Cygnus olor





OSError: Initializing from file failed

## create one csv file with all metadata info from each species

In [None]:
df_all = pd.DataFrame()
li_df = []
for species in tqdm.tqdm(glob.glob(os.path.join(path_ml,'datasets','flickr_images','*'))):
    csv_f = glob.glob(os.path.join(species,'*.csv'))
    if len(csv_f)==1:
        df = pd.read_csv(csv_f[0], sep=';', index_col=False)
        df['species'] = species.split('/')[-1]
        li_df.append(df)
        del df
df_all = pd.concat(li_df, ignore_index=True)
df_all.head(3)

In [None]:
df_all['img_path'] = df_all.apply(lambda x: os.path.join(folder_path,x['species'], 
                                                         x['species']+'_'+str(x['id'])+".png"), axis=1)
print(df_all.shape)
#keeping only the images we truely have 
df_all = df_all[df_all['img_path'].isin(li_flickr_images)]
print(df_all.shape)
df_all.head(3)

In [None]:
#save metadata info (might not be of same size of number of collected images)
df_all.to_csv(os.path.join(path_data,'flickr_image_info.csv'),index=False,sep=';')