In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil
import time
import pickle
import math

#to set connection with Flickr API
from flickrapi import FlickrAPI

#image
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen
import ssl

#date
import datetime as dt
from datetime import datetime

#plot (for image verification)
import cv2
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
ROOT_DIR = os.getcwd()

In [3]:
#INFO on flickr
#for more parameter options: https://www.flickr.com/services/api/flickr.photos.search.html
#tags (Optional): A comma-delimited list of tags. Photos with one or more (or all tags by changing tags_mode)of the 
#tags listed will be returned. You can exclude results that match a term by prepending it with a - character.
#http://joequery.me/code/flickr-api-image-search-python/
#lisence info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#geolocalisation should not be used, as for example picture might be taken from a museum. we should add geolocalisation 
#based on 'biology' knowledge

# Parameter to change

#### type of images

In [4]:
type_ = 'oiseau_du_rhone'

#### list of species and synonymes

In [5]:
#The dataframe must at least have 'Species' columns for the name of each species, and a 'li_synonyms_final' columns with list of
#their synonyms without including itself

In [6]:
df_species = pd.read_csv(os.path.join(ROOT_DIR, type_,'ListeOiseauxRhone27-05-2016_simpler.csv'), sep=';')
print(df_species.shape)
df_species.head(3)

(354, 2)


Unnamed: 0,Species,scientific_name
0,Cygne tuberculé,Cygnus olor
1,Cygne de Bewick,Cygnus columbianus
2,Cygne chanteur,Cygnus cygnus


In [7]:
#create a columns with their synonyms
df_species['li_synonyms_final'] = df_species['scientific_name'].map(lambda x: [x] if str(x)!='nan' else [])
#dont forget to do map(lambda x: eval(x)) if not saw as a list
df_species.head(3)

Unnamed: 0,Species,scientific_name,li_synonyms_final
0,Cygne tuberculé,Cygnus olor,[Cygnus olor]
1,Cygne de Bewick,Cygnus columbianus,[Cygnus columbianus]
2,Cygne chanteur,Cygnus cygnus,[Cygnus cygnus]


In [8]:
#verify the dataframe is in adequate format
if not 'Species' in df_species.columns:
    print('ERROR: you must have Species as a colomn name with the representative name for each species')
if not 'li_synonyms_final' in df_species.columns:
    print('ERROR: you must have li_synonyms_final as a colomn name with a list fo synonym for each species')

#### choose encoding

In [9]:
#depend mostly on language use
#for french
encoding_ = 'utf-8 '#'ISO-8859-1', 'utf-8'

#### API parameters

In [10]:
#to get these data fllow the direction of: http://joequery.me/code/flickr-api-image-search-python/
FLICKR_PUBLIC = 'dd0cb0ced4e83452f8d49cb3d534707d'
FLICKR_SECRET = '4c568d2002b5506e'

In [11]:
flickr = FlickrAPI(FLICKR_PUBLIC, FLICKR_SECRET, format='parsed-json')
extras = 'description,geo,tags,url_c,owner_name,date_taken,license'

# Create fodlers

In [12]:
#in case to erase all
#shutil.rmtree(os.path.join(path_data,type_))

In [13]:
path_data = os.path.join(ROOT_DIR,'datasets',type_)
#create a director if not existing for images
if not os.path.exists(path_data):
    os.makedirs(path_data)

In [14]:
#create a list of species
li_species = df_species['Species'].tolist()
li_species = [x for x in li_species if str(x) != 'nan']
#verify unicity of species name
if len(li_species)!=len(set(li_species)):
    #search for the duplicate species
    df_ = df_species['Species'].value_counts().reset_index()
    li_duplicate = df_[df_['Species']!=1]['Species'].tolist()
    print('ERROR: non unique species name (%d species in total)'%len(li_duplicate))
    print('the following first 10 species appear more than ones: %s'%' \-\ '.join(li_duplicate[0:10]))
    sys.exit()
print('There is %d species'%len(li_species))

There is 350 species


In [15]:
#create a director if not existing for images
if type_ not in [x.split('\\')[-1] for x in glob.glob(os.path.join(ROOT_DIR,'*'))]:
    print('ERROR: your type images doe snot exist as a fodler in the Flickr folder')
    sys.exit()
p = os.path.join(ROOT_DIR,'datasets',type_)
if not os.path.exists(p):
    os.makedirs(p)

In [16]:
#create one folder per species folder if not existing
for species in li_species:
    folder_path_s = os.path.join(path_data,species)
    if not os.path.exists(folder_path_s):
        os.makedirs(folder_path_s)

# Check amount of flickr images

In [26]:
#keeping meta data of only the images we truely have 
#look at the actual image we really have
li_flickr_images = []
for species in glob.glob(os.path.join(path_data,'*')):
    li_flickr_images.extend([x for x in glob.glob(os.path.join(species,'*')) if x.endswith('.png')])
len(li_flickr_images)
print('We have %d images collected from Flickr'%len(li_flickr_images))

We have 104452 images collected from Flickr


# Download image from flickr

In [18]:
#idea: collect all the image from the begining date, and until no more new image are outcome. In this way one can 
#rerun at anytime to grab only the new images

In [19]:
#choose starting date and we will take species that was not taken at this starting date
date = '17_01_2019' #in string otherwise might change if we run over two days
f = os.path.join(path_data,'li_species_done_'+date+'.pkl')
if len(glob.glob(f))>0:
    li_species_done = pickle.load(open(f, 'rb'))
else:
    li_species_done = []
print('%d percent of the species were already requested until date %s'%(len(li_species_done)/len(li_species)*100,
                                                                        date))

0 percent of the species were already requested until date 17_01_2019


In [20]:
f = os.path.join(path_data,'li_species_done_'+date+'.pkl')

In [21]:
#to overcome: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)>, form urlopen(url)
gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)

In [22]:
#download images form flickr
#Go in each species folder, and downlaod all the photos with a taken date greater than or equal to the maximum one 
#recorded in the species-metadata file if it exist, otherwise download it from the begining ("0000-00-00 00:00:00")
#While downloading an image, if there is an error from flickr stop the code (might be connection error). Then you 
#simply need to rerun it perhaps few minutes later
li_species_to_do = [x for x in li_species if x not in li_species_done]
print('We have %d species left to query for'%len(li_species_to_do))

for nbr, species in tqdm.tqdm(enumerate(li_species_to_do)):
    
    #save all previous species as done until that specific date
    li_species_done = li_species[0:nbr]
    pickle.dump(li_species_done, open(os.path.join(path_data,'li_species_done_'+date+'.pkl'), 'wb'))
    
    #list of synonyms for the species
    li_syn = df_species[df_species['Species']==species]['li_synonyms_final'].values[0] + [species]
    
    #initialize folder path for this species
    folder_path_s = os.path.join(path_data, species)
    
    #iterate through each species synonyms
    for species_word in li_syn:
        print(species_word)
        
        #initialization
        t = "0000-00-00 00:00:00"
        df_old = pd.DataFrame()
    
        #if the new collection of images is empty, then stop it, otherwise continue with the last taken date
        while True:

            #open the existing metadata file if any, and use the max taken date to grab data from that point instead
            old_meta_data_file = os.path.join(folder_path_s,'flickr_df_'+species+'.csv')
            if len(glob.glob(old_meta_data_file))>0:
                #date will be: Timestamp('2017-04-30 17:12:52')
                df_old = pd.read_csv(old_meta_data_file, parse_dates=['datetaken'], index_col=False, sep=';', 
                                     engine='python', encoding=encoding_) #engine='python':avoid OSError: Initializing from file failed
                #we need to take minus one day as min_taken_date is apparently working at day level, and before saving
                #we'll need to remove possibly duplicates (might happen if several picture taken the same day but we stop
                #at a "middle picture of the day"). Also we convert to good format for flickr query
                df_ = df_old[df_old['species_word']==species_word].copy()
                if df_.shape[0]>0:
                    t = (max(df_['datetaken'].tolist()) - dt.timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
                    #print('we will use as starting date %s'%str(t))
            
            #take at most 10 times if their is a connection error connection error )
            k = 0
            while k<10:
                try:
                    image_data = flickr.photos.search(text='\"'+species+'\"', content_type=1, media="photos", 
                                                      per_page=500, extras=extras, min_taken_date=t)
                    k = 10
                except KeyboardInterrupt:
                    raise
                except ConnectionError as e:
                    k = k+1
                    print('DOWNLOAD ISSUE for species %s, due to error: %s, lets SLEEP'%(species,e))
                    # sleep for 5 seconds
                    time.sleep(5)
                    
                except Exception as e:
                    print('DOWNLOAD ISSUE for species %s, due to error: %s, lets STOP'%(species, e))
                    image_data = None
                    k = 10
                    
            if image_data==None:
                print('image is none get out of loop')
                break
                
            #download image with its url and save it if its a new one
            for i, photo in enumerate(image_data['photos']['photo']): #besides photos there is only a 'stat' key
                if 'url_c' in photo:
                    url = photo['url_c']
                    try:
                        if len(glob.glob(os.path.join(folder_path_s,species+'_'+photo['id']+".png")))==0:
                            img = Image.open(urlopen(url, context=gcontext))
                            img.save(os.path.join(folder_path_s,species+'_'+photo['id']+".png"))
                            del img
                    except KeyboardInterrupt:
                        raise
                    except Exception as e:
                        print(e)
                        print('SAVE ISSUE for species %s and url %s'%(species,str(url)))

            #create new metadata file with all the images (old and new)
            df_new = pd.DataFrame(image_data['photos']['photo'])
            df_new['species_word'] = species_word
            #deal with encoding (try at least)
            if df_new.shape[0]>0:
                df_new['title'] = df_new['title'].map(lambda x: x.encode(encoding_))
                df_new['tags'] = df_new['tags'].map(lambda x: x.encode(encoding_))
                df_new['description'] = df_new['description'].map(lambda x: x['_content'].encode(encoding_))
            df = pd.concat([df_old, df_new], ignore_index=True)

            #save and remove duplicates (first, uniform the id type(as when we save and open the str get converted 
            #to int))
            if df.shape[0]>0:
                df['id'] = df['id'].map(lambda x: int(x))
                #drop duplicates due to dates that must overlap when re-query data for the second time
                #we keep trace of each image evn if its already find for another syn, in this way we would directly know
                #which image respond ti which species-word, and also which last-taken date correspond to which species
                df = df.drop_duplicates(subset=['id'], keep='first', inplace=False)
                #remove all info for rows that does not correspond to one of our saved images (but keep date for tracking)
                li_saved_img_id = [x.split('\\')[-1].split('_')[-1].split('.')[0] for x in glob.glob(os.path.join(path_data,
                                                                                                             species,
                                                                                                             '*.png'))]
                #li_col_nan = [x for x in list(df_new.columns) if x not in ['datetaken', 'id', 'species_word', 'place_id']]
                #df.loc[df['id'].isin(li_saved_img_id), li_col_nan] = None
                #directly remove all rows that does not match a saved image
                df = df[df['id'].isin(li_saved_img_id)]
                #save metadata for each images of this species (note: might be empty if no images was collected)
                df.to_csv(os.path.join(folder_path_s,'flickr_df_'+species+'.csv'), index=False, sep=';',encoding=encoding_)
            
            #print(df_old.shape,df_new.shape,df.shape) #to debug
            #if there was already data collected and the new one brought some more data (not tru now that we have several
            #names per species)
            #if (df_old.shape[0]>0) and (df.shape[0]>df_old.shape[0]):
            #    print('species %s needed two collected data'%species)
            #if no more data was bring last time pass to next species_word
            if df.shape[0]==df_old.shape[0]: #wrong: df_new.shape[0]==0: indeed we can gather images that are already (-1d)
                del image_data
                break

#do_request: Status code 502 received
#('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
#status code 500 received : problem is with the website itself. 

We have 350 species left to query for


0it [00:00, ?it/s]

Cygnus olor
Cygne tuberculé


1it [00:08,  8.52s/it]

Cygnus columbianus
Cygne de Bewick


2it [00:11,  6.80s/it]

Cygnus cygnus
Cygne chanteur


3it [00:21,  7.69s/it]

Anser fabalis
Oie des moissons


4it [00:22,  5.78s/it]

Anser brachyrhynchus
Oie à bec court


5it [00:24,  4.71s/it]

Anser albifrons
Oie rieuse


6it [00:30,  4.98s/it]

Anser anser
Oie cendrée


7it [00:43,  7.33s/it]

Branta leucopsis
Bernache nonnette


8it [00:48,  6.75s/it]

Branta bernicla
Bernache cravant


9it [01:09, 10.93s/it]

Branta canadensis
Bernache du Canada


10it [01:30, 14.12s/it]

Alopochen aegyptiaca
Ouette d'Egypte


11it [01:55, 17.28s/it]

Tadorna ferruginea
Tadorne casarca


12it [02:09, 16.20s/it]

Tadorna tadorna
Tadorne de Belon


13it [02:13, 12.55s/it]

Anas penelope
Canard siffleur


14it [02:19, 10.78s/it]

Anas strepera
Canard chipeau


15it [02:34, 12.00s/it]

Anas crecca
Sarcelle d'hiver


16it [02:49, 12.89s/it]

Anas platyrhynchos
Canard colvert


17it [03:19, 17.98s/it]

Anas acuta
Canard pilet


18it [03:44, 20.08s/it]

Anas querquedula
Sarcelle d'été


19it [03:48, 15.39s/it]

Anas clypeata
Canard souchet


20it [03:51, 11.73s/it]

Netta rufina
Nette rousse


21it [03:53,  8.72s/it]

Aythya ferina
Fuligule milouin


22it [04:06,  9.96s/it]

Aythya collaris
Fuligule à bec cerclé


23it [04:10,  8.05s/it]

Aythya nyroca
Fuligule nyroca


24it [04:14,  6.91s/it]

Aythya fuligula
Fuligule morillon


25it [04:16,  5.56s/it]

Aythya marila
Fuligule milouinan


26it [04:25,  6.53s/it]

Somateria mollissima
Eider à duvet


27it [04:41,  9.37s/it]

Oxyura jamaicensis
Érismature rousse


28it [04:56, 11.17s/it]

Oxyura leucocephala
Érismature à tête blanche


29it [05:04,  9.94s/it]

Clangula hyemalis
Harelde boréale


30it [05:12,  9.51s/it]

Melanitta nigra
Macreuse noire


31it [05:16,  7.92s/it]

Melanitta fusca
Macreuse brune


32it [05:17,  5.89s/it]

Bucephala clangula
Garrot à oeil d’or


33it [05:20,  4.79s/it]

Mergellus albellus
Harle piette


34it [05:22,  4.11s/it]

Mergus serrator
Harle huppé


35it [05:28,  4.73s/it]

Mergus merganser
Harle bièvre


36it [05:41,  7.18s/it]

Alectoris rufa
Perdrix rouge


37it [05:58, 10.06s/it]

Perdix perdix
Perdrix grise


38it [06:09, 10.43s/it]

Coturnix coturnix
Caille des blés


39it [06:12,  8.04s/it]

Phasianus colchicus
Faisan de Colchide


40it [06:33, 11.96s/it]

Syrmaticus reevesii
Faisan vénéré


41it [06:34,  8.68s/it]

Gavia stellata
Plongeon catmarin


42it [06:41,  8.11s/it]

Gavia arctica
Plongeon arctique


43it [06:44,  6.81s/it]

Gavia immer
Plongeon imbrin


44it [06:54,  7.72s/it]

Tachybaptus ruficollis
Grèbe castagneux


45it [06:59,  6.70s/it]

Podiceps cristatus
Grèbe huppé


46it [07:08,  7.64s/it]

Podiceps grisegena
Grèbe jougris


47it [07:14,  6.94s/it]

Podiceps auritus
Grèbe esclavon


48it [07:17,  5.72s/it]

Podiceps nigricollis
Grèbe à cou noir


49it [07:39, 10.62s/it]

Calonectris diomedea
Puffin cendré


50it [07:45,  9.19s/it]

Fulmarus glacialis
Fulmar boréal


51it [07:58, 10.60s/it]

Hydrobates pelagicus
Océanite tempête


52it [08:00,  7.82s/it]

Phalacrocorax carbo
Grand Cormoran


53it [08:30, 14.50s/it]

Microcarbo pygmeus
Cormoran pygmée


54it [08:31, 10.64s/it]

Botaurus stellaris
Butor étoilé


55it [08:45, 11.43s/it]

Ixobrychus minutus
Blongios nain


56it [09:00, 12.51s/it]

Nycticorax nycticorax
Bihoreau gris


57it [17:32, 162.40s/it]

Ardeola ralloides
Crabier chevelu


58it [17:42, 116.64s/it]

Bubulcus ibis
Héron garde-boeufs


59it [18:01, 87.39s/it] 

Egretta garzetta
Aigrette garzette


60it [18:29, 69.65s/it]

Ardea alba
Grande Aigrette


61it [26:27, 192.23s/it]

Ardea cinerea
Héron cendré


62it [33:42, 265.05s/it]

Ardea purpurea
Héron pourpré


63it [33:44, 186.12s/it]

Ciconia nigra
Cigogne noire


64it [33:47, 131.22s/it]

Ciconia ciconia
Cigogne blanche


65it [33:50, 92.71s/it] 

Plegadis falcinellus
Ibis falcinelle


66it [34:04, 69.06s/it]

Geronticus eremita
Ibis chauve


67it [34:09, 49.78s/it]

Platalea leucorodia
Spatule blanche


68it [34:10, 35.23s/it]

Phoenicopterus roseus
Flamant rose


69it [34:50, 36.66s/it]

Pernis apivorus
Bondrée apivore


70it [35:05, 30.11s/it]

Elanus caeruleus
Élanion blanc


71it [35:17, 24.67s/it]

Milvus migrans
Milan noir


72it [35:20, 18.01s/it]

Milvus milvus
Milan royal


73it [35:23, 13.61s/it]

Gypaetus barbatus
Gypaète barbu


74it [35:38, 14.00s/it]

Neophron percnopterus
Vautour percnoptère


75it [35:39, 10.23s/it]

Gyps fulvus
Vautour fauve


76it [35:52, 11.07s/it]

Aegypius monachus
Vautour moine


77it [35:59,  9.64s/it]

Circaetus gallicus
Circaète Jean-le-Blanc


78it [36:12, 10.84s/it]

Circus aeruginosus
Busard des roseaux


79it [36:24, 11.24s/it]

Circus cyaneus
Busard Saint-Martin


80it [36:46, 14.23s/it]

Circus pygargus
Busard cendré


81it [36:54, 12.35s/it]

Accipiter gentilis
Autour des palombes


82it [36:59, 10.14s/it]

Accipiter nisus
Épervier d'Europe


83it [37:17, 12.52s/it]

Buteo buteo
Buse variable


84it [37:18,  9.34s/it]

Buteo rufinus
Buse féroce


85it [37:21,  7.23s/it]

Haliaeetus albicilla
Pygargue à queue blanche


86it [37:29,  7.51s/it]

Hieraaetus pennatus
Aigle botté


87it [37:30,  5.71s/it]

Aquila chrysaetos
Aigle royal


88it [37:46,  8.77s/it]

Aquila clanga
Aigle criard


89it [37:48,  6.59s/it]

Pandion haliaetus
Balbuzard pêcheur


90it [38:06,  9.92s/it]

Falco tinnunculus
Faucon crécerelle


91it [38:19, 11.07s/it]

Falco vespertinus
Faucon kobez


92it [38:28, 10.48s/it]

Falco columbarius
Faucon émerillon


93it [38:46, 12.47s/it]

Falco subbuteo
Faucon hobereau


94it [38:48,  9.61s/it]

Falco peregrinus
Faucon pèlerin


95it [38:51,  7.55s/it]

Rallus aquaticus
Râle d'eau


96it [38:53,  5.81s/it]

Porzana porzana
Marouette ponctuée


97it [38:56,  5.03s/it]

Porzana parva
Marouette poussin


98it [39:00,  4.57s/it]

Porzana pusilla
Marouette de Baillon


99it [39:01,  3.50s/it]

Crex crex
Râle des genêts


100it [39:02,  2.75s/it]

Gallinula chloropus
Gallinule poule-d'eau


101it [39:15,  5.79s/it]

Fulica atra
Foulque macroule


102it [39:20,  5.56s/it]

Porphyrio porphyrio
Talève sultane


103it [39:32,  7.67s/it]

Grus grus
Grue cendrée


104it [39:36,  6.54s/it]

Tetrax tetrax
Outarde canepetière


105it [39:43,  6.56s/it]

Chlamydotis macqueenii
Outarde de Macqueen


106it [39:46,  5.47s/it]

Otis tarda
Outarde barbue


107it [39:50,  4.99s/it]

Haematopus ostralegus
Huîtrier pie


108it [39:51,  4.09s/it]

Himantopus himantopus
Échasse blanche


109it [39:53,  3.34s/it]

Recurvirostra avosetta
Avocette élégante


110it [39:58,  3.90s/it]

Burhinus oedicnemus
Oedicnème criard


111it [40:03,  4.04s/it]

Glareola pratincola
Glaréole à collier


112it [40:08,  4.50s/it]

Charadrius dubius
Petit Gravelot


113it [40:11,  4.01s/it]

Charadrius hiaticula
Grand Gravelot


114it [40:17,  4.72s/it]

Charadrius alexandrinus
Gravelot à collier interrompu


115it [40:22,  4.58s/it]

Charadrius morinellus
Pluvier guignard


116it [40:26,  4.39s/it]

Pluvialis apricaria
Pluvier doré


117it [40:31,  4.59s/it]

Pluvialis squatarola
Pluvier argenté


118it [40:33,  3.78s/it]

Vanellus vanellus
Vanneau huppé


119it [40:34,  3.15s/it]

Calidris canutus
Bécasseau maubèche


120it [40:39,  3.50s/it]

Calidris alba
Bécasseau sanderling


121it [40:40,  2.97s/it]

Calidris minuta
Bécasseau minute


122it [40:42,  2.56s/it]

Calidris temminckii
Bécasseau de Temminck


123it [40:43,  2.25s/it]

Calidris ferruginea
Bécasseau cocorli


124it [40:45,  1.94s/it]

Calidris alpina
Bécasseau variable


125it [40:48,  2.30s/it]

Calidris melanotos
Bécasseau tacheté


126it [40:49,  2.06s/it]

Philomachus pugnax
Combattant varié


127it [40:53,  2.39s/it]

Lymnocryptes minimus
Bécassine sourde


128it [40:55,  2.43s/it]

Gallinago gallinago
Bécassine des marais


129it [40:57,  2.14s/it]

Gallinago media
Bécassine double


130it [40:57,  1.79s/it]

Scolopax rusticola
Bécasse des bois


131it [40:58,  1.56s/it]

Limosa limosa
Barge à queue noire


132it [41:13,  5.34s/it]

Limosa lapponica
Barge rousse


133it [41:18,  5.41s/it]

Numenius phaeopus
Courlis corlieu


134it [41:33,  8.10s/it]

Numenius arquata
Courlis cendré


135it [41:40,  7.98s/it]

Tringa erythropus
Chevalier arlequin


136it [41:48,  7.94s/it]

Tringa totanus
Chevalier gambette


137it [42:05, 10.52s/it]

Tringa nebularia
Chevalier aboyeur


138it [42:06,  7.85s/it]

Tringa stagnatilis
Chevalier stagnatile


139it [42:09,  6.29s/it]

Tringa ochropus
Chevalier culblanc


140it [42:28, 10.25s/it]

Tringa glareola
Chevalier sylvain


141it [42:30,  7.77s/it]

Actitis hypoleucos
Chevalier guignette


142it [42:42,  8.79s/it]

Arenaria interpres
Tournepierre à collier


143it [43:06, 13.51s/it]

Phalaropus fulicarius
Phalarope à bec large


144it [43:11, 11.06s/it]

Stercorarius pomarinus
Labbe pomarin


145it [43:13,  8.18s/it]

Stercorarius skua
Grand Labbe


146it [43:21,  8.17s/it]

Stercorarius longicaudus
Labbe à longue queue


147it [43:23,  6.17s/it]

Ichthyaetus melanocephalus
Mouette mélanocéphale


148it [43:30,  6.55s/it]

Leucophaeus atricilla
Mouette atricille


149it [43:41,  7.82s/it]

Leucophaeus pipixcan
Mouette de Franklin


150it [43:43,  6.01s/it]

Hydrocoloeus minutus
Mouette pygmée


151it [43:50,  6.34s/it]

Chroicocephalus ridibundus
Mouette rieuse


152it [43:52,  5.10s/it]

Chroicocephalus genei
Goéland railleur


153it [43:54,  4.32s/it]

Ichthyaetus audouinii
Goéland d'Audouin


154it [43:57,  3.67s/it]

Larus canus
Goéland cendré


155it [44:01,  3.78s/it]

Larus michahellis
Goéland leucophée


156it [44:06,  4.41s/it]

Larus cachinnans
Goéland pontique


157it [44:09,  3.87s/it]

Larus fuscus
Goéland brun


158it [44:43, 12.89s/it]

Larus argentatus
Goéland argenté


159it [44:59, 13.69s/it]

Rissa tridactyla
Mouette tridactyle


160it [45:18, 15.36s/it]

Sternula albifrons
Sterne naine


161it [45:24, 12.52s/it]

Gelochelidon nilotica
Sterne hansel


162it [45:28, 10.04s/it]

Hydroprogne caspia
Sterne caspienne


163it [45:40, 10.73s/it]

Chlidonias hybrida
Guifette moustac


164it [45:51, 10.84s/it]

Chlidonias niger
Guifette noire


165it [45:56,  8.84s/it]

Chlidonias leucopterus
Guifette leucoptère


166it [45:57,  6.53s/it]

Thalasseus sandvicensis
Sterne caugek


167it [46:17, 10.64s/it]

Thalasseus bengalensis
Sterne voyageuse


168it [46:18,  7.78s/it]

Sterna hirundo
Sterne pierregarin


169it [46:19,  5.86s/it]

Sterna paradisaea
Sterne arctique


170it [46:21,  4.60s/it]

Pterocles alchata
Ganga cata


171it [46:22,  3.59s/it]

Columba livia domesticus
Pigeon biset domestique


172it [46:25,  3.20s/it]

Columba oenas
Pigeon colombin


173it [46:29,  3.50s/it]

Columba palumbus
Pigeon ramier


174it [46:41,  6.24s/it]

Streptopelia decaocto
Tourterelle turque


175it [46:52,  7.51s/it]

Streptopelia turtur
Tourterelle des bois


176it [46:57,  6.65s/it]

Psittacula krameri
Perruche à collier


177it [46:58,  5.08s/it]

Cuculus canorus
Coucou gris


178it [47:02,  4.80s/it]

Clamator glandarius
Coucou geai


179it [47:05,  4.20s/it]

Tyto alba
Effraie des clochers


180it [47:14,  5.61s/it]

Otus scops
Petit-duc scops


181it [47:16,  4.63s/it]

Bubo bubo
Grand-duc d'Europe


182it [47:22,  4.92s/it]

Athene noctua
Chevêche d'Athéna


183it [47:41,  9.20s/it]

Strix aluco
Chouette hulotte


184it [47:54, 10.25s/it]

Asio otus
Hibou moyen-duc


185it [48:12, 12.72s/it]

Asio flammeus
Hibou des marais


186it [48:29, 14.04s/it]

Aegolius funereus
Chouette de Tengmalm


187it [48:31, 10.32s/it]

Caprimulgus europaeus
Engoulevent d'Europe


188it [48:35,  8.40s/it]

Tachymarptis melba
Martinet à ventre blanc


189it [48:38,  6.87s/it]

Apus apus
Martinet noir


190it [48:49,  8.13s/it]

Apus pallidus
Martinet pâle


191it [48:50,  6.09s/it]

Alcedo atthis
Martin-pêcheur d'Europe


192it [48:52,  4.76s/it]

Merops apiaster
Guêpier d'Europe


193it [49:33, 15.74s/it]

Coracias garrulus
Rollier d'Europe


194it [49:38, 12.40s/it]

Upupa epops
Huppe fasciée


195it [49:51, 12.57s/it]

Jynx torquilla
Torcol fourmilier


196it [49:58, 10.95s/it]

Picus canus
Pic cendré


197it [50:00,  8.05s/it]

Picus viridis
Pic vert


198it [50:19, 11.63s/it]

Dryocopus martius
Pic noir


199it [50:32, 11.97s/it]

Dendrocopos major
Pic épeiche


200it [50:47, 12.80s/it]

Dendrocopos medius
Pic mar


201it [50:49,  9.42s/it]

Dendrocopos minor
Pic épeichette


202it [50:51,  7.33s/it]

Calandrella brachydactyla
Alouette calandrelle


203it [50:53,  5.61s/it]

Galerida cristata
Cochevis huppé


204it [50:57,  5.10s/it]

Lullula arborea
Alouette lulu


205it [50:59,  4.18s/it]

Alauda arvensis
Alouette des champs


206it [51:01,  3.63s/it]

Riparia riparia
Hirondelle de rivage


207it [51:14,  6.41s/it]

Ptyonoprogne rupestris
Hirondelle de rochers


208it [51:19,  6.04s/it]

Hirundo rustica
Hirondelle rustique


209it [51:36,  9.34s/it]

Delichon urbicum
Hirondelle de fenêtre


210it [51:47,  9.92s/it]

Anthus campestris
Pipit rousseline


211it [51:52,  8.39s/it]

Anthus trivialis
Pipit des arbres


212it [51:59,  7.90s/it]

Anthus pratensis
Pipit farlouse


213it [52:14, 10.12s/it]

Anthus cervinus
Pipit à gorge rousse


214it [52:15,  7.47s/it]

Anthus spinoletta
Pipit spioncelle


215it [52:19,  6.42s/it]

Anthus richardi
Pipit de Richard


216it [52:20,  4.81s/it]

Motacilla flava
Bergeronnette printanière


217it [52:30,  6.35s/it]

Motacilla cinerea
Bergeronnette des ruisseaux


218it [52:32,  4.88s/it]

Motacilla alba
Bergeronnette grise


219it [52:48,  8.16s/it]

Bombycilla garrulus
Jaseur boréal


220it [53:07, 11.53s/it]

Cinclus cinclus
Cincle plongeur


221it [53:10,  9.02s/it]

Troglodytes troglodytes
Troglodyte mignon


222it [53:24, 10.44s/it]

Prunella modularis
Accenteur mouchet


223it [53:39, 11.69s/it]

Prunella collaris
Accenteur alpin


224it [53:55, 13.17s/it]

Erithacus rubecula
Rougegorge familier


225it [54:44, 23.73s/it]

Luscinia megarhynchos
Rossignol philomèle


226it [54:55, 20.05s/it]

Luscinia svecica
Gorgebleue à miroir


227it [54:58, 14.78s/it]

Phoenicurus ochruros
Rougequeue noir


228it [55:20, 17.06s/it]

Phoenicurus phoenicurus
Rougequeue à front blanc


229it [55:22, 12.47s/it]

Saxicola rubetra
Tarier des prés


230it [55:25,  9.82s/it]

Saxicola rubicola
Tarier pâtre


231it [55:45, 12.84s/it]

Oenanthe oenanthe
Traquet motteux


232it [56:04, 14.57s/it]

Oenanthe hispanica
Traquet oreillard


233it [56:08, 11.54s/it]

Monticola saxatilis
Monticole de roche


234it [56:10,  8.72s/it]

Turdus torquatus
Merle à plastron


235it [56:15,  7.51s/it]

Turdus merula
Merle noir


236it [56:34, 11.06s/it]

Turdus pilaris
Grive litorne


237it [56:37,  8.44s/it]

Turdus philomelos
Grive musicienne


238it [57:01, 13.12s/it]

Turdus iliacus
Grive mauvis


239it [57:04, 10.22s/it]

Turdus viscivorus
Grive draine


240it [57:12,  9.64s/it]

Cettia cetti
Bouscarle de Cetti


241it [57:18,  8.31s/it]

Cisticola juncidis
Cisticole des joncs


242it [57:22,  7.24s/it]

Locustella naevia
Locustelle tachetée


243it [57:24,  5.39s/it]

Locustella luscinioides
Locustelle luscinioïde


244it [57:25,  4.12s/it]

Acrocephalus melanopogon
Lusciniole à moustaches


245it [57:26,  3.23s/it]

Acrocephalus schoenobaenus
Phragmite des joncs


246it [57:36,  5.22s/it]

Acrocephalus palustris
Rousserolle verderolle


247it [57:39,  4.72s/it]

Acrocephalus scirpaceus
Rousserolle effarvatte


248it [57:57,  8.76s/it]

Acrocephalus arundinaceus
Rousserolle turdoïde


249it [58:03,  7.87s/it]

Hippolais icterina
Hypolaïs ictérine


250it [58:04,  5.87s/it]

Hippolais polyglotta
Hypolaïs polyglotte


251it [58:13,  6.76s/it]

Sylvia atricapilla
Fauvette à tête noire


252it [58:15,  5.35s/it]

Sylvia borin
Fauvette des jardins


253it [58:17,  4.18s/it]

Sylvia curruca
Fauvette babillarde


254it [58:18,  3.38s/it]

Sylvia hortensis
Fauvette orphée


255it [58:19,  2.67s/it]

Sylvia communis
Fauvette grisette


256it [58:40,  8.16s/it]

Sylvia undata
Fauvette pitchou


257it [58:45,  6.99s/it]

Sylvia cantillans
Fauvette passerinette


258it [58:46,  5.31s/it]

Sylvia melanocephala
Fauvette mélanocéphale


259it [58:51,  5.24s/it]

Phylloscopus inornatus
Pouillot à grands sourcils


260it [58:54,  4.49s/it]

Phylloscopus bonelli
Pouillot de Bonelli


261it [58:57,  4.27s/it]

Phylloscopus sibilatrix
Pouillot siffleur


262it [59:04,  4.82s/it]

Phylloscopus collybita
Pouillot véloce


263it [59:20,  8.27s/it]

Phylloscopus trochilus
Pouillot fitis


264it [59:30,  8.73s/it]

Phylloscopus trochiloides
Pouillot verdâtre


265it [59:31,  6.40s/it]

Regulus regulus
Roitelet huppé


266it [59:47,  9.53s/it]

Regulus ignicapilla
Roitelet à triple bandeau


267it [59:56,  9.14s/it]

Muscicapa striata
Gobemouche gris


268it [59:57,  6.86s/it]

Ficedula parva
Gobemouche nain


269it [59:58,  5.13s/it]

Ficedula albicollis
Gobemouche à collier


270it [59:59,  3.93s/it]

Ficedula hypoleuca
Gobemouche noir


271it [1:00:09,  5.52s/it]

Panurus biarmicus
Panure à moustaches


272it [1:00:10,  4.33s/it]

Aegithalos caudatus
Mésange à longue queue


273it [1:00:21,  6.27s/it]

Poecile palustris
Mésange nonnette


274it [1:00:27,  6.29s/it]

Poecile montanus
Mésange boréale


275it [1:00:29,  4.87s/it]

Lophophanes cristatus
Mésange huppée


276it [1:00:40,  6.71s/it]

Periparus ater
Mésange noire


277it [1:00:57,  9.90s/it]

Cyanistes caeruleus
Mésange bleue


278it [1:01:14, 11.83s/it]

Parus major
Mésange charbonnière


279it [1:01:16,  8.98s/it]

Sitta europaea
Sittelle torchepot


280it [1:01:35, 11.97s/it]

Tichodroma muraria
Tichodrome échelette


281it [1:01:41, 10.34s/it]

Certhia familiaris
Grimpereau des bois


282it [1:01:43,  7.70s/it]

Certhia brachydactyla
Grimpereau des jardins


283it [1:01:56,  9.24s/it]

Remiz pendulinus
Rémiz penduline


284it [1:02:11, 11.05s/it]

Oriolus oriolus
Loriot d'Europe


285it [1:02:17,  9.57s/it]

Lanius collurio
Pie-grièche écorcheur


286it [1:02:19,  7.29s/it]

Lanius minor
Pie-grièche à poitrine rose


287it [1:02:20,  5.47s/it]

Lanius excubitor
Pie-grièche grise


288it [1:02:27,  5.73s/it]

Lanius senator
Pie-grièche à tête rousse


289it [1:02:35,  6.36s/it]

Garrulus glandarius
Geai des chênes


290it [1:02:37,  5.04s/it]

Pica pica
Pie bavarde


291it [1:02:49,  7.36s/it]

Nucifraga caryocatactes
Cassenoix moucheté


292it [1:02:51,  5.74s/it]

Coloeus monedula
Choucas des tours


293it [1:02:56,  5.55s/it]

Corvus frugilegus
Corbeau freux


294it [1:03:19, 10.74s/it]

Corvus corone
Corneille noire


295it [1:03:30, 10.85s/it]

Corvus corone x cornix
Corneille noire x mantelée hybride


296it [1:03:31,  7.83s/it]

Corvus corax
Grand Corbeau


297it [1:10:15, 126.67s/it]

Sturnus vulgaris
Étourneau sansonnet


298it [1:17:18, 215.69s/it]

Pastor roseus
Étourneau roselin


299it [1:18:00, 163.50s/it]

Passer domesticus
Moineau domestique


300it [1:24:39, 234.11s/it]

Passer montanus
Moineau friquet


301it [1:30:41, 272.33s/it]

Petronia petronia
Moineau soulcie


302it [1:32:51, 229.88s/it]

Fringilla coelebs
Pinson des arbres


303it [1:39:29, 280.34s/it]

Fringilla montifringilla
Pinson du Nord


304it [1:46:37, 324.61s/it]

Serinus serinus
Serin cini


305it [1:54:17, 365.09s/it]

Carduelis citrinella
Venturon montagnard


306it [1:55:54, 284.78s/it]

Carduelis chloris
Verdier d'Europe


307it [2:03:57, 344.01s/it]

Carduelis carduelis
Chardonneret élégant


308it [2:11:28, 376.27s/it]

Carduelis spinus
Tarin des aulnes


309it [2:18:39, 392.66s/it]

Carduelis cannabina
Linotte mélodieuse


310it [2:27:17, 430.19s/it]

Carduelis flammea
Sizerin flammé


311it [2:35:53, 456.06s/it]

Loxia curvirostra
Bec-croisé des sapins


312it [2:38:11, 360.70s/it]

Pyrrhula pyrrhula
<urlopen error [WinError 10060] Une tentative de connexion a échoué car le parti connecté n’a pas répondu convenablement au-delà d’une certaine durée ou une connexion établie a échoué car l’hôte de connexion n’a pas répondu>
SAVE ISSUE for species Bouvreuil pivoine and url https://farm1.staticflickr.com/828/42053494852_a167544234_c.jpg
Bouvreuil pivoine


313it [2:48:28, 437.61s/it]

Coccothraustes coccothraustes
Grosbec casse-noyaux


314it [2:49:50, 330.78s/it]

Plectrophenax nivalis
Bruant des neiges


315it [2:54:57, 323.74s/it]

Emberiza citrinella
Bruant jaune


316it [3:05:34, 417.51s/it]

Emberiza melanocephala
Bruant mélanocéphale


317it [3:05:55, 298.76s/it]

Emberiza cirlus
Bruant zizi


318it [3:12:48, 333.08s/it]

Emberiza cia
Bruant fou


319it [3:15:07, 274.78s/it]

Emberiza hortulana
Bruant ortolan


320it [3:16:17, 213.40s/it]

Emberiza pusilla
Bruant nain


321it [3:16:37, 155.21s/it]

Emberiza schoeniclus
Bruant des roseaux


322it [3:21:51, 202.85s/it]

Emberiza calandra
Bruant proyer


323it [3:27:20, 240.69s/it]

Cygnus atratus
Cygne noir


324it [3:33:31, 279.71s/it]

Anser cygnoides
Oie cygnoïde


325it [3:34:59, 222.25s/it]

Aix galericulata
Canard mandarin


326it [3:40:33, 255.78s/it]

Aix sponsa
Canard carolin


327it [3:44:15, 245.64s/it]

Calonetta leucophrys
Calonette à collier noir


328it [3:44:16, 172.22s/it]

Lophodytes cucullatus
Harle couronné


329it [3:49:51, 221.09s/it]

Grus virgo
Grue demoiselle


330it [3:52:58, 210.84s/it]

Pelecanus crispus
Pélican frisé


331it [3:56:27, 210.54s/it]

Pavo cristatus
Paon bleu


332it [4:03:50, 280.04s/it]

Colinus virginianus
Colin de Virginie


333it [4:04:09, 201.70s/it]

Haliaeetus leucocephalus
Pygargue à tête blanche


334it [4:10:48, 261.16s/it]

Streptopelia roseogrisea
Tourterelle rose et gris


335it [4:10:49, 183.06s/it]

Nymphicus hollandicus
Calopsitte élégante


336it [4:11:12, 135.04s/it]

Melopsittacus undulatus
Perruche ondulée


337it [4:11:47, 105.07s/it]

Platycercus elegans
Perruche de Pennant


338it [4:12:06, 79.10s/it] 

Psittacus erithacus
Perroquet jaco ou gris du Gabon


339it [4:12:10, 56.47s/it]

Agapornis fischeri
Inséparable de Fischer


340it [4:12:43, 49.67s/it]

Agapornis roseicollis
Inséparable à face rose


341it [4:12:44, 35.05s/it]

Myiopsitta monachus
Conure veuve


342it [4:14:00, 47.25s/it]

Poicephalus senegalus
Perroquet youyou


343it [4:14:59, 50.87s/it]

Leiothrix lutea
Léiothrix jaune


344it [4:15:29, 44.38s/it]

Corvus splendens
Corbeau familier


345it [4:16:52, 55.97s/it]

Estrilda astrild
Astrild ondulé


346it [4:18:05, 61.22s/it]

Lonchura punctulata
Capucin damier


347it [4:19:02, 60.02s/it]

Lonchura malacca sinensis
Capucin à dos marron


348it [4:19:06, 43.15s/it]

Serinus mozambicus
Serin du Mozambique


349it [4:19:36, 39.06s/it]

Uragus sibiricus
Roselin à longue queue


350it [4:19:37, 27.66s/it]


#debug encoding in case
df_old = pd.read_csv('D:\\vm_exchange\\Flickr\\datasets\\oiseau_du_rhone\\Cygne tuberculé\\flickr_df_Cygne tuberculé.csv',
                     parse_dates=['datetaken'], index_col=False, sep=';', 
                                     engine='python',encoding='utf-8') #ISO-8859-1
print(df_old.shape)
df_old[['title','tags','description']].head(3)
#df_old['description'].iloc[0].decode('ISO-8859-1')

## create one csv file with all metadata info from each species

In [45]:
df_all = pd.DataFrame()
li_df = []
for species in tqdm.tqdm(glob.glob(os.path.join(ROOT_DIR,'datasets', type_,'*'))):
    csv_f = glob.glob(os.path.join(species,'*.csv'))
    if len(csv_f)==1:        
        df = pd.read_csv(csv_f[0], sep=';', index_col=False, parse_dates=['datetaken'],engine='python', encoding=encoding_)
        df['species'] = species.split('\\')[-1]
        li_df.append(df)
        del df
df_all = pd.concat(li_df, ignore_index=True)
df_all.head(3)

100%|████████████████████████████████████████████████████████████████████████████████| 352/352 [00:03<00:00, 91.24it/s]


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,place_id,secret,server,species,species_word,tags,title,url_c,width_c,woeid
0,0.0,0.0,2015-03-18 12:27:45,0.0,0.0,b'',8.0,,,,...,,00805be89c,7863.0,Accenteur alpin,Prunella collaris,b'',b'Accenteur alpin - Prunella collaris',https://farm8.staticflickr.com/7863/4672834784...,800.0,
1,0.0,0.0,2019-01-10 15:57:43,0.0,0.0,"b""L'Accenteur mouchet (Prunella modularis) est...",5.0,,,,...,,e628b7c232,4817.0,Accenteur alpin,Prunella collaris,b'accenteurmouchet oiseaux prunellamodularis',b'Accenteur mouchet',https://farm5.staticflickr.com/4817/3288650730...,800.0,
2,13.0,0.0,2018-12-28 10:19:29,0.0,0.0,"b""28 d\xc3\xa9cembre 2018, Troubat (65), il se...",8.0,0.0,0.0,0.0,...,J1y9McJXULKwsbw,689bf813db,7904.0,Accenteur alpin,Prunella collaris,b'accenteuralpin prunellacollaris alpineaccent...,b'Accenteur alpin',https://farm8.staticflickr.com/7904/4646115321...,800.0,628338.0


In [46]:
folder_path = os.path.join(ROOT_DIR,'datasets', type_)
df_all['img_path'] = df_all.apply(lambda x: os.path.join(folder_path,x['species'],x['species']+'_'+str(x['id'])+".png"), 
                                  axis=1)
print(df_all.shape)
df_all.head(3)

(104425, 32)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,secret,server,species,species_word,tags,title,url_c,width_c,woeid,img_path
0,0.0,0.0,2015-03-18 12:27:45,0.0,0.0,b'',8.0,,,,...,00805be89c,7863.0,Accenteur alpin,Prunella collaris,b'',b'Accenteur alpin - Prunella collaris',https://farm8.staticflickr.com/7863/4672834784...,800.0,,D:\vm_exchange\Flickr\datasets\oiseau_du_rhone...
1,0.0,0.0,2019-01-10 15:57:43,0.0,0.0,"b""L'Accenteur mouchet (Prunella modularis) est...",5.0,,,,...,e628b7c232,4817.0,Accenteur alpin,Prunella collaris,b'accenteurmouchet oiseaux prunellamodularis',b'Accenteur mouchet',https://farm5.staticflickr.com/4817/3288650730...,800.0,,D:\vm_exchange\Flickr\datasets\oiseau_du_rhone...
2,13.0,0.0,2018-12-28 10:19:29,0.0,0.0,"b""28 d\xc3\xa9cembre 2018, Troubat (65), il se...",8.0,0.0,0.0,0.0,...,689bf813db,7904.0,Accenteur alpin,Prunella collaris,b'accenteuralpin prunellacollaris alpineaccent...,b'Accenteur alpin',https://farm8.staticflickr.com/7904/4646115321...,800.0,628338.0,D:\vm_exchange\Flickr\datasets\oiseau_du_rhone...


In [47]:
#save metadata info (might not be of same size of number of collected images)
df_all.to_csv(os.path.join(ROOT_DIR,'datasets',type_,'flickr_image_info.csv'),index=False,sep=';')