In [1]:
#basic package
import json
import tqdm
import os
import numpy
import glob
import pandas as pd
import sys
import shutil

#to set connection with Flickr API
from flickrapi import FlickrAPI

#image
from PIL import Image

#url open to get image
import urllib.request
from urllib.request import urlopen

#date
import datetime as dt
from datetime import datetime

#plot (for image verification)
import cv2
import matplotlib.cm as cm
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Project layout: every location is derived from the parent of the current working directory
ROOT_DIR = os.path.abspath("../")
# machine learning workspace, and the folder receiving the images collected from flickr
path_ml = os.path.join(ROOT_DIR, 'SNAKES')
folder_path = os.path.join(path_ml, *('datasets', 'flickr_images'))
# SNAPP images data
path_data = os.path.join(ROOT_DIR, 'snakes_data/SNAPP_images')

## API parameters

In [3]:
#Flickr API credentials are read from the environment so that they are never stored in the notebook
#(to obtain a key/secret pair follow the directions of: http://joequery.me/code/flickr-api-image-search-python/)
#NOTE: the key pair that used to be hardcoded in this cell has been exposed and should be revoked
FLICKR_PUBLIC = os.environ.get('FLICKR_PUBLIC', '')
FLICKR_SECRET = os.environ.get('FLICKR_SECRET', '')
if not (FLICKR_PUBLIC and FLICKR_SECRET):
    print('WARNING: set the FLICKR_PUBLIC and FLICKR_SECRET environment variables before querying the Flickr API')

In [4]:
#additional fields requested for each photo returned by a search
extras = ','.join(['description', 'geo', 'tags', 'url_c', 'owner_name', 'date_taken', 'license'])
#API client, configured with the 'parsed-json' response format
flickr = FlickrAPI(FLICKR_PUBLIC, FLICKR_SECRET, format='parsed-json')

In [None]:
#create the folder receiving the flickr images (exist_ok avoids the racy "check then create" sequence)
os.makedirs(folder_path, exist_ok=True)

## Download images from Flickr

In [6]:
#in case to erase all
#shutil.rmtree(os.path.join(path_ml,'datasets','flickr_images'))

In [7]:
#TODO:
#look into more details of the 'safe_search' parameter

In [8]:
#INFO:
#for more parameter options: https://www.flickr.com/services/api/flickr.photos.search.html
#tags (Optional): A comma-delimited list of tags. Photos with one or more (or all tags by changing tags_mode)of the 
#tags listed will be returned. You can exclude results that match a term by prepending it with a - character.
#http://joequery.me/code/flickr-api-image-search-python/
#license info: https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
#geolocation should not be used, since a picture might for example have been taken in a museum. We should instead add
#geolocation based on 'biology' knowledge

In [9]:
#produce the list of species to search for: one entry per <genus>/<species> entry found under snapp_images
#(sorted so that the processing order is the same on every run; basename instead of split('/') so that it also
#works with a path separator other than '/')
li_species = []
for genus in sorted(glob.glob(os.path.join(path_ml,'datasets','snapp_images','*'))):
    for species in sorted(glob.glob(os.path.join(genus,'*'))):
        li_species.append(os.path.basename(species))
print('We have in total %d species'%len(li_species))

We have in total 3700 species


In [10]:
#create one folder per species if needed (exist_ok avoids the racy "check then create" sequence)
for species in li_species:
    folder_path_s = os.path.join(folder_path,species)
    os.makedirs(folder_path_s, exist_ok=True)

In [11]:
#li_species.reverse()

In [None]:
#download images from flickr
#For each species, download all the photos with a taken date greater than or equal to the maximum one recorded in
#the species metadata file if it exists, otherwise start from the beginning ("0000-00-00 00:00:00").
#A query returns at most 500 photos, so we keep querying (restarting from the last taken date collected so far)
#until a query brings no new photo.
#If the flickr search fails (might be a connection error) the error is printed and re-raised to stop the cell: you
#simply need to rerun it, perhaps a few minutes later. Images that are already on disk are not downloaded again.

for species in tqdm.tqdm(li_species):

    folder_path_s = os.path.join(folder_path,species)
    meta_data_file = os.path.join(folder_path_s,'flickr_df_'+species+'.csv')

    #initialization (used as long as no metadata file exists for this species)
    t = "0000-00-00 00:00:00"
    df_old = pd.DataFrame()

    #if the new collection brings no new image then stop, otherwise continue from the last taken date
    while True:

        #open the existing metadata file if any, and use the max taken date to grab data from that point instead
        if os.path.isfile(meta_data_file):
            #date will be: Timestamp('2017-04-30 17:12:52')
            df_old = pd.read_csv(meta_data_file, parse_dates=['datetaken'], index_col=False, sep=';')
            #we take minus one day as min_taken_date is apparently working at day level; the duplicates this
            #produces are removed before saving. Also we convert to the format expected by the flickr query
            t = (df_old['datetaken'].max() - dt.timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")

        try:
            #sort by ascending taken date: as we resume from the max taken date collected so far, the photos must
            #come oldest first, otherwise (the default order is by descending posted date) everything beyond the
            #first 500 results could never be reached
            image_data = flickr.photos.search(text='\"'+species+'\"', content_type=1, media="photos",
                                              per_page=500, extras=extras, min_taken_date=t,
                                              sort='date-taken-asc')
        except Exception as e:
            print(e)
            print('Not able to DOWNLOAD flickr image for species %s'%species)
            #TODO: investigate how to avoid these exception
            #re-raise (instead of sys.exit()) so that the cell stops and the traceback stays visible
            raise

        #download each image with its url and save it
        for photo in image_data['photos']['photo']: #besides photos there is only a 'stat' key
            if 'url_c' not in photo:
                continue
            url = photo['url_c']
            img_file = os.path.join(folder_path_s,species+'_'+photo['id']+".png")
            #already collected by a previous query/run (queries overlap by one day): nothing to do
            if os.path.exists(img_file):
                continue
            saved = False
            try:
                #a failure to reach the url is not caught: like a failed search, it stops the cell
                with urlopen(url) as response:
                    img = Image.open(response)
                    try:
                        img.save(img_file)
                        saved = True
                    except Exception as e:
                        print(e)
                        print('Not able to SAVE flickr image for species %s and url %s'%(species,str(url)))
            finally:
                #never leave a truncated file behind (also when interrupted), as it would be considered as
                #already downloaded by the next run
                if not saved and os.path.exists(img_file):
                    os.remove(img_file)

        #create new metadata with all the images (old and new)
        df_new = pd.DataFrame(image_data['photos']['photo'])
        df = pd.concat([df_old, df_new], ignore_index=True)

        #remove duplicates and save (before: uniform the id type, as when we save and open the str get converted
        #to int). Nothing is written when no image was ever collected for this species
        if df.shape[0]>0:
            df['id'] = df['id'].astype(int)
            df = df.drop_duplicates(subset=['id'], keep='first')
            df.to_csv(meta_data_file, index=False, sep=';')

        #if there was already data collected and the new query brought some more data
        if (df_old.shape[0]>0) and (df.shape[0]>df_old.shape[0]):
            print('species %s needed two collected data'%species)
        #stop when the last query brought nothing new. Note that testing df_new.shape[0]==0 would be wrong: because
        #of the one day overlap a query can return images we already have
        if df.shape[0]==df_old.shape[0]:
            break

#do_request: Status code 502 received
#('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
#do_request: Status code 500 received

In [None]:
#species Ithycyphus miniatus needed two collected data
#species Ithycyphus oursi needed two collected data

## Check amount of flickr images we have

In [22]:
#list the image files we really have on disk: everything found in the folders under folder_path, except the csv
#metadata files
li_flickr_images = []
for species_folder in glob.glob(os.path.join(folder_path,'*')):
    li_flickr_images.extend(x for x in glob.glob(os.path.join(species_folder,'*')) if not x.endswith('.csv'))
print('We have %d images collected from Flickr'%len(li_flickr_images))

We have 35777 images collected from Flickr


## create one csv file with all metadata info from each species

In [None]:
#gather the metadata file of every species folder into one dataframe
li_df = []
for species_folder in tqdm.tqdm(glob.glob(os.path.join(folder_path,'*'))):
    csv_f = glob.glob(os.path.join(species_folder,'*.csv'))
    #entries that do not contain exactly one csv file are skipped
    if len(csv_f)==1:
        df = pd.read_csv(csv_f[0], sep=';', index_col=False)
        #the folder name is the species name (basename instead of split('/') to not depend on the path separator)
        df['species'] = os.path.basename(species_folder)
        li_df.append(df)
df_all = pd.concat(li_df, ignore_index=True)
df_all.head(3)

In [24]:
def _image_path(row):
    """Return the path of the image file described by one metadata row."""
    file_name = row['species']+'_'+str(row['id'])+".png"
    return os.path.join(folder_path, row['species'], file_name)

df_all['img_path'] = df_all.apply(_image_path, axis=1)
print(df_all.shape)
#drop the metadata rows whose image is not among the files found on disk
is_on_disk = df_all['img_path'].isin(li_flickr_images)
df_all = df_all[is_on_disk]
print(df_all.shape)
df_all.head(3)

(49977, 31)
(35777, 31)


Unnamed: 0,accuracy,context,datetaken,datetakengranularity,datetakenunknown,description,farm,geo_is_contact,geo_is_family,geo_is_friend,...,place_id,secret,server,species,tags,title,url_c,width_c,woeid,img_path
0,0,0,2018-10-04 03:30:32,0,1,{'_content': ''},2,,,,...,,a8bfa9d292,1922,Acanthophis praelongus,,Northern death adder from Cairns. Acanthophis ...,https://farm2.staticflickr.com/1922/4331268958...,800.0,,/mount/SDB/camille-secure/SNAKES/datasets/flic...
1,0,0,2018-10-04 03:30:43,0,1,{'_content': ''},2,,,,...,,5b032f28ec,1954,Acanthophis praelongus,,Northern death adder from Cairns. Acanthophis ...,https://farm2.staticflickr.com/1954/3125232722...,800.0,,/mount/SDB/camille-secure/SNAKES/datasets/flic...
2,0,0,2018-10-03 19:39:42,0,0,{'_content': 'Mt Molloy - Far North Queensland...,2,,,,...,,9893fd3e82,1925,Acanthophis praelongus,northern death adder acanthophis praelongus sh...,Northern Death Adder (Acanthophis praelongus),https://farm2.staticflickr.com/1925/4326216653...,800.0,,/mount/SDB/camille-secure/SNAKES/datasets/flic...


In [25]:
#write the merged metadata at the root of the flickr folder (its number of rows might differ from the number of
#collected images)
flickr_info_file = os.path.join(folder_path,'flickr_image_info.csv')
df_all.to_csv(flickr_info_file, sep=';', index=False)