### IMPORTS

In [1]:
import os
from time import time
import requests
import urllib
import numpy as np
import pandas as pd

### CONFIGURE

In [2]:
search_keyword = "apple"
num_of_images = 1000
# Flickr API constraint: max per page == 500, so one batch (one result page)
# is needed for every 500 requested images
num_of_batches = np.ceil(np.true_divide(num_of_images,500))

# create storage area for data and image files
# Each directory is checked and created on its own, so re-running with a larger
# num_of_images adds the missing batch folders instead of skipping them all,
# and genuine failures (e.g. permission errors) are no longer silently hidden.
keyword_dir_existed = os.path.isdir(search_keyword)
if not keyword_dir_existed:
    os.makedirs(search_keyword)
for i in range(1, int(num_of_batches) + 1):
    batch_dir = search_keyword + "/batch{}".format(i)
    if not os.path.isdir(batch_dir):
        os.makedirs(batch_dir)
if keyword_dir_existed:
    print("Storage area already exists for this keyword.")

### QUERY FLICKR PHOTO SEARCH API

In [3]:
def _fetch_image(url_stem, extensions=('jpg', 'gif', 'png')):
    """Download a Flickr image, trying each candidate file extension in turn.

    Parameters
    ----------
    url_stem : str
        Image URL up to and including the final '.', i.e. without extension.
    extensions : sequence of str
        File-format extensions to try, in order (the three suggested by Flickr).

    Returns
    -------
    tuple or None
        ``(full_url, content)`` for the first extension answered with
        HTTP 200, or ``None`` when none of the extensions succeeded.
    """
    for ext in extensions:
        image_response = requests.get(url_stem + ext)
        if image_response.status_code == 200:
            return url_stem + ext, image_response.content
    return None


rest_endpoint = "https://api.flickr.com/services/rest/"
max_per_page = 500  # Flickr API constraint: per_page is capped at 500
# per_page has to be the same for every page, otherwise pages would overlap;
# a partial final batch is obtained by trimming the returned list instead
per_page = min(int(num_of_images), max_per_page)
size = 'm' # size of image (e.g., 's' [75x75], 'm' [longest side 240])

for batch in range(1, int(num_of_batches) + 1):
    print("\n" + "-" * 20) # output a separator

    # let requests build the query string so the keyword is URL-encoded
    query_params = {
        'method': 'flickr.photos.search',
        'api_key': os.environ['FLICKR_API_KEY'],
        'text': search_keyword,
        'per_page': per_page,
        'page': batch,
        'format': 'json',
        'nojsoncallback': 1,
    }
    r = requests.get(rest_endpoint, params=query_params)
    print("\tHTTP request status code: {0}".format(r.status_code))
    # if 200 isn't returned, let user know and break
    if r.status_code != 200:
        print("[An error occured with the HTTP request.]")
        break

    results = r.json()
    # an API-level failure still arrives with HTTP 200 but without a photo
    # list, so read the payload defensively instead of indexing into it
    photos = results.get('photos', {}).get('photo', [])

    # keep only as many photos as are still needed to reach num_of_images
    remaining = int(num_of_images) - (batch - 1) * per_page
    photos = photos[:max(remaining, 0)]
    if not photos:
        print("[No photos returned for batch {0}: {1}]".format(
            batch, results.get('message', 'empty result set')))
        break

    # output an example
    print("First result from Flickr photo-search result from batch {}:".format(batch))
    print(photos[0])

    # make sure the destination folder exists before writing into it
    batch_dir = "{0}/batch{1}".format(search_keyword, batch)
    if not os.path.isdir(batch_dir):
        os.makedirs(batch_dir)

    # one record per saved image; turned into a dataframe once, after the loop
    image_records = []

    # track time
    time0 = time()

    # loop through each result photo; index is the photo's position in the batch
    for index, photo in enumerate(photos):

        # Flickr image URL format, without the file extension
        # (see: https://www.flickr.com/services/api/misc.urls.html)
        url_stem = 'https://farm{0}.staticflickr.com/{1}/{2}_{3}_{4}.'.format(
            photo['farm'], photo['server'], photo['id'], photo['secret'], size)

        fetched = _fetch_image(url_stem)
        if fetched is None:
            # no extension worked: skip rather than save an error page
            print("[Skipping image {0} of batch {1}: no downloadable format found.]".format(index, batch))
            continue
        image_url, image_bytes = fetched

        # save image to file, reusing the bytes already downloaded
        new_filename = str(index) + image_url[-4:] # filename will be index followed by image format extension
        new_filepath = "{0}/{1}".format(batch_dir, new_filename)
        with open(new_filepath, 'wb') as image_file:
            image_file.write(image_bytes)

        image_records.append({'batch': batch, 'index': index, 'url': image_url})

    print("\ndownload time for batch {0}: {1} seconds".format(batch, round(time() - time0, 2)))

    # this dataframe stores the image information for the batch
    images_df = pd.DataFrame(image_records, columns=['batch', 'index', 'url'])

    # save image data to csv file
    data_filepath = "{0}/image_data.csv".format(batch_dir)
    images_df.to_csv(data_filepath, index=False)

HTTP request status code: 200
First result from Flickr photo-search result from batch 1:
{u'isfamily': 0, u'title': u'Time to start getting psyched for Halloween! Here to kick of Spooky Season is our Poison Apple cr\xeape. Crisp fall apples, Ghirardelli caramel drizzle, topped with walnuts and lightly sprinkled with sea salt. \U0001f480\U0001f341\U0001f47b #boo #thecrepery #log', u'farm': 6, u'ispublic': 1, u'server': u'5726', u'isfriend': 0, u'secret': u'b68a30c066', u'owner': u'90074569@N03', u'id': u'21045332644'}
download time for batch 1: 497.43 seconds
HTTP request status code: 200
First result from Flickr photo-search result from batch 2:
{u'isfamily': 0, u'title': u'Red Wine', u'farm': 6, u'ispublic': 1, u'server': u'5753', u'isfriend': 0, u'secret': u'7294fbd185', u'owner': u'7992704@N05', u'id': u'21469578298'}
download time for batch 2: 527.37 seconds


### SANITY CHECK

In [10]:
# path of the image_data.csv file inside the batch1 folder of the keyword's storage area
data_filepath = "{0}/batch1/image_data.csv".format(search_keyword)

In [11]:
# check top records: re-read the CSV at data_filepath from disk and display its first rows
pd.read_csv(data_filepath).head()

Unnamed: 0,batch,index,url
0,1,0,https://farm6.staticflickr.com/5726/2104533264...
1,1,1,https://farm1.staticflickr.com/724/21480996569...
2,1,2,https://farm6.staticflickr.com/5752/2148099256...
3,1,3,https://farm1.staticflickr.com/591/21667710475...
4,1,4,https://farm6.staticflickr.com/5755/2164158936...


In [12]:
# check bottom records: re-read the CSV at data_filepath from disk and display its last rows
pd.read_csv(data_filepath).tail()

Unnamed: 0,batch,index,url
495,1,495,https://farm6.staticflickr.com/5763/2103626095...
496,1,496,https://farm6.staticflickr.com/5832/2146943165...
497,1,497,https://farm1.staticflickr.com/769/21474936548...
498,1,498,https://farm6.staticflickr.com/5717/2147594957...
499,1,499,https://farm1.staticflickr.com/582/21651267422...
