###IMPORTS

In [1]:
import os
from time import time
import requests
import urllib
import numpy as np
import pandas as pd

###CONFIGURE

In [2]:
search_keyword = "apple"
num_of_images = 1000
num_of_batches = np.ceil(np.true_divide(num_of_images,500)) # Flickr API constraint: max per page == 500

# create storage area for data and image files
try:
    os.makedirs(search_keyword)
    for i in xrange(1,int(num_of_batches)+1):
        os.makedirs(search_keyword + "/batch{}".format(i))
except:
    print "Storage area already exists for this keyword."

###QUERY FLICKR PHOTO SEARCH API

In [3]:
for batch in xrange(1,int(num_of_batches)+1):
    print "\n", "-"*20 # output a separator
    
    rest_query = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&text={1}&per_page={2}&page={3}&format=json&nojsoncallback=1".format(os.environ['FLICKR_API_KEY'], search_keyword, num_of_images, batch)
    r = requests.get(rest_query)
    print ">"*5, "HTTP request status code:", r.status_code
    # if 200 isn't returned, let user know and break
    if r.status_code != 200:
        print "[An error occured with the HTTP request.]"
        break
    # otherwise
    else:
        results = r.json()
        # output an example
        print "First result from Flickr photo-search result from batch {}:".format(batch)
        print results['photos']['photo'][0]
        
        # this dataframe will store image information
        images_df = pd.DataFrame(columns=['batch', 'index', 'url'])

        # track current image index
        index = 0 

        # track time
        time0 = time()

        # loop through each result photo 
        for photo in results['photos']['photo']:

            # relevant values for creating Flickr image URLs
            # (see: https://www.flickr.com/services/api/misc.urls.html)
            farm_id = photo['farm']
            server_id = photo['server']
            photo_id = photo['id']
            secret = photo['secret']
            size = 'm' # size of image (e.g., 's' [75x75], 'm' [longest side 240])

            # Flickr image URL format
            image_url = 'https://farm{0}.staticflickr.com/{1}/{2}_{3}_{4}.'.format(farm_id, server_id, photo_id, secret, size)

            # try each of the three file-format extensions (the three suggested by Flickr), 
            # breaking after success
            for img_format in ['jpg', 'gif', 'png']:
                r = requests.get(image_url + img_format)
                if r.status_code == 200:
                    image_url += img_format
                    break

            # save image to file
            new_filename = str(index) + image_url[-4:] # filename will be index followed my image format extension
            new_filepath = "{0}/batch{1}/{2}".format(search_keyword, batch, new_filename)
            urllib.urlretrieve(image_url, new_filepath) # download image to appropriate location

            # create new record for dataframe
            new_image_df = pd.DataFrame({'batch':[str(batch)], 'index':[str(index)], 'url':[image_url]})

            # add new record to images dataframe
            images_df = pd.concat([images_df, new_image_df])

            # increment the image index on each iteration
            index += 1

        print "\ndownload time for batch {}:".format(batch), round(time() - time0, 2), "seconds"
        
        # save image data to csv file
        data_filepath = "{0}/batch{1}.csv".format(search_keyword, batch)
        images_df.to_csv(data_filepath, index=False)


--------------------
>>>>> HTTP request status code: 200
First result from Flickr photo-search result from batch 1:
{u'isfamily': 0, u'title': u'IMG_5957.JPG taken with Apple iPhone 5s', u'farm': 1, u'ispublic': 1, u'server': u'754', u'isfriend': 0, u'secret': u'0c8ed3111b', u'owner': u'28110754@N04', u'id': u'21482492709'}

download time for batch 1: 478.01 seconds

--------------------
>>>>> HTTP request status code: 200
First result from Flickr photo-search result from batch 2:
{u'isfamily': 0, u'title': u'half the batter, half the apples', u'farm': 6, u'ispublic': 1, u'server': u'5705', u'isfriend': 0, u'secret': u'40b9803837', u'owner': u'12842940@N00', u'id': u'21646885562'}

download time for batch 2: 460.62 seconds


###SANITY CHECK

In [10]:
files = os.listdir("{0}/batch1/".format(search_keyword))
print "first 5 files:"
print files[:5] # sorted by OS (naming) convention

print "\nlast 5 files:"
print files[-5:] # sorted by OS (naming) convention

print "\ntotal number of files:", len(files)

first 5 files:
['0.jpg', '1.jpg', '10.jpg', '100.jpg', '101.jpg']

last 5 files:
['95.jpg', '96.jpg', '97.jpg', '98.jpg', '99.jpg']

total number of files: 500


In [11]:
data_filepath = "{0}/batch2.csv".format(search_keyword)

In [12]:
# check top records
pd.read_csv(data_filepath).head()

Unnamed: 0,batch,index,url
0,2,0,https://farm6.staticflickr.com/5705/2164688556...
1,2,1,https://farm1.staticflickr.com/741/21471442649...
2,2,2,https://farm1.staticflickr.com/619/21035602304...
3,2,3,https://farm6.staticflickr.com/5747/2164688488...
4,2,4,https://farm6.staticflickr.com/5829/2165825668...


In [13]:
# check bottom records
pd.read_csv(data_filepath).tail()

Unnamed: 0,batch,index,url
495,2,495,https://farm6.staticflickr.com/5830/2146023336...
496,2,496,https://farm6.staticflickr.com/5802/2146022960...
497,2,497,https://farm1.staticflickr.com/709/21635703572...
498,2,498,https://farm6.staticflickr.com/5790/2145910376...
499,2,499,https://farm1.staticflickr.com/767/21635429412...
