In [1]:
# Scrapes Flickr for pictures or videos

import flickrapi, json, urllib, os, pandas as pd, time
from multiprocessing.dummy import Pool as ThreadPool

# Load the API credentials from the local config file.
# Make sure to git ignore config/config.json so the key and secret are never committed.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('config/config.json') as config_file:
    config = json.load(config_file)
API_KEY = config['flickr_api_key']
API_SECRET = config['flickr_api_secret']

# Initialize flickr session (responses come back as parsed JSON, i.e. plain dicts)
flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, cache = False, format='parsed-json')

In [None]:
# Done separately because flickr doesn't support extras for video
# Looks for pictures that have been geocoded
def search_photo(keyword,latitude,longitude,distance):
    """Search Flickr for geotagged photos matching ``keyword``.

    The keyword is sent both as the free-text query and as the tag filter.
    ``latitude``, ``longitude`` and ``distance`` are forwarded unchanged as the
    ``lat``, ``lon`` and ``radius`` arguments, with the radius expressed in miles.

    Returns whatever ``flickr.photos.search`` returns for the query.
    """
    query = {
        'text': keyword,
        'tags': keyword,
        'has_geo': '1',
        'lat': latitude,
        'lon': longitude,
        'radius': distance,
        'radius_units': 'mi',
        'sort': "relevance",
        # url_o and date_taken are requested as extras on every result
        'extras': 'url_o,date_taken',
        'media': 'photo',  # 'photo' or 'videos'
        'license': '1,2,4,5',
        'min_upload_date': '2017-01-01',
        'per_page': 50,
    }
    return flickr.photos.search(**query)

# Return lat and lon of photo
def get_photo_loc(p_id):
    """Fetch the geo location recorded for photo ``p_id``.

    Returns a ``(longitude, latitude)`` tuple -- note that longitude comes first.
    """
    response = flickr.photos.geo.getLocation(photo_id=p_id)
    location = response['photo']['location']
    return location['longitude'], location['latitude']

# Make directory to store data
def make_dir(keyword):
    """Create the directory ``keyword`` (including parents) if it is missing.

    Calling this for a directory that already exists is a no-op.

    Raises:
        OSError: if the directory cannot be created, including the case where
            ``keyword`` already exists but is not a directory.
    """
    # Attempt the creation first instead of checking os.path.exists beforehand:
    # the check-then-create sequence can race with another creator, and it also
    # silently accepted a regular file sitting at the target path.
    try:
        os.makedirs(keyword)
    except OSError:
        if not os.path.isdir(keyword):
            raise

# Return user info
def get_user(u_id):
    """Fetch profile details for the Flickr user ``u_id``.

    Returns a ``(profile_url, username)`` tuple taken from the ``_content``
    fields of the ``flickr.people.getInfo`` response.
    """
    person = flickr.people.getInfo(user_id=u_id)['person']
    profile_url = person['profileurl']['_content']
    username = person['username']['_content']
    return profile_url, username

# Delete extraneous columns
def del_column(df):
    """Remove a fixed set of unneeded keys/columns from ``df`` in place.

    ``df`` may be any object supporting ``del df[name]`` (the notebook passes
    the per-photo dicts from the search results). A name that is not present
    is reported on stdout and skipped.

    Returns None; ``df`` itself is modified.
    """
    columns_to_delete = ['height_o','farm','isfamily','isfriend','ispublic','secret','server','width_o', 'datetakengranularity','datetakenunknown']
    for col in columns_to_delete:
        try:
            del df[col]
        except KeyError:
            # Only a missing key means "doesn't exist"; any other error is a real
            # problem and is allowed to propagate instead of being mislabelled.
            print("{0} doesn't exist".format(col))
            
# Augment json with additional info            
def augment(x):
    """Add owner and location details to the photo record ``x`` in place.

    Sets ``profile_url`` and ``username`` from ``get_user`` (looked up by the
    record's ``owner``), then ``longitude`` and ``latitude`` from
    ``get_photo_loc`` (looked up by the record's ``id``). Returns None.
    """
    owner_id = x.get('owner')
    profile_url, username = get_user(owner_id)
    x['profile_url'] = profile_url
    x['username'] = username

    photo_id = x.get('id')
    longitude, latitude = get_photo_loc(photo_id)
    x['longitude'] = longitude
    x['latitude'] = latitude
    
# Download images
def download_image(x):
    """Download the original-size image of photo record ``x``.

    The file is fetched from ``x['url_o']`` and written to
    ``<x['keyword']>/<x['id']>.jpg``.

    Returns:
        'success' when the download completes, otherwise
        'failed to download <id>.jpg' (best effort: errors are not raised).
    """
    # urlretrieve lives in urllib.request on Python 3 and directly in urllib on
    # Python 2. Resolving it here avoids the AttributeError that made every
    # download report failure on Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    try:
        urlretrieve(x['url_o'], '{0}/{1}.jpg'.format(x['keyword'],x['id']))
        return('success')
    except Exception:
        # Covers a missing 'url_o' key as well as network / filesystem errors;
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return('failed to download {0}.jpg'.format(x['id']))
    
# Orchestrates workers
def orchestrate(x):
    """Run the full per-photo pipeline on record ``x``.

    Steps, in order: drop unneeded keys, add owner/location details, download
    the image. The record is modified in place by the first two steps.

    Returns the status string produced by ``download_image`` so that the
    caller (e.g. ``pool.map``) can see which downloads failed instead of
    receiving only ``None``.
    """
    del_column(x)
    augment(x)
    return download_image(x)

In [None]:
# Run process here
# Input keyword, latitude, longitude, and radius
word = 'dog' # Can't be None
lat =  None # Can be None
lon = None # Can be None
r = None # Can be None

# Make directory (if needed). make_dir returns nothing, so its result is not stored.
make_dir(word)

# Search results
search_results = search_photo(word, latitude = lat, longitude = lon, distance = r)

# Create list from dict to feed into parallel processing.
# Each record is tagged with the keyword, which download_image uses as the target folder.
z = []
for s in search_results['photos']['photo']:
    s['keyword'] = word
    z.append(s)

# Orchestrate. The timer covers only the parallel stage (not the search above).
start = time.time()
pool = ThreadPool(13)
try:
    results = pool.map(orchestrate, z)
    print('Time to complete: {0}'.format(round(time.time()-start,2)))
finally:
    # Always release the worker threads, even if one of the tasks raised.
    pool.close()
    pool.join()

# The records in z were modified in place by orchestrate, so the frame holds the augmented data.
df = pd.DataFrame(z)
df.to_csv('{0}/{1}.csv'.format(word,word),index=False, encoding='utf-8')