In [None]:
import itertools
import os
import shutil
import subprocess
import sys
import time

import pandas as pd
import requests
from flickrapi import FlickrAPI

In [None]:
def get_urls(image_tag, MAX_COUNT) -> list:
    '''
    Creates a list of urls to find images with a particular tag on Flickr.

    The search results are walked in relevance order and at most MAX_COUNT
    of them are examined. Results that carry no 'url_q' attribute are
    skipped, so the returned list may be shorter than MAX_COUNT.

    Args:
        str image_tag: The tag to search for see Flickr documentation.
            It is passed to the search as both `text` and `tags`.
        int MAX_COUNT: The maximum number of search results to examine.
            Zero or a negative value yields an empty list.

    Returns:
        list urls: list of urls of each image

    Notes:
        The API credentials are read from the FLICKR_API_KEY and
        FLICKR_API_SECRET environment variables; each falls back to an
        empty string when unset.
    '''
    # Keep credentials out of the notebook source; the empty default matches
    # what was previously hard-coded here.
    key = os.environ.get('FLICKR_API_KEY', '')
    secret = os.environ.get('FLICKR_API_SECRET', '')
    flickr = FlickrAPI(key, secret)
    photos = flickr.walk(text=image_tag,
                         tag_mode='all',
                         tags=image_tag,
                         extras='url_q',
                         per_page=50,
                         sort='relevance')
    urls = []
    # islice stops after exactly MAX_COUNT results, so the generator is never
    # advanced one step further than needed just to discover the limit.
    for photo in itertools.islice(photos, max(MAX_COUNT, 0)):
        url = photo.get('url_q')
        if url is not None:
            urls.append(url)
    print(image_tag+" done fetching urls, fetched {} urls out of {}".format(len(urls),MAX_COUNT))
    return urls

In [None]:
def put_images(name, urls, destination="bucket_name", timeout=30) -> None:
    '''
    Given the urls, scrapes images off Flickr and puts them in an S3 bucket

    Each url is downloaded into a local directory called `name` (after
    sanitising), the directory is handed to `aws s3 mv --recursive`, and
    the local directory is removed once that command succeeds.

    Args:
        str name - the root filename for scrapped images. Spaces become
            underscores and apostrophes are dropped before it is used as
            the directory name and file prefix.
        list urls - the location of the images
        str destination - target passed to `aws s3 mv`. The default is the
            placeholder that was hard-coded here originally; presumably a
            real `s3://...` URI is needed for an actual upload (confirm
            against the AWS CLI documentation).
        float timeout - seconds to wait on each image request.
    Returns:
        None
    '''
    # Sanitise unconditionally: previously apostrophes were only removed when
    # the name also happened to contain a space.
    name = name.replace(' ', '_').replace("'", "")
    try:
        os.mkdir(name)
    except FileExistsError:
        # Reuse a directory left behind by an earlier run.
        pass
    except OSError as e:
        # Without a working directory nothing below can succeed.
        print(name, e)
        return
    for i, url in enumerate(urls):
        try:
            resp = requests.get(url, timeout=timeout)
            # Do not store an HTTP error page as if it were an image.
            resp.raise_for_status()
            filename = os.path.join(name, name + str(i))
            with open(filename, 'wb') as outfile:
                outfile.write(resp.content)
        except Exception as e:
            # Best effort: report the failing item and carry on with the rest.
            print(i+1, e)
    # An argument list (no shell) keeps names with quotes or other shell
    # metacharacters from breaking or altering the command.
    try:
        completed = subprocess.run(["aws", "s3", "mv", "--recursive", name, destination])
        uploaded = completed.returncode == 0
    except OSError as e:
        # e.g. the aws executable is not installed
        print(name, e)
        uploaded = False
    if uploaded:
        shutil.rmtree(name, ignore_errors=True)
    else:
        # Keep the downloads so a failed upload does not silently lose them.
        print(name + " upload failed, local files kept")

In [None]:
# Driver cell: read the search terms from the first column of the CSV, then
# fetch and upload images for every term that does not contain a colon.
MAX_COUNT = 2000
names = (
    pd.read_csv('missedconnections.csv', header=None)[0]
    .apply(lambda raw: raw.strip(' \xa0\n'))  # trim spaces, NBSPs and newlines
    .values
)
for i, name in enumerate(names):
    # Entries containing ':' are skipped entirely.
    if ':' in name:
        continue
    t0 = time.time()
    print(i, end = ' ')
    urls = get_urls(name, MAX_COUNT)
    put_images(name, urls)
    t1 = time.time()
    print(name+" done with upload, job took {} seconds".format(t1-t0))