In [14]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

import concurrent
import concurrent.futures
import itertools
import json
import math
import os
import sys
import urllib
import urllib.request

import flickrapi
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import tqdm

# Make the sibling `code` directory importable so the project-local modules
# imported right below can be found.
repo_root = os.path.join(os.getcwd(), '../code')
# This cell gets re-executed during interactive work; only add the entry once
# so sys.path does not accumulate duplicates.
if repo_root not in sys.path:
    sys.path.append(repo_root)

import candidate_data
import imagenet
import mturk_data
import mturk_utils
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Load the ImageNet metadata and the full candidate set (blacklisted
# candidates are kept, and metadata is not loaded from S3).
imgnet = imagenet.ImageNetData()
cds = candidate_data.CandidateData(
    load_metadata_from_s3=False,
    exclude_blacklisted_candidates=False,
)

# Group candidates by their search-engine id (used as the Flickr id in the
# rest of this notebook). Each id maps to a list of candidate records.
cds_by_flickr_id = {}
for cand in cds.all_candidates.values():
    cds_by_flickr_id.setdefault(cand['id_search_engine'], []).append(cand)

#mturk = mturk_data.MTurkData(live=True,
#                             load_assignments=True,
#                             source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)

Reading from local file /Users/ludwig/research/deep_learning/imagenet_2/data/cache/metadata/imagenet_metadata_2018-09-14_01-26-58_UTC.pickle ... done
Loaded 167399 unique candidates from 86 search result JSON file(s).
    /Users/ludwig/research/deep_learning/imagenet_2/data/search_results/...
        2018-07-31_flickr_search_result_vaishaal_class_1_153.json
        2018-08-20-16-10-18_becca.json
        2018-08-25-11-43-09_becca.json
        2018-08-27-22-53-45_becca.json
        2018-08-30-02-40-26_becca.json
        2018-08-30-18-46-35_becca.json
        2018-08-30-19-31-10_becca.json
        2018-09-04-17-03-01_becca.json
        2018-09-04-17-36-03_becca.json
        2018-09-05-16-16-14_becca.json
        ...
    There were 65500 duplicate occurences.
    Ignored 0 candidate entries because they are on the blacklist (blacklist size: 1956).


In [3]:
# Read the Flickr credentials from the JSON key file: element 0 is used as
# the API key and element 1 as the API secret.
with open('../data/flickr_api_keys.json', 'r') as key_file:
    flickr_api_keys = json.load(key_file)
api_key = flickr_api_keys[0]
api_secret = flickr_api_keys[1]

# Client used for all Flickr API calls below.
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='etree')

In [35]:
# --- Search parameters -------------------------------------------------------
search_keys = ['torch -olympic']
cur_wnid = 'n04456115'
max_images = 200  # cap on the number of photos inspected per search key

# URL attributes that are acceptable for download. They are tried in the
# reverse of the order written here, i.e. 'url_-' first and 'url_o' last.
good_url_types = ['url_o', 'url_k', 'url_h', 'url_b', 'url_c', 'url_z', 'url_-'][::-1]

# --- Result accumulators -----------------------------------------------------
num_duplicate_skipped = 0
result_urls = {}         # flickr id -> URL chosen for download
result_search_keys = {}  # flickr id -> list of search keys that returned it

for search_key in tqdm.tqdm_notebook(search_keys, desc='API calls'):
    search_set = flickr.walk(
        text=search_key,
        extras='date_upload,date_taken,o_dims,url_s,url_q,url_t,url_m,url_n,url_-,url_z,url_c,url_b,url_h,url_k,url_o',
        sort='date-posted-asc',
        max_taken_date='2014-07-11',
        max_uploaded_date='2014-07-11',
        min_taken_date='2012-07-11',
        min_uploaded_date='2012-07-11',
        per_page=1000)

    # Only the first `max_images` photos of each search are considered.
    for photo in itertools.islice(search_set, max_images):
        cur_flickr_id = photo.get('id')
        if cur_flickr_id is None:
            print('ERROR: no id returned from flickr')
            continue

        # Keep the first acceptable URL attribute that this photo provides.
        present_url_types = [t for t in good_url_types if photo.get(t) is not None]
        if not present_url_types:
            print('ERROR: id {} does not have a good URL'.format(cur_flickr_id))
            continue
        selected_url_type = present_url_types[0]
        url = photo.get(selected_url_type)

        # Skip photos that are already a candidate for the current wnid.
        known_candidates = cds_by_flickr_id.get(cur_flickr_id, [])
        if any(known['wnid'] == cur_wnid for known in known_candidates):
            num_duplicate_skipped += 1
            continue

        result_urls[cur_flickr_id] = url
        result_search_keys.setdefault(cur_flickr_id, []).append(search_key)

print('Found {} results, skipped {} duplicates we already have'.format(len(result_urls), num_duplicate_skipped))


def download_image(flickr_id):
    """Download the raw bytes of the image selected for ``flickr_id``.

    The URL is looked up in the module-level ``result_urls`` dict.

    Args:
        flickr_id: Key into ``result_urls``.

    Returns:
        bytes: The complete response body.

    Raises:
        KeyError: If ``flickr_id`` is not a key of ``result_urls``.
        urllib.error.URLError: If the request fails.
    """
    # Close the response deterministically instead of relying on garbage
    # collection; this function is called from a thread pool, so many
    # connections can be open at the same time.
    with urllib.request.urlopen(result_urls[flickr_id]) as response:
        return response.read()

# Download every selected image in parallel.
result_images = {}  # flickr id -> raw image bytes

# The progress bar is used as a context manager so it is closed even when a
# download fails and the exception propagates out of this cell.
with tqdm.tqdm_notebook(total=len(result_urls), desc='Downloads') as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_id = {executor.submit(download_image, flickr_id): flickr_id
                        for flickr_id in result_urls}
        for future in concurrent.futures.as_completed(future_to_id):
            flickr_id = future_to_id[future]
            try:
                result_images[flickr_id] = future.result()
            except Exception as exc:
                print('Id {} generated an exception: {}'.format(flickr_id, exc))
                # Fail fast: drop downloads that have not started yet so the
                # executor does not fetch everything else before the error
                # surfaces. cancel() is a no-op for running/finished futures.
                for pending in future_to_id:
                    pending.cancel()
                raise
            pbar.update(1)

# Caption for each image: its URL, followed by the comma-separated search keys
# that returned it.
captions = {
    flickr_id: [url, ', '.join(result_search_keys[flickr_id])]
    for flickr_id, url in result_urls.items()
}

# Render the downloaded images with their captions, three per row.
mturk_utils.show_image_grid(result_urls.keys(), captions, result_images, num_cols=3)

HBox(children=(IntProgress(value=0, description='API calls', max=1), HTML(value='')))


Found 152 results, skipped 48 duplicates we already have


HBox(children=(IntProgress(value=0, description='Downloads', max=152), HTML(value='')))




VBox(children=(HBox(children=(VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x…