In [1]:
from cnns.utils import imagenet_utils as imu
from cnns.utils import training_utils as tu
from cnns.utils import cnn_utils as cu
import pandas as pd
import pickle as pkl

In [2]:
%load_ext autoreload
%autoreload 2

In [18]:
# Open the database cursor that every query helper below receives as `cur`.
# NOTE(review): presumably a psycopg2 cursor, judging by the helper's name -- confirm in training_utils.
cur = tu.start_psycon()

Creating true test sets

In [28]:
def generate_random_imagenet_set(cur, num_samples=10000):
    """Draw a random sample of ImageNet image URLs with their internal category.

    Args:
        cur: Database cursor handed through to ``tu.run_query``.
        num_samples: Maximum number of rows to return. Anything ``int()``
            accepts is allowed; fractional values are truncated. Must not be
            negative.

    Returns:
        pandas.DataFrame with columns ``img_url`` and ``category`` (the latter
        holds the table's ``internal_category``), one row per sampled image.

    Raises:
        ValueError: if ``num_samples`` is negative.
    """
    # Coerce up front so only a plain integer is ever interpolated into the SQL,
    # and reject a negative limit here instead of letting the database fail.
    limit = int(num_samples)
    if limit < 0:
        raise ValueError('num_samples must be non-negative, got %r' % (num_samples,))

    # TODO: look into faster ways of doing this -- ORDER BY random() has to
    # order every row of the table before LIMIT applies.
    query = (
        'SELECT iiu.img_url, iiu.internal_category '
        'FROM imagenet_image_urls iiu '
        'ORDER BY random() '
        'LIMIT %d' % limit
    )
    records = tu.run_query(cur, query)
    return pd.DataFrame(records, columns=['img_url', 'category'])
    
def collect_imagenet_test_set(cur):
    """Fetch ImageNet images whose URL is absent from the training-URL table.

    The query anti-joins ``imagenet_image_urls`` against
    ``imagenet_training_image_urls`` (LEFT JOIN ... WHERE itu.img_url IS NULL).

    NOTE(review): the CTE's LEFT JOIN onto ``imagenet_validation_image_urls``
    has no filter and selects only ``iiu.*``, so it does not remove validation
    images -- confirm whether that was the intent.

    Args:
        cur: Database cursor handed through to ``tu.run_query``.

    Returns:
        pandas.DataFrame with columns ``img_url``, ``imagenet_category`` and
        ``category`` (the latter holds the table's ``internal_category``).
    """
    column_names = ['img_url', 'imagenet_category', 'category']
    # The four leading blanks inside each fragment are part of the SQL text
    # that is sent; they only separate tokens.
    sql = (
        'with full_and_val_set as (SELECT iiu.* '
        '    FROM imagenet_image_urls iiu '
        '    LEFT JOIN imagenet_validation_image_urls ivu ON iiu.img_url = ivu.img_url) '
        '    SELECT fvs.img_url, fvs.imagenet_category, fvs.internal_category '
        '    FROM full_and_val_set fvs '
        '    LEFT JOIN imagenet_training_image_urls itu '
        '    ON fvs.img_url = itu.img_url '
        '    WHERE itu.img_url IS NULL'
    )
    rows = tu.run_query(cur, sql)
    return pd.DataFrame(rows, columns=column_names)

def collect_everalbum_test_set(cur):
    """Build the Everalbum test set as download URLs paired with categories.

    Every row of ``everalbum_image_urls`` becomes one download URL that embeds
    the row's ``mem_id`` and an auth token for the row's ``user_id``. Tokens are
    requested from ``cu.get_auth_token`` once per user and reused afterwards.

    NOTE(review): rows are unpacked positionally as
    ``(<ignored>, mem_id, user_id, category)``; because the query is
    ``SELECT *`` this depends on the table's column order -- confirm against
    the schema.
    NOTE(review): the URL is plain ``http://`` and carries the auth token in
    the query string.

    Args:
        cur: Database cursor handed through to ``tu.run_query``.

    Returns:
        pandas.DataFrame with columns ``img_url`` and ``category``; the columns
        are present even when the table returns no rows.
    """
    columns = ['img_url', 'category']
    records = tu.run_query(cur, "SELECT * FROM everalbum_image_urls")

    auth_tokens = {}  # user_id -> token, so each user is looked up only once
    data = []
    # The first field used to be bound to `user_id` and then overwritten by the
    # third; discard it explicitly so the unpacking says what is actually used.
    for _, mem_id, user_id, category in records:
        if user_id not in auth_tokens:
            auth_tokens[user_id] = cu.get_auth_token(user_id)
        data.append({'img_url': 'http://download-dot-maestro-prod.appspot.com/%i?auth_token=%s'
                                % (mem_id, auth_tokens[user_id]),
                     'category': category})

    # Explicit columns keep the frame's schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

In [23]:
# Draw 100,000 random ImageNet rows (the helper's default is 10,000).
imagenet_random_sample_df = generate_random_imagenet_set(cur, num_samples=100000)

In [25]:
# Persist the random sample; with to_csv's defaults the index is written as an unnamed first column.
# NOTE(review): absolute, user-specific path -- this cell only works on the author's machine.
imagenet_random_sample_df.to_csv('/Users/babasarala/Desktop/imagenet_random_sample.csv')

In [29]:
# Run the anti-join query once and keep the result for the cells below.
imagenet_df = collect_imagenet_test_set(cur)  # set of images NOT used for training

In [30]:
# Bare expression: the notebook renders the frame as the cell's output.
imagenet_df

Unnamed: 0,img_url,imagenet_category,category
0,http://farm2.static.flickr.com/1213/1271491808...,"Pekinese, Pekingese, Peke",furry_friends
1,http://farm2.static.flickr.com/1331/1050310100...,"Pekinese, Pekingese, Peke",furry_friends
2,http://www.global-b2b-network.com/direct/dbima...,"motor scooter, scooter",other
3,http://wangye.win.mofcom.gov.cn/www/8/wangye/i...,"motor scooter, scooter",other
4,http://farm1.static.flickr.com/189/472483910_9...,"Lhasa, Lhasa apso",furry_friends
5,http://www.tanimodimotoclub.it/foto-moto/foto-...,neck brace,other
6,http://www.auto-motor.at/Motorrad/Motorraeder-...,neck brace,other
7,http://www.logosfoundation.org/backgrounds/lib...,library,other
8,http://static.flickr.com/18/69786405_eb76e46db...,library,other
9,http://farm1.static.flickr.com/155/345985393_b...,library,other


In [8]:
# Row count per internal category among the images not used for training.
imagenet_df['category'].value_counts()

other            126235
wildlife          72019
furry_friends     49638
food               6186
devices            5363
documents          1098
screenshots         532
dtype: int64

In [14]:
# Load the pickled model bundle. Binary mode is required for pickle on Python 3,
# and the with-block closes the file handle once loading finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load model files from a trusted source.
# NOTE(review): absolute, user-specific path; the execution counts also show this cell
# was last run after the cells that consume `curr_model`, so re-run top to bottom on a fresh kernel.
with open('/Users/babasarala/repos/cnns/models/2016-08-16 14_26_08_L1_LR.p', 'rb') as model_file:
    curr_model = pkl.load(model_file)

In [10]:
# Keep only the URL and label columns of the frame stored under 'complete_data' in the model bundle.
custom_model_data = curr_model['complete_data'][['img_url', 'category']]

In [13]:
# Row count per category in the data stored with the model.
custom_model_data['category'].value_counts()

other          53265
food           14338
documents       1373
whiteboards      168
sketches          69
dtype: int64

In [12]:
# Inner-join on img_url: keep the not-used-for-training ImageNet rows whose URL
# also appears in the model's data.
# Both frames have a 'category' column, so pandas suffixes them: category_x comes
# from imagenet_df (left) and category_y from custom_model_data (right). The
# ImageNet-side label is renamed back to 'category'; category_y is left in place.
merged_set = pd.merge(imagenet_df, custom_model_data, on='img_url')
merged_set = merged_set.rename(columns={'category_x': 'category'})
merged_set['category'].value_counts()

other        10963
food          3811
documents     1080
dtype: int64

In [15]:
# Save the merged test set; with to_csv's defaults the index is written as an unnamed first column.
# NOTE(review): absolute, user-specific path -- this cell only works on the author's machine.
merged_set.to_csv('/Users/babasarala/Desktop/settings_v1.7_test_set.csv')