In [1]:
import torch
import time
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import pickle

## Combining base indices for pruning


In [3]:
# Directory under which all main path arrays are saved.
dir_main = ''
# Directory holding the saved path chunks of the datapoints to prune.
dir_base_eval_refs = ''
# Evaluation datasets processed by every loop in this notebook.
eval_datasets = [
    'imagenet-a',
    'imagenet-r',
    'imagenet-v2',
    'objectnet-subsample',
    'imagenet-sketch',
    'imagenet-val',
]


In [6]:
# Base path array that every pruned subset below is derived from.
paths_200m = np.load(dir_main + 'paths_200m.npy')

# Per-eval-dataset containers, keyed by dataset name and filled in the next cell.
paths, sims, labels = {}, {}, {}

In [7]:
# Combine all paths, sims and labels of the datapoints to prune for each eval
# dataset: read every chunk file, concatenate, and order the result by path.

n_chunks = 200  # number of overall_nn_*_<i>.npy chunk files read per eval dataset

for ed in eval_datasets:
    start_time = time.time()
    dir_chunks = dir_base_eval_refs+'laion_'+ed+'/sims_per_query/'

    # load all chunks
    paths_chunks, sims_chunks, labels_chunks = [], [], []
    for i in range(n_chunks):
        paths_chunks.append(np.load(dir_chunks+'overall_nn_paths_'+str(i)+'.npy'))
        sims_chunks.append(np.load(dir_chunks+'overall_nn_sims_'+str(i)+'.npy'))
        labels_chunks.append(np.load(dir_chunks+'overall_nn_labels_'+str(i)+'.npy'))

    # concatenate and sort
    paths[ed] = np.concatenate(paths_chunks)
    sims[ed] = np.concatenate(sims_chunks)
    labels[ed] = np.concatenate(labels_chunks)

    idcs_sorted = np.argsort(paths[ed])
    paths[ed] = paths[ed][idcs_sorted]
    sims[ed] = sims[ed][idcs_sorted]
    # Apply the same permutation to the labels so that entry k of paths, sims
    # and labels still refers to the same datapoint after sorting.
    # NOTE(review): assumes labels hold one entry per path -- confirm. If the
    # lengths differ, the labels are left in load order.
    if len(labels[ed]) == len(paths[ed]):
        labels[ed] = labels[ed][idcs_sorted]
    else:
        print(f"{ed}: {len(labels[ed])} labels vs {len(paths[ed])} paths, labels left unsorted")
    print(f"{ed} done in {time.time()-start_time}s")


imagenet-a done in 8.490407228469849s
imagenet-r done in 4.347375869750977s
imagenet-v2 done in 5.291475534439087s
objectnet-subsample done in 6.13672399520874s
imagenet-sketch done in 6.266106128692627s
imagenet-val done in 2.8996026515960693s


In [9]:
# Report how many entries were collected for pruning per eval dataset.
for ed in eval_datasets:
    n_collected = len(paths[ed])
    print(f"{ed}, size = {n_collected}")

imagenet-a, size = 138852
imagenet-r, size = 5735749
imagenet-v2, size = 274325
objectnet-subsample, size = 266025
imagenet-sketch, size = 8342783
imagenet-val, size = 377340


In [10]:
# Write the combined paths, sims and labels of each eval dataset next to the
# chunk files they were built from.
for ed in eval_datasets:
    dir_out = dir_base_eval_refs+'laion_'+ed+'/sims_per_query/'
    for kind, per_dataset in (('paths', paths), ('sims', sims), ('labels', labels)):
        np.save(dir_out+'overall_nn_'+kind+'.npy', per_dataset[ed])

## Generate all pruned dataset indices

In [None]:
# Set view of the base paths, used for the set differences below.
set_paths_200m = set(paths_200m)

# Short tag for each eval dataset, used when building output file names.
dict_names = {
    'imagenet-a': 'a',
    'imagenet-r': 'r',
    'imagenet-v2': 'v2',
    'objectnet-subsample': 'objectnet',
    'imagenet-sketch': 'sketch',
    'imagenet-val': 'val',
}

In [None]:
# For each eval dataset, save the base paths that are left once the paths
# collected for that dataset are removed.
for ed in eval_datasets:
    # set difference against the base paths, converted back to a sorted array
    paths_lo = np.array(list(set_paths_200m - set(paths[ed])))
    paths_lo = np.sort(paths_lo)
    # Sanity check: removed + kept counts add up to the base size. This holds
    # only when paths[ed] has no duplicates and every entry occurs in
    # paths_200m. (Wrapping the comparison in print() would make the assert
    # always fail, because print() returns None.)
    assert len(paths[ed])+len(paths_lo) == len(paths_200m), (
        f"{ed}: {len(paths[ed])} pruned + {len(paths_lo)} kept != {len(paths_200m)} base paths")
    np.save(dir_main+'paths_pruned_'+dict_names[ed]+'_per_query_200m.npy', paths_lo)

## Generate combined pruned dataset indices

In [None]:
# Union of the paths to prune over all eval datasets.

set_combined = set()
for ed in eval_datasets:
    set_combined.update(paths[ed])

In [None]:
# Combined prune set as a sorted array. np.unique already returns its result
# in sorted order, so no separate np.sort pass is needed.
paths_comb = np.unique(np.array(list(set_combined)))

In [None]:
# Base paths that remain after removing the combined prune set, sorted and
# written to disk.
paths_lo_combined = np.array(list(set_paths_200m - set_combined))
paths_lo_combined.sort()
np.save(dir_main+'paths_pruned_combined_per_query.npy', paths_lo_combined)

## Throwing away random datapoints

In [None]:
# Draw a random subset of the base paths for each target size and save it.
sizes = {'175m':175000000, '150m':150000000, '125m':125000000, '100m':100000000, '75m':75000000, '50m':50000000}

# Seeded NumPy generator: makes the subsets reproducible across kernel
# restarts and avoids the `random` module, which this notebook never imports.
rng = np.random.default_rng(0)

for size, n_keep in sizes.items():
    # n_keep distinct indices into paths_200m (raises ValueError if n_keep
    # exceeds the number of base paths); sorted so the saved paths keep the
    # order they have in paths_200m.
    idcs_lo_rand = np.sort(rng.choice(len(paths_200m), size=n_keep, replace=False))
    print(len(idcs_lo_rand))
    np.save(dir_main+'paths_pruned_rand_'+size+'.npy', paths_200m[idcs_lo_rand])

## Throwing away FAR points and NEAR points

In [None]:
# Directory where the max sims for each laion200m chunk and eval dataset are stored.
dir_max_laion = ''

# Per eval dataset: sims_all.npy values in ascending order, and the
# idcs_all.npy entries rearranged with the same permutation.
sims_max, idcs_max = {}, {}
for ed in eval_datasets:
    dir_ed = dir_max_laion+ed
    sims_loaded = np.load(dir_ed+'/sims_all.npy')
    idcs_loaded = np.load(dir_ed+'/idcs_all.npy')

    idcs_sorted = np.argsort(sims_loaded)
    sims_max[ed] = sims_loaded[idcs_sorted]
    idcs_max[ed] = idcs_loaded[idcs_sorted]

In [None]:
# Build and save the "far" and "near" subsets for every eval dataset and size.
# For each size, the "far" file gets the first sizes[key] entries of
# idcs_max[ed] read back to front, and the "near" file gets the first
# sizes[key] entries read front to back.
# NOTE(review): the files are named paths_pruned_*, but what is written are
# entries of idcs_max (not paths_200m[...]) -- confirm this is what the
# consumers of these files expect.

sizes = {'175m':175000000, '150m':150000000, '125m':125000000, '100m':100000000, '75m':75000000, '50m':50000000, '40m':40000000, '25m':25000000}

idcs_far = {}
idcs_near = {}

for ed in eval_datasets:
    # the reversed view does not depend on the size, so build it once per dataset
    idcs_sorted_reverse = idcs_max[ed][::-1]
    idcs_far[ed] = {key: idcs_sorted_reverse[:n_keep] for key, n_keep in sizes.items()}
    idcs_near[ed] = {key: idcs_max[ed][:n_keep] for key, n_keep in sizes.items()}

    for key in sizes:
        suffix = dict_names[ed]+'_'+key+'.npy'
        np.save(dir_main+'paths_pruned_far_'+suffix, idcs_far[ed][key])
        np.save(dir_main+'paths_pruned_near_'+suffix, idcs_near[ed][key])