In [None]:
# Progress marker: written to the output as soon as the notebook starts executing.
print("starting C_get_labels_of_natural_reference_and_default_images")

This notebook saves `ut.n_batches` batches of the `ut.n_bins` (== batch size) most strongly activating images, as well as the default ImageNet images for the occlusion stimuli, on share.

Large parts of this notebook are taken from https://github.com/bethgelab/testing_visualizations/blob/master/generate_stimuli/save_natural_stimuli.py on 14.04.2020.

# Imports

In [None]:
import pandas as pd
import numpy as np
import os
import csv
import ast
import time
from torch.utils import data
from tqdm.auto import tqdm
from torchvision.datasets.folder import default_loader as default_image_loader
from torchvision import transforms
import shutil

In [None]:
import argparse

In [None]:
import occlusion_utils as ut

# Parameters

In [None]:
# Name of an activation CSV file. It is not referenced by any of the cells visible in this notebook.
csv_max_filename = "activation_max.csv"

In [None]:
# Command-line style argument string that is handed to argparse in the next cell
# (the notebook does not read sys.argv).
# "$PATHTOGENERATEDSTIMULI" is a literal placeholder: Python does not expand it,
# so it has to be replaced by the real stimuli directory before running.
raw_args = "-s=$PATHTOGENERATEDSTIMULI/stimuli_pure_conditions -t sampled_trials"

In [None]:
# Parse the notebook parameters from the `raw_args` string defined above.
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--stimuli-dir", required=True, help="Path to save stimuli to.")
parser.add_argument("-t", "--trial-type", required=True, help="instruction_practice_catch or sampled_trials.")
# split() (no separator) collapses runs of whitespace; split(" ") would emit
# empty tokens for double or trailing spaces, which argparse rejects as
# unrecognized arguments.
args = parser.parse_args(raw_args.split())
print(args)

In [None]:
# Pull the parsed options into plain module-level names.
stimuli_dir, trial_type = args.stimuli_dir, args.trial_type

# The helper returns three values; only the first two (number of reference
# images and number of bins) are needed here, the third one is discarded.
(n_reference_images, n_bins, _) = ut.get_number_of_stimuli(stimuli_dir)

# Load units

In [None]:
# Units of interest, one row per unit (layer, kernel size, feature map).
# This is the list of units we used in the MTurk CFV experiment.
from io import StringIO

# The text begins with an empty line followed by a lone "#" line; the column
# names are therefore on the second non-blank line, which is why the table is
# parsed with header=1 below.
unit_specs_csv_text = """
#
layer_number,kernel_size_number,channel_number,layer_name,pre_post_relu,kernel_size,feature_map_number
0,1,0,mixed3a,pre_relu,3x3,189
1,1,0,mixed3b,pre_relu,3x3,178
2,1,0,mixed4a,pre_relu,3x3,257
3,1,0,mixed4b,pre_relu,3x3,339
4,1,0,mixed4c,pre_relu,3x3,247
5,1,0,mixed4d,pre_relu,3x3,342
6,1,0,mixed4e,pre_relu,3x3,524
7,1,0,mixed5a,pre_relu,3x3,278
8,1,0,mixed5b,pre_relu,3x3,684
0,3,0,mixed3a,pre_relu,pool,227
1,3,0,mixed3b,pre_relu,pool,430
2,3,0,mixed4a,pre_relu,pool,486
3,3,0,mixed4b,pre_relu,pool,491
4,3,0,mixed4c,pre_relu,pool,496
5,3,0,mixed4d,pre_relu,pool,483
6,3,0,mixed4e,pre_relu,pool,816
7,3,0,mixed5a,pre_relu,pool,743
8,3,0,mixed5b,pre_relu,pool,1007
"""
csv_data = StringIO(unit_specs_csv_text)
unit_specs_df = pd.read_csv(csv_data, header=1)

# Helper code

In [None]:
def get_randomized_indices_except_for_last_bin(n_bins, n_batches, seed):
    """Return positional indices that are shuffled within every bin but the last.

    The range ``0 .. n_bins * n_batches - 1`` is split into ``n_bins``
    consecutive bins of ``n_batches`` indices each. Every bin except the last
    one is permuted independently; the last bin keeps its ascending order.
    Example for ``n_bins=10`` and ``n_batches=20``:
    randomize(0...19), then randomize(20...39) ... randomize(160...179),
    while the last bin stays in order: (180 181 182 ... 199).
    Use this function for between-subject designs!!!

    Parameters
    ----------
    n_bins : int
        Number of consecutive bins.
    n_batches : int
        Number of indices per bin.
    seed : int
        Seed of the random permutations; the same seed always yields the
        same result.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_bins * n_batches`` that can be used
        directly as a positional indexer.
    """
    # A private legacy RandomState produces exactly the same permutations as
    # np.random.seed(seed) followed by np.random.permutation(...), but does not
    # reseed the process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(seed)

    # Integer dtype: these values are indices, not measurements.
    randomized_indices = np.empty(n_bins * n_batches, dtype=int)
    array_of_n_batches = np.arange(n_batches)

    for bin_i in range(n_bins):
        bin_start = n_batches * bin_i
        bin_indices = array_of_n_batches + bin_start
        # permute the order of all but the largest indices (the last bin)
        if bin_i != n_bins - 1:
            bin_indices = rng.permutation(bin_indices)
        randomized_indices[bin_start : bin_start + n_batches] = bin_indices

    return randomized_indices

# Copy images over

In [None]:
# Map every layer name to its position in the ordered list of layers.
layer_names_number_dict = {
    layer_name: layer_index
    for layer_index, layer_name in enumerate(
        [
            'mixed3a',
            'mixed3b',
            'mixed4a',
            'mixed4b',
            'mixed4c',
            'mixed4d',
            'mixed4e',
            'mixed5a',
            'mixed5b',
        ]
    )
}

In [None]:
# Validate the stimulus settings once, before any (slow) activation table is loaded.
assert n_bins <= 10, "n_bins is upper bounded by 10"
# The bin with index n_reference_images supplies the query image of every
# batch (see the inner loop below), so that bin has to exist.
assert n_reference_images < n_bins, (
    "n_reference_images must be smaller than n_bins, "
    "otherwise no query image is selected"
)

# One entry per unit; each entry is a list with one dict per batch:
# {"references": [image paths], "query_image_path": image path}
all_image_paths = []

# iterate over dataframe (one row per unit)
for _, row in tqdm(unit_specs_df.iterrows(), total=len(unit_specs_df)):
    start = time.time()
    kernel_size = row["kernel_size_number"]
    channel_number = row["channel_number"]
    feature_map = row["feature_map_number"]
    layer_number = row["layer_number"]
    layer_name = row["layer_name"]
    print(f"layer_name {layer_name}, feature_map {feature_map}, channel {channel_number}, kernel_size {kernel_size}")

    # pickled dataframe with the activations of the whole dataset for this layer
    # (the variable keeps its historical "csv" name)
    csv_filename = f"$DATAPATH/all_activations_imagenet_train/channel/sampled_trials/layer_{layer_names_number_dict[layer_name]}/activations_whole_dataset.pkl"

    # load the dataframe (takes long!)
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # from a trusted location.
    print("Loading df")
    df = pd.read_pickle(csv_filename)

    print("df loaded")

    # select relevant feature map and sort in ASCENDING order, so that the most
    # strongly activating images are the last rows
    df_expanded = df.copy()
    df_expanded["selected_activation"] = df["activation"].apply(lambda x: x[feature_map])
    df_expanded_sorted = df_expanded.sort_values("selected_activation", ascending=True)

    # create dataframes with relevant columns and rows only. Also, randomize the order in one image bin

    # generate indices always under the assumption that we need 10 bins
    max_indices = get_randomized_indices_except_for_last_bin(
        10, ut.n_batches, seed=feature_map + 1
    )
    # then only use last n_bins bins and shift their index values so that they
    # start at 0; cast to int because they are used as positional indices
    max_indices = (
        max_indices[-n_bins * ut.n_batches :] - ((10 - n_bins) * ut.n_batches)
    ).astype(int)
    max_images_activations_df = (
        df_expanded_sorted[-n_bins * ut.n_batches :]
        .drop(["activation", "target class"], axis=1)
        .iloc[max_indices]
    )

    # paths of the selected images, bin after bin
    max_file_names = max_images_activations_df["path to image"].tolist()

    image_paths = [{'references': []} for _ in range(ut.n_batches)]

    for image_idx_in_batch in range(n_bins):
        for batch in range(ut.n_batches):
            # position of this batch's image inside the current bin
            image_idx = batch + ut.n_batches * image_idx_in_batch

            max_image_path = max_file_names[image_idx]

            if image_idx_in_batch == n_reference_images:
                # query image
                image_paths[batch]["query_image_path"] = max_image_path
            else:
                image_paths[batch]["references"].append(max_image_path)

    all_image_paths.append(image_paths)
    end = time.time()
    print(f"       time for one layer: {end-start}")

In [None]:
# Flatten the per-unit lists into one list, keeping at most the first 10
# batches of every unit.
image_paths = []
for unit_image_paths in all_image_paths:
    image_paths.extend(unit_image_paths[:10])

In [None]:
# Progress marker.
# NOTE(review): this is printed before the label-comparison and plotting cells further down have run.
print("finished C_get_labels_of_natural_reference_and_default_images")

In [None]:
# Number of collected entries (one dict of reference/query paths per kept batch of every unit).
len(image_paths)

In [None]:
# For every entry keep only the first path component that follows the
# training-set prefix, for the reference images and for the query image.
image_wordnet_ids = [
    {
        "references": [
            reference_path.replace("$PATHTOIMAEGNET/train/", "").split("/")[0]
            for reference_path in item["references"]
        ],
        "query_image": item["query_image_path"].replace("$PATHTOIMAEGNET/train/", "").split("/")[0],
    }
    for item in image_paths
]

In [None]:
# Count, per entry, how many reference images carry the same label as the query image.
for item in image_wordnet_ids:
    query_label = item["query_image"]
    item["n_same_labels"] = sum(1 for ref in item["references"] if ref == query_label)

In [None]:
# Gather the per-entry counts into one array for the histogram.
same_label_counts = [entry["n_same_labels"] for entry in image_wordnet_ids]
counts = np.array(same_label_counts)

In [None]:
from matplotlib import rcParams

# Global matplotlib style used by the figure below.
rcParams.update(
    {
        "font.family": "sans-serif",
        "font.sans-serif": ["DejaVu Sans"],
        # output text as text and not paths
        "svg.fonttype": "none",
        "pdf.fonttype": "truetype",
    }
)

import matplotlib.pyplot as plt

# Figure-style constants.
file_type = ".pdf"
fontsize_axes_labels, fontsize_tick_labels = 10, 8
x_tick_label_rotation = 30
error_bar_linewidth = 1
from matplotlib import pyplot as plt

In [None]:
def bins_labels(bins, **kwargs):
    """Centre one tick label under every histogram bar of the current axes.

    ``bins`` are the (evenly spaced) bin edges that were passed to
    ``plt.hist``. A tick is placed at the centre of each bar and labelled
    with the left edge of that bar; the x-limits are widened by 0.25 on both
    sides of the outermost edges.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges; ``len(bins) - 1`` bars are labelled.
    **kwargs
        Forwarded to ``plt.xticks`` (e.g. ``rotation``, ``fontsize``).
    """
    bin_w = (max(bins) - min(bins)) / (len(bins) - 1)
    tick_positions = np.arange(min(bins) + bin_w / 2, max(bins), bin_w)
    # There is one tick per bar, i.e. one fewer than there are edges. Passing
    # all edges as labels gives more labels than ticks, which matplotlib
    # rejects, so only the left edge of every bar is used as a label.
    tick_labels = list(bins)[: len(tick_positions)]
    plt.xticks(tick_positions, tick_labels, **kwargs)
    plt.xlim(bins[0] - 0.25, bins[-1] + 0.25)

In [None]:
# Histogram of how many reference images share the query image's label,
# using unit-width bins with the edges 0, 1, ..., 10.
bins = np.arange(11)
hist = plt.hist(counts, bins=bins, rwidth=0.9)
bins_labels(bins)
plt.xlabel("#Reference images w/ same label as query images")
plt.ylabel("Count")

# no axis on top and right
current_axes = plt.gca()
for spine_side in ("top", "right"):
    current_axes.spines[spine_side].set_visible(False)

plt.savefig("reference_query_labels_histogram.pdf")

