In [None]:
# Announce on stdout that this step has started.
print("starting 1_save_natural_reference_images_new")

This notebook saves `ut.n_batches` batches of the `ut.n_bins` (== batch size) most strongly activating images, as well as the default ImageNet images for the occlusion stimuli, on the share.


Large parts of this notebook were taken from https://github.com/bethgelab/testing_visualizations/blob/master/generate_stimuli/save_natural_stimuli.py (retrieved on 14.04.2020).

# Imports

In [None]:
import pandas as pd
import numpy as np
import os
import csv
import ast
import time
from torch.utils import data
from tqdm.auto import tqdm
from torchvision.datasets.folder import default_loader as default_image_loader
from torchvision import transforms
import shutil

In [None]:
import argparse

In [None]:
import occlusion_utils as ut

# Parameters

In [None]:
# Name of the CSV file written into every batch folder; it lists the path,
# index and activation of each reference image saved for that batch.
csv_max_filename = "activation_max.csv"

In [None]:
# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()
# Root directory below which all stimuli are written (and from which the
# layer/folder mapping CSV is read).
parser.add_argument("-s", "--stimuli-dir", required=True, help="Path to save stimuli to.")
# Selects the mapping file "layer_folder_mapping_<trial-type>.csv" and the
# sub-folder the stimuli are saved in.
parser.add_argument("-t", "--trial-type", required=True, help="instruction_practice_catch or sampled_trials.")
args = parser.parse_args()
# Echo the parsed arguments so they show up in the run log.
print(args)

In [None]:
stimuli_dir = args.stimuli_dir
trial_type = args.trial_type
# n_bins images are saved per batch below; the image with in-batch index
# n_reference_images is additionally copied unprocessed for the occlusion
# calculations. The third return value is not needed in this notebook.
# NOTE(review): exact semantics are defined in occlusion_utils -- confirm there.
n_reference_images, n_bins, _ = ut.get_number_of_stimuli(stimuli_dir)

# Load units

In [None]:
# read in units (layer, kernel_size, feature map) of interest
path_to_csv_file = os.path.join(stimuli_dir, f"layer_folder_mapping_{trial_type}.csv")
print(f"read in files from {path_to_csv_file}")
# header=1: the second line of the file holds the column names; the first
# line of the file is skipped.
unit_specs_df = pd.read_csv(path_to_csv_file, header=1)

In [None]:
# Bare expression: shows the loaded unit table as the notebook cell output.
unit_specs_df

# Helper code

In [None]:
# Preprocessing applied to every saved reference image:
# resize with size 256, then keep the central 224x224 crop.
center_crop_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
])

In [None]:
class ImageFileListDataSet(data.Dataset):
    """Dataset that loads images on demand from an explicit list of paths.

    Indexing yields only the (optionally transformed) image; no target is
    returned.
    """

    def __init__(self, file_list, transform=None, target_transform=None):
        """Store the file list and the optional transforms.

        Args:
            file_list: Indexable collection of image file paths.
            transform: Optional callable applied to each loaded image.
            target_transform: Stored on the instance but not used by
                ``__getitem__``.
        """
        self.loader = default_image_loader
        self.file_list = file_list
        self.transform, self.target_transform = transform, target_transform

    def __len__(self):
        """Return the number of image paths in the dataset."""
        return len(self.file_list)

    def __getitem__(self, index):
        """Load the image at position ``index`` and apply ``transform`` if set."""
        image = self.loader(self.file_list[index])
        return image if self.transform is None else self.transform(image)

In [None]:
def get_randomized_indices_except_for_last_bin(n_bins, n_batches, seed):
    """Return positional indices shuffled within every bin except the last.

    The range ``0 .. n_bins * n_batches - 1`` is split into ``n_bins``
    consecutive bins of ``n_batches`` indices each. Every bin but the last
    is permuted independently, so the images that belong to one bin
    (e.g. min_0.png) come in a different order; the last bin keeps its
    ascending order. Example for ``n_bins=10`` and ``n_batches=20``:
    randomize(0...19), then randomize(20...39) ... randomize(160...179),
    while the last bin stays in order: (180 181 182 ... 199).

    Use this function for between-subject designs!!!

    Args:
        n_bins: Number of bins.
        n_batches: Number of indices per bin.
        seed: Seed passed to ``np.random.seed``. Note that this re-seeds
            NumPy's global random state.

    Returns:
        1-D integer ``np.ndarray`` of length ``n_bins * n_batches``.
    """
    # Integer dtype: the result is used for positional indexing, so it must
    # not fall back to np.empty's float64 default.
    randomized_indices = np.empty(n_bins * n_batches, dtype=np.int64)
    array_of_n_batches = np.arange(n_batches)

    np.random.seed(seed)
    for bin_i in range(n_bins):
        bin_start = n_batches * bin_i
        bin_indices = array_of_n_batches + bin_start
        # permute the order of all but the largest indices (the last bin)
        if bin_i < n_bins - 1:
            bin_indices = np.random.permutation(bin_indices)
        randomized_indices[bin_start : bin_start + n_batches] = bin_indices

    return randomized_indices

# Copy images over

In [None]:
# Maps each "mixed*" layer name to a consecutive number, counted in the order
# listed here; the number selects the layer_<n> folder of stored activations.
layer_names_number_dict = {
    mixed_layer_name: mixed_layer_idx
    for mixed_layer_idx, mixed_layer_name in enumerate(
        (
            'mixed3a',
            'mixed3b',
            'mixed4a',
            'mixed4b',
            'mixed4c',
            'mixed4d',
            'mixed4e',
            'mixed5a',
            'mixed5b',
        )
    )
}

In [None]:
# Export loop: for every unit listed in unit_specs_df, pick the
# n_bins * ut.n_batches most strongly activating images, save them
# (center-cropped) per batch together with a CSV of their activations, and
# copy the unprocessed image needed for the occlusion calculations.

# The activation tables take long to unpickle, so the most recently loaded one
# is kept and reused while consecutive units refer to the same file.
loaded_activations_path = None
df = None

# iterate over dataframe (one row per unit)
for _, row in tqdm(unit_specs_df.iterrows(), total=len(unit_specs_df)):
    start = time.time()
    kernel_size = row["kernel_size_number"]
    channel_number = row["channel_number"]
    feature_map = row["feature_map_number"]
    layer_number = row["layer_number"]
    layer_name = row["layer_name"]
    print(f"layer_name {layer_name}, feature_map {feature_map}, channel {channel_number}, kernel_size {kernel_size}")

    # pickled dataframe with the activations of this layer for all images.
    # os.path.expandvars substitutes the DATAPATH environment variable; the
    # f-string alone would leave "$DATAPATH" as literal text in the path.
    # NOTE(review): the "sampled_trials" folder is hard-coded independently of
    # trial_type -- confirm that this is intended.
    csv_filename = os.path.expandvars(
        f"$DATAPATH/all_activations_imagenet_train/channel/sampled_trials/layer_{layer_names_number_dict[layer_name]}/activations_whole_dataset.pkl"
    )

    # load the pickle into df (takes long!) unless it is already in memory
    if csv_filename != loaded_activations_path:
        print("Loading df")
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(csv_filename)
        loaded_activations_path = csv_filename
        print("df loaded")

    # select relevant feature map and sort in ascending order, so that the
    # most strongly activating images end up in the last rows
    # (df itself is left untouched because it may be reused for the next unit)
    df_expanded = df.copy()
    df_expanded["selected_activation"] = df["activation"].apply(lambda x: x[feature_map])
    df_expanded_sorted = df_expanded.sort_values("selected_activation", ascending=True)

    # create dataframes with relevant columns and rows only. Also, randomize the order in one image bin
    assert n_bins <= 10, "n_bins is upper bounded by 10"

    # generate indices always under the assumption that we need 10 bins
    max_indices = get_randomized_indices_except_for_last_bin(
        10, ut.n_batches, seed=feature_map + 1
    )
    # then only use last n_bins bins and shift their index values;
    # cast to int because .iloc expects integer positions
    n_selected = n_bins * ut.n_batches
    max_indices = (
        max_indices[-n_selected:] - ((10 - n_bins) * ut.n_batches)
    ).astype(int)
    max_images_activations_df = (
        df_expanded_sorted[-n_selected:]
        .drop(["activation", "target class"], axis=1)
        .iloc[max_indices]
    )

    # load dataset
    max_file_names = max_images_activations_df["path to image"].tolist()
    max_dataset = ImageFileListDataSet(
        max_file_names, transform=center_crop_transform
    )

    for image_idx_in_batch in tqdm(range(n_bins), position=1, leave=False):
        for batch in tqdm(range(ut.n_batches), position=2, leave=False):
            # position of this image within the selected rows: bin-major,
            # one image per batch inside each bin
            image_idx = batch + ut.n_batches * image_idx_in_batch
            img_file = max_file_names[image_idx]

            # save image
            max_image = max_dataset[image_idx]
            output_dir = os.path.join(
                stimuli_dir,
                "channel",
                trial_type,
                f"layer_{layer_number}",
                f"kernel_size_{kernel_size}",
                f"channel_{channel_number}",
                "natural_images",
                f"batch_{batch}",
            )
            os.makedirs(output_dir, exist_ok=True)
            max_filename = os.path.join(output_dir, f"reference_max_{image_idx_in_batch}.png")
            max_image.save(max_filename)

            # save activation to csv: the first image of a batch (re)creates
            # the file and writes the header, later images append their row
            is_first_image = image_idx_in_batch == 0
            with open(
                os.path.join(output_dir, csv_max_filename),
                "w" if is_first_image else "a",
            ) as csvFile:
                csv_writer = csv.writer(csvFile, delimiter=",", lineterminator="\n")
                if is_first_image:
                    csv_writer.writerow(["image_path", "idx", "activation"])
                csv_writer.writerow(
                    [
                        img_file,
                        image_idx_in_batch,
                        max_images_activations_df["selected_activation"].iloc[image_idx],
                    ]
                )

            # save unprocessed image for occlusion calculations
            if image_idx_in_batch == n_reference_images:
                # the sub-folder is named after the image's parent directory
                destination_dir = os.path.join(
                    output_dir,
                    "val",
                    img_file.split(os.path.sep)[-2],
                )
                os.makedirs(destination_dir, exist_ok=True)
                print(f"destination_dir {destination_dir}")
                shutil.copy(img_file, destination_dir)

    end = time.time()
    print(f"       time for one layer: {end-start}")

In [None]:
# Announce on stdout that this step has completed.
print("finished 1_save_natural_reference_images_new")