This notebook contains the code that was used to preprocess the data, together with other helper functions.

### Collect random sample from ImageNet

In [None]:
from random import randint
import tensorflow as tf
import os
import urllib
from urllib.parse import urlparse
from PIL import Image
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import json
import random
from sklearn.decomposition import PCA
import glob
import os.path

In [None]:
def file_len(fname):
    """Count the lines in a file.

    The file is opened in binary mode and iterated line by line, so it is
    never read into memory as a whole.

    Args:
        fname: Path of the file to inspect.

    Returns:
        The number of lines in the file; 0 for an empty file.
    """
    with open(fname, 'rb') as f:
        # Summing over the iterator also covers the empty file, for which a
        # loop variable would never be bound.
        return sum(1 for _ in f)

def sample_imagenet(ind, size=10):
    """Download a random sample of images listed in ``../fall11_urls.txt``.

    ``size`` line indices are drawn at random (duplicates collapse, so fewer
    distinct lines may be selected). The URL is taken from the second
    tab-separated field of each selected line. Every URL is downloaded to
    ``/tmp``, opened with PIL as a sanity check and, on success, moved into
    the directory ``random_<ind>``. URLs that cannot be downloaded or opened
    are reported and skipped.

    Args:
        ind: Index of the experiment; used to name the output directory.
        size: Number of random line indices to draw.

    Returns:
        List of the sampled URLs (including those whose download failed).
    """
    # Local imports: `import urllib` alone does not guarantee that the
    # `request` submodule is loaded, and shutil is not imported by the file.
    import shutil
    import urllib.request

    rand_dir = 'random_' + str(ind)
    # exist_ok keeps the cell re-runnable after a previous (partial) run.
    os.makedirs(rand_dir, exist_ok=True)
    fname = '../fall11_urls.txt'

    # Count the lines once instead of once per sampled index.
    n_lines = file_len(fname)
    if n_lines == 0:
        return []

    # randint is inclusive on both ends, so the last valid index is
    # n_lines - 1. A set gives O(1) membership tests while scanning the file.
    whatlines = {randint(0, n_lines - 1) for _ in range(size)}

    with open(fname, 'rb') as images:
        # strip() removes the trailing newline that would otherwise end up
        # in the URL and in the derived file name.
        sample = [x.decode("utf-8").split('\t')[1].strip()
                  for i, x in enumerate(images) if i in whatlines]

    for item in sample:
        name = os.path.basename(urlparse(item).path)
        tmp_path = '/tmp/' + name
        try:
            urllib.request.urlretrieve(item, tmp_path)
            # Image.open raises when the payload is not a recognisable image
            # (e.g. an HTML error page); the handle is closed right away.
            with Image.open(tmp_path):
                pass
            # shutil.move also works when /tmp is on a different filesystem,
            # where os.rename raises OSError.
            shutil.move(tmp_path, './' + rand_dir + '/' + name)
        except Exception as exc:
            # Best effort: dead links are expected, so report and continue.
            # KeyboardInterrupt is deliberately not swallowed.
            print('skipping {}: {}'.format(item, exc))
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)

    return sample

In [None]:
# Create random experiments datasets:
# every iteration calls sample_imagenet with the experiment index, which
# downloads up to 100 randomly chosen images into the directory `random_<i>`.
# The returned list of URLs is not needed here and is discarded.
number_expriments = 10
for i in range(number_expriments):
    _ = sample_imagenet(i,size=100)

### Reduce dimension and plot explained variance ratio

In [None]:
class DimensionReducer():
    """Fit a PCA on flattened activations and project data onto it.

    Constructing an instance fits the PCA and, as a side effect, writes the
    fitted parameters to the file ``pca_params`` in the working directory.
    """

    def __init__(self, activations, pc_components=4):
        """Flatten ``activations``, fit the PCA and save its parameters.

        Args:
            activations: Iterable of arrays; each one is flattened to 1-D.
            pc_components: Number of principal components to keep.
        """
        self.flattened = self.flatten(activations)
        print(np.shape(self.flattened))
        self.activations = activations
        self.pca = PCA(n_components=pc_components)
        self.pca.fit(self.flattened)
        self.save_params()

    def save_params(self):
        """Store the feature-wise mean in ``self.mu`` and dump it together
        with the PCA components to the JSON file ``pca_params``."""
        self.mu = np.mean(self.flattened, axis=0)
        params = {"mean": self.mu.tolist(),
                  "components": self.pca.components_.tolist()}
        with open('pca_params', 'w') as f:
            f.write(json.dumps(params))

    def flatten(self, activations):
        """Return a list with every item of ``activations`` flattened to 1-D."""
        return [x.flatten() for x in activations]

    def get_reduced(self, activations):
        """Project ``activations`` onto the fitted components.

        The result is also kept in ``self.reduced``, which
        ``get_random_concept`` uses to determine its sampling range.
        """
        self.reduced = self.pca.transform(self.flatten(activations))
        return self.reduced

    def get_random_concept(self, n=100): # n - number of random images to draw
        """Draw ``n`` random points in the plane of the first two components
        and map them back to the flattened activation space.

        The points are sampled uniformly inside the bounding box of the most
        recent ``get_reduced`` result. If ``get_reduced`` has not been called
        yet, the projection of the training activations is used instead.

        Returns:
            Array of shape ``(n, n_features)``.
        """
        mu = np.mean(self.flattened, axis=0)

        reduced = getattr(self, 'reduced', None)
        if reduced is None:
            # Nothing has been projected yet: fall back to the data the PCA
            # was fitted on rather than failing with AttributeError.
            reduced = self.pca.transform(self.flattened)
        reduced = np.asarray(reduced)

        a, b = reduced[:, 0].min(), reduced[:, 0].max()
        a1, b1 = reduced[:, 1].min(), reduced[:, 1].max()
        random_pca1 = (b - a) * np.random.random_sample(n) + a
        random_pca2 = (b1 - a1) * np.random.random_sample(n) + a1
        random_activations = np.stack((random_pca1, random_pca2), axis=-1)

        # Only two coordinates were sampled, so only the first two component
        # vectors take part in the back-projection. Multiplying by the full
        # components_ matrix fails with a shape mismatch whenever the PCA
        # keeps a number of components other than two.
        Xhat = np.dot(random_activations, self.pca.components_[:2])
        Xhat += mu

        return Xhat

#### Get sample of activations from every class

In [None]:
dir_name = './acts_grads/'

# os.listdir returns entries in arbitrary order. A later cell zips
# acts_files with grads_files, so both lists are built from one sorted
# listing to keep their order deterministic.
dir_entries = sorted(os.listdir(dir_name))
acts_files = [dir_name + f for f in dir_entries if 'acts' in f]
grads_files = [dir_name + f for f in dir_entries if 'grads' in f]

with open('./concept-vis/data/classes.json') as f:
    classes = json.load(f)
# Map class id -> class description for the first 50 entries only.
cls = {x['id']:x['class'] for x in classes[:50]}

def id2class(x):
    """Return the first comma-separated label stored for class id ``x``.

    The id is looked up in the module-level ``cls`` mapping; an unknown id
    raises ``KeyError``.
    """
    label, _, _ = cls[x].partition(',')
    return label

def get_sample(name, k=3):
    """Draw ``k`` random items, with replacement, from an ``.npy`` file.

    The first two axes of the stored array are merged into a single one and
    the items are drawn along that merged axis.

    Args:
        name: Path of the file handed to ``np.load``.
        k: Number of items to draw; defaults to 3, the value that used to be
            hard-coded.

    Returns:
        List of ``k`` arrays, each shaped like the stored array's axes from
        index 2 onwards.
    """
    data = np.load(name)
    dim = np.shape(data)[2:]
    data = data.reshape((-1, *dim))
    return random.choices(data, k=k)

def get_sample_of_activations(acts_files):
    """Collect a random sample of activations from every file.

    For each file a few activations are drawn with ``get_sample`` and
    appended to the running sample, which is saved to ``backup.npy`` after
    every file. If ``backup.npy`` already exists when the first file is
    processed, its content is used as the starting point, so samples from an
    earlier run are kept.

    Args:
        acts_files: Paths of the ``.npy`` activation files.

    Returns:
        Array holding all collected activations, or an empty list when
        ``acts_files`` is empty.
    """
    sample = None
    for z in acts_files:
        print(z)
        draw = get_sample(z)
        # The backup only has to be read once; afterwards the in-memory
        # sample is identical to what was just written to disk.
        if sample is None and os.path.isfile('./backup.npy'):
            sample = np.load('backup.npy')

        if sample is None:
            sample = np.asarray(draw)
        else:
            sample = np.concatenate((sample, draw))

        np.save('backup', sample)

        print(np.shape(sample))

    # Hand the result back: without a return the caller only receives None.
    return [] if sample is None else sample

In [None]:
# Run the sampling pass over every activation file (this also writes
# backup.npy as a side effect) and print the shape of whatever it returned.
sample = get_sample_of_activations(acts_files)
print(np.shape(sample))

In [None]:
# Load a previously saved activation sample and fit the reducer on it,
# keeping 35 principal components.
# NOTE(review): this reads ../tcav_backup/backup.npy, while
# get_sample_of_activations writes ./backup.npy - presumably the file was
# moved by hand; confirm.
sample = np.load('../tcav_backup/backup.npy')
dr = DimensionReducer(np.array(sample),pc_components=35)

#### Plot explained variance

In [None]:
# Bar chart: one bar per principal component (numbered from 1) showing the
# share of variance it explains.
variance = dr.pca.explained_variance_ratio_
component_numbers = range(1, 1 + len(variance))

fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(component_numbers, variance)
ax.set_xlabel('principal component nr')
ax.set_ylabel("explained variance ratio")
plt.show()

In [None]:
# Histogram of the explained variance ratios of the fitted components.
# Note that the larger font size set here stays active for later figures.
matplotlib.rcParams.update({'font.size': 22})
plt.figure(figsize=(10,5))
# explained_variance_ratio_ holds fractions in [0, 1]; scale them to percent
# so that the plotted values match the "[%]" unit on the axis label.
plt.hist(np.asarray(variance) * 100)
plt.xlabel("Explained variance ratio [%]")
plt.ylabel("Count")
plt.show()

#### Reduce dimension of whole dataset and save components as a points.js file

In [None]:
# For every class known to `cls`, project the activations and gradients onto
# the fitted principal components and write one JSON file of points per
# activation file into ../points/.

# The per-class JSON files are written here; make sure the directory exists.
os.makedirs("../points/", exist_ok=True)

# NOTE(review): the i-th gradient file is assumed to belong to the i-th
# activation file - verify that both lists are ordered consistently.
for batch_of_acts, batch_of_grads in zip(acts_files, grads_files):
    acts_basename = batch_of_acts.split('/')[-1]
    # The file name prefix before the first underscore is the class id.
    folder = acts_basename.split('_')[0]
    if folder not in cls:
        continue

    with open(dir_name + folder + '_names') as f:
        names = [x.strip() for x in f]

    paths = ['https://0.0.0.0:8009/images/'+ folder + '/' + x for x in names]

    # Merge the first two axes so that every row is a single activation.
    data = np.load(batch_of_acts)
    reduced = dr.get_reduced(data.reshape((-1, *np.shape(data)[2:])))

    # NOTE(review): gradients go through the same PCA transform as the
    # activations - confirm this is the intended projection.
    data_grad = np.load(batch_of_grads)
    reduced_grad = dr.get_reduced(data_grad.reshape((-1, *np.shape(data_grad)[2:])))

    # tolist() turns NumPy scalars into plain Python floats; json cannot
    # serialise NumPy scalars of other dtypes such as float32.
    # zip stops at the shortest of the three sequences.
    points = [
        {
            'coords': coords.tolist(),
            'id': folder + '_' + str(i),
            'patch_path': path,
            'class': id2class(folder),
            'class_id': folder,
            'gradient': gradient.tolist(),
        }
        for i, (path, coords, gradient) in enumerate(zip(paths, reduced, reduced_grad))
    ]
    with open("../points/" + acts_basename + ".json",'w') as f:
        json.dump(points, f)

        
# Merge all per-class JSON files into one JavaScript file that defines the
# global `points_data`.
json_dir = '../points/'
data = []
# os.listdir order is arbitrary; sorting makes points.js reproducible.
for x in sorted(os.listdir(json_dir)):
    name_parts = x.split('_')
    # Only files named like "n<id>_sorted..." are merged. Names without an
    # underscore are skipped instead of raising IndexError.
    if len(name_parts) < 2:
        continue
    if not name_parts[0].startswith('n') or 'sorted' not in name_parts[1]:
        continue

    with open(json_dir + x) as f:
        data.extend(json.load(f))

with open("../concept-vis/data/points.js",'w') as f:
    # The cells that read this file back slice off the
    # "var points_data = " prefix before parsing the JSON payload.
    f.write("var points_data = {}".format(json.dumps(data)))

#### Create heatmap.js file

In [None]:
# Read points.js back, print the coordinate and gradient ranges as JavaScript
# variable declarations, and write a constant-valued background heatmap.

# points.js starts with this JavaScript assignment; the rest is plain JSON.
POINTS_JS_PREFIX = "var points_data = "

with open("data/points.js") as f:
    data = f.read()[len(POINTS_JS_PREFIX):]

points = json.loads(data)

pc0 = [x['coords'][0] for x in points]
pc1 = [x['coords'][1] for x in points]

g0 = [x['gradient'][0] for x in points]
g1 = [x['gradient'][1] for x in points]
print("var min_g0 = {};".format(min(g0)))
print("var min_g1 = {};".format(min(g1)))
print("var max_g0 = {};".format(max(g0)))
print("var max_g1 = {};".format(max(g1)))

min_x, max_x = min(pc0), max(pc0)
min_y, max_y = min(pc1), max(pc1)
print("var min_x = {};".format(min_x))
print("var max_x = {};".format(max_x))
print("var min_y = {};".format(min_y))
print("var max_y = {};".format(max_y))

# simulate data for the background heatmap: a 100 x 100 grid over the
# bounding box of the points, every cell carrying the same placeholder error.
# The y axis has to start at the minimum of the second component (pc1), not
# at the minimum of the first one.
error_heatmap = [
    {"coords": [x, y], "error": 0.7}
    for x in np.linspace(min_x, max_x, 100).tolist()
    for y in np.linspace(min_y, max_y, 100).tolist()
]

with open("data/heatmap.js", "w") as f:
    f.write("var hm_data = {}".format(json.dumps(error_heatmap)))

#### Create classes_short.js (list of classes from data)

In [None]:
# Build classes_short.js: the distinct classes that occur in points.js, in
# order of first appearance, each with the class id of its first occurrence.
with open('./data/points.js') as f:
    # Skip the leading JavaScript assignment (18 characters) to reach the JSON.
    data = json.loads(f.read()[18:])

first_id_by_class = {}
for x in data:
    if x['class'] not in first_id_by_class:
        first_id_by_class[x['class']] = x['class_id']

names = set(first_id_by_class)
classes = [{'class': class_name, "id": class_id}
           for class_name, class_id in first_id_by_class.items()]

with open('classes_short.js', 'w') as f:
    f.write('var classes_short =' + json.dumps(classes))