Generate encoded vectors for both query and artist aggregate images

In [1]:
import read_rijksdata
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa 

import matplotlib.pylab as plt
import matplotlib as mpl
MIN_NUM_ARTWORK = 500


 The versions of TensorFlow you are currently using is 2.4.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# LOAD IMAGES AND LABELS
# NOTE: point img_folder at your own local copy of the image directory.
img_folder = '/Users/erebor/Downloads/out_img'

# Arguments for the loader: the minimum-artwork threshold plus the image
# folder and the label / name files.
load_kwargs = dict(MIN_NUM_ARTWORK=MIN_NUM_ARTWORK,
                   img_folder=img_folder,
                   labels_file='labels.txt',
                   names_file='names.txt')

# The loader hands back four objects: images, one-hot labels, labels and names.
images, labels_onehot, labels, names = read_rijksdata.load_data(**load_kwargs)

  names = pd.read_csv(names_file,delimiter = '/t',header=None)


 |███████████████████████████████████████-| 112038/112039 

Dataset loaded!
images shape:	 (19007, 56, 56, 3)
labels shape:	 (19007,)
labels (one-hot): (19007, 21)
names shape:	 (19007, 1)


In [3]:
# Number of distinct artists present in the label array
classes = len(set(labels))
print('\n# of unique artists:',classes)

# Artworks per artist; report the smallest count next to the requested minimum
counts = pd.DataFrame(labels).value_counts()
print('Min # of artworks for all artists:',counts.min())
print('Min # of artworks specified:',MIN_NUM_ARTWORK)


# of unique artists: 21
Min # of artworks for all artists: 517
Min # of artworks specified: 500


In [4]:
# LOAD PRE-TRAINED ENCODER
# Start from a pre-trained backbone; other candidates are listed at
# https://www.tensorflow.org/api_docs/python/tf/keras/applications
# (e.g. tf.keras.applications.efficientnet.EfficientNetB7).

# --- hyperparameters ---
# image size fed to the network
input_shape = (56,56,3)
# D is the number of dimensions of the encoded vector ("D" in Mark's email).
# For now it is just the number of unique artists; we will probably want to
# adjust it to be smaller or larger depending on training results.
D = 21

base_kwargs = dict(include_top=False,
                   weights='imagenet',
                   input_shape=input_shape,
                   pooling=None,
                   classes=D)
base = tf.keras.applications.vgg19.VGG19(**base_kwargs)

# NOTE: the backbone is left trainable, so its ImageNet weights are updated
# during training together with the new top layers.
base.trainable = True

# Stack: pre-trained base -> global max pooling -> dense layer of size D.
# (A Dropout layer between the pooling and dense layers is an optional extra.)
encoder = tf.keras.models.Sequential([
    base,
    tf.keras.layers.GlobalMaxPooling2D(),
    tf.keras.layers.Dense(D, activation="sigmoid"),  # last (top) layer of network
])

In [5]:
# Print the layer-by-layer architecture and parameter counts of the encoder
encoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Functional)           (None, 1, 1, 512)         20024384  
_________________________________________________________________
global_max_pooling2d (Global (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 21)                10773     
Total params: 20,035,157
Trainable params: 20,035,157
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Loss: categorical cross-entropy on the one-hot labels; the model output is
# treated as probabilities rather than logits.
loss = tf.keras.losses.CategoricalCrossentropy(name='categorical_crossentropy',
                                               label_smoothing=0.0,
                                               from_logits=False)

# Metrics: plain accuracy, top-k accuracy for several k, and an F1 score
TopKs = [tf.keras.metrics.TopKCategoricalAccuracy(k=k, name='top_{}'.format(k))
         for k in (1,5,10,20)]
f1 = tfa.metrics.F1Score(num_classes=classes, threshold=0.5)
metrics = ["acc", *TopKs, f1]

# Optimizer: Adam with very average settings
# (RMSprop with the same learning rate is a possible alternative).
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile it all
encoder.compile(optimizer=optimizer,
                loss=loss,
                metrics=metrics)

In [None]:
# Train the encoder for 10 epochs, holding out 20% of the samples for validation.
# NOTE(review): Keras takes the validation split from the *last* samples of x/y,
# before shuffling. Confirm the images are not ordered by artist, otherwise
# the validation set will not cover every class.
history = encoder.fit(x=images,y=labels_onehot,validation_split=.20, epochs=10,batch_size=1000)

Epoch 1/10
Epoch 2/10

In [None]:
# Run every image through the encoder to get one encoded vector per artwork
encoded_arts = encoder.predict(images,verbose=1)

In [None]:
# Display the encoded vectors as a quick sanity check
encoded_arts

In [None]:
# Count how many pieces each artist has and look up each artist's name.
# np.unique gives the sorted unique labels, the index of each label's first
# occurrence in `labels`, and how many times each label appears.
unique_labels, first_idx, artcounts = np.unique(labels, return_index=True, return_counts=True)
total_bc = np.bincount(labels) # artwork count indexed by label value
# `names` holds one row per image (same length as `labels`), so an artist's name
# has to be read at an image index that carries that artist's label, not at
# the label value itself. The first occurrence of each label is used here.
artistnames = np.asarray(names)[first_idx]

In [None]:
def plot_images(images,artistname):
    """Plot a 5x5 grid of up to 25 artworks for one artist, save it and show it.

    Parameters
    ----------
    images : array-like indexed as ``images[i, :, :, :]``
        Artworks to preview; the first 25 (or fewer, if less are given) are drawn.
    artistname : indexable
        Formatted as a whole into the figure title; its first element must be
        the artist's name string, which is used to build the output filename.

    The figure is written to ``figs/samples/artist_<name>.png`` with commas and
    spaces in the name replaced by hyphens. The directory is created if missing.
    """
    import os

    # plot a selection of 25 (5x5) artwork
    fig, axes = plt.subplots(figsize=(10,10),nrows=5,ncols=5)
    fig.patch.set_facecolor('white')

    n_images = len(images)
    for i, ax in enumerate(axes.reshape(-1)):
        ax.set_xticks([])
        ax.set_yticks([])
        if i < n_images:
            ax.imshow(images[i,:,:,:])
        else:
            # fewer than 25 artworks: leave the remaining panels blank
            ax.axis('off')
    plt.suptitle('Artist: {}'.format(artistname),fontsize=15)
    fig.subplots_adjust(top=0.9)

    # savefig does not create missing directories, so make sure the target exists
    out_dir = 'figs/samples'
    os.makedirs(out_dir, exist_ok=True)
    safe_name = artistname[0].replace(',','-').replace(' ','-')
    plt.savefig('{}/artist_{}.png'.format(out_dir, safe_name),dpi=200, bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
# Half of each artist's artwork count (integer division). Computed for all
# artists at once so the cell does not depend on a loop index from another cell.
artcounts//2

In [None]:
# Create aggregate vectors: one mean encoding per artist
unique_artists = np.unique(labels)  # sorted label ids, one entry per artist

# For each artist position, collect the indices of that artist's artworks and
# average their encoded vectors (the artist's aggregate vector).
# Tip: to preview an artist's images, call
#   plot_images(images[np.where(labels == unique_artists[pos])], artistnames[pos])
aggregate_vectors = np.array([
    np.mean(encoded_arts[np.where(labels == unique_artists[pos])], axis=0)
    for pos in range(len(artcounts))
])

In [None]:
# Overlay, for every artist, the histogram of that artist's aggregate-vector
# components using shared bins of width 0.05.
bins = np.arange(0,1.1,0.05)
plt.figure()
for vector in aggregate_vectors:
    plt.hist(vector, histtype='bar', alpha=0.25, bins=bins)
plt.xlim(-0.05,1.05)
plt.show()
plt.close()

In [None]:
def plot_aggregates(aggregate_vectors,artistnames,n=3):
    """Show an n x n grid of randomly chosen aggregate vectors as colour strips.

    Parameters
    ----------
    aggregate_vectors : array whose first axis indexes artists
        Each row is drawn as a one-row image with colours normalised to [0, 1].
    artistnames : array indexable by an integer index array
        ``artistnames[idxs][k][0]`` is used as the x-label of panel ``k``.
    n : int, default 3
        Grid size; ``n*n`` artists are drawn.

    The figure is saved to ``figs/aggregrates_sample_trained.png`` (the
    directory is created if missing) and then shown.
    """
    import os

    n_artists = aggregate_vectors.shape[0]
    n_panels = n*n
    # Pick distinct artists whenever there are enough of them; only fall back to
    # sampling with replacement when the grid is larger than the artist pool.
    idxs = np.random.choice(a=n_artists, size=n_panels, replace=n_panels > n_artists)
    vectors = aggregate_vectors[idxs]
    artists = artistnames[idxs]

    # the same fixed [0, 1] colour scale is used for every panel
    norm = mpl.colors.Normalize(vmin=0, vmax=1)

    fig, axes = plt.subplots(nrows=n,ncols=n,figsize=(9,5))
    for ax_idx, ax in enumerate(fig.axes):
        ax.imshow(np.atleast_2d(vectors[ax_idx]), aspect=7, cmap='rainbow', interpolation=None,norm=norm)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel(artists[ax_idx][0])

    # tight_layout is a Figure method, not a savefig argument
    fig.tight_layout()
    os.makedirs('figs', exist_ok=True)
    plt.savefig('figs/aggregrates_sample_trained.png',dpi=200)
    plt.show()
    plt.close()

In [None]:
# Preview a 4x4 grid of randomly selected artist aggregate vectors
plot_aggregates(aggregate_vectors,artistnames,n=4)

In [None]:
# Query Image Removal Function
def query_image_remover(qi_vec, avg_vec, artnum):
    """Return the mean vector with the query image's contribution removed.

    If ``avg_vec`` is the mean of ``artnum`` vectors, one of which is ``qi_vec``,
    the mean of the remaining ``artnum - 1`` vectors is
    ``(avg_vec - qi_vec / artnum) * artnum / (artnum - 1)``.

    Parameters
    ----------
    qi_vec : encoded vector of the query image.
    avg_vec : aggregate (mean) vector that includes the query image.
    artnum : scalar number of vectors that were averaged into ``avg_vec``.

    Returns
    -------
    The result of ``tf.math.subtract`` scaled by ``artnum / (artnum - 1)``.

    Raises
    ------
    ValueError
        If ``artnum`` is 1 or less: nothing would remain after removing the
        query image, so the leave-one-out mean is undefined.
    """
    if artnum <= 1:
        raise ValueError(
            'artnum must be greater than 1 to remove a query image, got {}'.format(artnum))

    query_share = qi_vec * (1/artnum)   # the query image's share of the mean
    rescale = artnum/(artnum-1)         # re-normalise from artnum to artnum-1 items
    new_vec = tf.math.subtract(avg_vec, query_share) * rescale
    return new_vec