Generate encoded vectors for both query and artist aggregrate images

In [1]:
import dataset
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa 

import matplotlib.pylab as plt
import matplotlib as mpl
MIN_NUM_ARTWORK = 500


ModuleNotFoundError: No module named 'read_rijksdata'

In [None]:
# LOAD IMAGE AND LABELS HERE
# replace for your path here!
img_folder = '/Users/erebor/Downloads/out_img'

images, labels_onehot, labels, names, = dataset.load_data(MIN_NUM_ARTWORK=MIN_NUM_ARTWORK,
                                                 img_folder = img_folder,
                                                 labels_file ='labels.txt',
                                                 names_file = 'names.txt')

In [None]:
classes = len(list(set(labels)))
print('\n# of unique artists:',classes)

counts = pd.DataFrame(labels).value_counts()
print('Min # of artworks for all artists:',min(counts))
print('Min # of artworks specified:',MIN_NUM_ARTWORK)

In [None]:
# LOAD PRE-TRAINED ENCODER
# get base pre-trained model first
# more models are available here: https://www.tensorflow.org/api_docs/python/tf/keras/applications

# define hyperparameters
# define image size 
input_shape = (56,56,3)
D = 50
# define number of classes
# ****THIS sets the number of dimensions of the encoded vector ("D") in Mark's email***
# we'll probably want adjust this to be smaller or larger (depending on training results)
# for now, classes are just the number of unique artist
base_kwargs = {'include_top':False,
               'weights':'imagenet',
               'input_shape':input_shape,
               'pooling':None,
               'classes':D}
#enet_base = tf.keras.applications.efficientnet.EfficientNetB7(**enet_kwargs)
base = tf.keras.applications.vgg19.VGG19(**base_kwargs)

# set that the encoder DOES NOT train on the images
base.trainable = True

# set pre-trained model as base
encoder = tf.keras.models.Sequential()
encoder.add(base)

# add two final top layers
#encoder.add(tf.keras.layers.GlobalMaxPooling2D())
encoder.add(tf.keras.layers.GlobalMaxPooling2D())
#encoder.add(tf.keras.layers.Dropout(rate=0.01))

encoder.add(tf.keras.layers.Dense(D, activation="sigmoid")) # last (top) layer of network

In [None]:
encoder.summary()

In [None]:
# Create encoded tensors for all 
vectors = encoder.predict(images,verbose=1)

In [None]:
vectors

In [None]:
def plot_images(images,artistname):

    # plot a selection of 25 (5x5) artwork
    fig, axes = plt.subplots(figsize=(10,10),nrows=5,ncols=5)
    fig.patch.set_facecolor('white')

    i = 0 
    for ax in axes.reshape(-1): 
        ax.imshow(images[i,:,:,:])
        ax.set_xticks([]),ax.set_yticks([])
        i +=1
    plt.suptitle('Artist: {}'.format(artistname),fontsize=15)
    fig.subplots_adjust(top=0.9)
    plt.savefig('figs/samples/artist_{}.png'.format(artistname[0].replace(',','-').replace(' ','-')),dpi=200, bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
def get_aggregrate_vectors(vectors,labels):
    # Create aggregate vectors
    # Count how many pieces each artist has
    total_bc = np.bincount(labels) # get count of artists
    artcounts = total_bc[np.unique(labels)] # get count of artworks for each unique artist
    artistnames = names[np.unique(labels)] # get the name for each unique artist

    aggregate_vectors = []
    for i in range(len(artcounts)):
        artistnum = np.unique(labels)[i] #Gets the number that represents this artist from labels
        artistname = artistnames[i]
        artcount = artcounts[i] #Gets number of art pieces by this artist

        neg_idx = np.where(labels != artistnum) 
        pos_idx = np.where(labels == artistnum)
        artist_vector = np.mean(vectors[pos_idx],axis=0)

        query_images = images[neg_idx]
        query_vectors = vectors[neg_idx]

        aggregate_vectors.append(artist_vector)

        #x = np.expand_dims(vectors[pos_idx].T,axis=2)
        #artist_vector = pool_layer(x).numpy()
        #aggregate_vectors.append(artist_vector.reshape(50,))
    aggregate_vectors = np.array(aggregate_vectors)
    
    return aggregate_vectors, artistnames

In [None]:
def plot_aggregates(aggregate_vectors,artistnames,n=3):
    idx = list(range(aggregate_vectors.shape[0]))
    idxs = np.random.choice(a=idx,size=n*n)
    vectors = aggregate_vectors[idxs]
    artists = artistnames[idxs]
    
    fig, axes = plt.subplots(nrows=n,ncols=n,figsize=(9,5))
    for ax_idx, ax in enumerate(fig.axes):
        norm = mpl.colors.Normalize(vmin=0, vmax=1)
        ax.imshow(np.atleast_2d(vectors[ax_idx]), aspect=7, cmap='rainbow', interpolation=None,norm=norm)
        ax.set_xticks([]),ax.set_yticks([])
        ax.set_xlabel(artists[ax_idx][0])
    plt.savefig('figs/aggregrates_sample_trained.png',dpi=200,tight_layout=True)
    plt.show()
    plt.close()

In [None]:
def plot_aggregate_dist(aggregate_vectors):
    plt.figure()
    bins = np.arange(0,1.1,0.05)
    for vector in aggregate_vectors:
        plt.hist(vector,bins=bins,alpha=0.25,histtype='bar')
    plt.xlim(-0.05,1.05)
    plt.show()
    plt.title('Aggregrate Vectors')
    plt.close()

In [None]:
aggregate_vectors, artistnames = get_aggregrate_vectors(vectors=vectors,labels=labels)
plot_aggregates(aggregate_vectors,artistnames,n=4)
plot_aggregate_dist(aggregate_vectors)

In [None]:
# Query Image Removal Function
def query_image_remover(qi_vec, avg_vec, artnum):
    new_vec = (avg_vec - (qi_vec * (1/artnum))) * (artnum/(artnum-1))
    return new_vec

In [None]:
print(vectors)

In [65]:
NUM_EXAMPLE = 20
train_val_split=0.8
dev_split = 0.1

total_bc = np.bincount(labels) # get count of artists
artcounts = total_bc[np.unique(labels)] # get count of artworks for each unique artist
artistnames = names[np.unique(labels)] # get the name for each unique artist

aggregate_vectors = []
artistnums = []
# iterate through each artist
for i in range(len(artcounts)):
    artistnum = np.unique(labels)[i] # Gets the number that represents this artist from labels
    artistnums.append(artistnum)
    
    artistname = artistnames[i] # Gets artist name as string
    artcount = artcounts[i] # Gets number of art pieces by this artist

    # get indicies of artist's artwork
    pos_idx = np.where(labels == artistnum)

    aggregate_vector = np.mean(vectors[pos_idx],axis=0)

    # store aggregrate vectors for each artist
    aggregate_vectors.append(aggregate_vector)


aggregate_vectors = np.array(aggregate_vectors)

# Generate pairs
total_pairs = np.zeros(shape=(len(labels)*2,2))
total_labels = np.zeros(shape=(len(labels)*2,1))

j = 0
for i in range(len(artcounts)):
    artistnum = np.unique(labels)[i]
    artcount = artcounts[i]

    #Retreiving negative and positive indices
    pos_idx = np.where(labels == artistnum)[0]
    neg_idx = np.where(labels != artistnum)[0]


    #Adding Positive Pairs for a given artist
    for idx in pos_idx:
        #print('j:',j)
        #print('total pairs:', total_pairs)
        #print('idx:',idx)
        #print('artistnum:',artistnum)
        total_pairs[j,:] = [idx,artistnum]
        total_labels[j] = 1
        j = j + 1

    #Adding Negative Pairs for a given artist
    neg_selec = np.random.choice(neg_idx,artcount,replace=False)
    for idx in neg_selec:
        total_pairs[j,:] = [idx,artistnum]
        total_labels[j] = 0
        j = j + 1

# Create and Test Split Order
goodbalance = False
spltest_labels = labels

while goodbalance == False:

    #Shuffle data
    mixer = np.arange(len(labels))
    np.random.shuffle(mixer)
    total_pairs = total_pairs[mixer,:]
    total_labels = total_labels[mixer]

    #Make Cutoffs
    train_cutoff = int(len(total_pairs) * train_val_split)
    dev_cutoff = int(len(total_pairs) * (train_val_split + dev_split))

    #Test Splits for Balance
    spltest_labels = spltest_labels[mixer]
    train_spltest = spltest_labels[:train_cutoff]
    dev_spltest = spltest_labels[train_cutoff:dev_cutoff]
    val_spltest = spltest_labels[dev_cutoff:]

    train_bc = np.bincount(train_spltest)
    dev_bc = np.bincount(dev_spltest)
    val_bc = np.bincount(val_spltest)

    check_bool = np.array([],dtype=bool)

    for i in np.unique(labels):
        if (len(np.unique(spltest_labels)) == len(np.unique(train_spltest))) and (len(np.unique(spltest_labels)) == len(np.unique(dev_spltest))) and (len(np.unique(spltest_labels)) == len(np.unique(train_spltest))):
            train_check = abs(total_bc[i]*0.8 - train_bc[i]) >= total_bc[i]*0.2 
            dev_check = abs(total_bc[i]*0.1 - dev_bc[i]) >= total_bc[i]*0.08
            val_check = abs(total_bc[i]*0.1 - val_bc[i]) >= total_bc[i]*0.08 
            check_bool = np.append(check_bool,(train_check or dev_check or val_check))

        elif total_bc[i] != 0:
            check_bool = np.append(check_bool,False)

    if sum(check_bool) <= 0:
        goodbalance = True

# Turn Pairs of Indices into Pairs of Vectors
final_pairs = []
for total_pair in enumerate(total_pairs):
    
    # get query vector index
    vector_idx = int(total_pair[0])
    
    # get aggregrate index  
    aggregrate_idx = np.where(artistnums==total_pair[-1])[0][0]

    # get vectors
    aggregrate_vector = aggregate_vectors[aggregrate_idx]
    query_vector = vectors[vector_idx]

    # save to array
    final_pair = np.array([aggregrate_vector,query_vector]).T
    final_pairs.append(final_pair)

final_pairs = np.array(final_pairs)
print(final_pairs.shape)
# set aggregrate and query image pairs and labels into different
train_pairs = final_pairs[:train_cutoff,:,:]
train_labels = total_labels[:train_cutoff]

dev_pairs = final_pairs[train_cutoff:dev_cutoff,:,:]
dev_labels = total_labels[train_cutoff:dev_cutoff]

val_pairs = final_pairs[dev_cutoff:,:,:]
val_labels = total_labels[dev_cutoff:]

(19007, 50, 2)


In [74]:
np.unique(labels)

array([ 631,  685,  912,  993, 1244, 1981, 2107, 2193, 2272, 2724, 2784,
       2819, 3537, 3718, 3849, 3981, 4682, 4766, 5004, 5247, 6217])

In [75]:
np.unique(total_pairs[:,1])

array([ 631.,  685.,  912.,  993., 1244., 1981., 2107., 2193., 2272.,
       2724., 2784., 2819.])