In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from config import *
import os
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from keras.applications import resnet
import imageio as io
import pandas as pd
from keras.models import Model, load_model
import numpy as np

In [3]:
# Read one sample test image to learn the array shape; its first two
# dimensions are reused below as the target size of every generator.
sample_img = f"{data_path}/test/test/10041_1.jpeg"
input_shape = io.imread(sample_img).shape

# Shared image generator; the only transformation it applies is the ResNet
# input preprocessing function.
datagen = ImageDataGenerator(preprocessing_function=resnet.preprocess_input)

# Root directory of the test split.
test_dir = f"{data_path}/test"
    
# Generator that reads three image directories in lock-step.
def multi_input_data_gen(X1_dir, X2_dir, X3_dir, batch_size, seed):
    """Endlessly yield one batch from each of three image directories.

    Every directory is read through the module-level ``datagen`` without
    shuffling, with images resized to the first two dimensions of the
    module-level ``input_shape``.

    Parameters
    ----------
    X1_dir, X2_dir, X3_dir : str
        Directories handed to ``flow_from_directory``.
    batch_size : int
        Batch size used for all three directory iterators.
    seed : int
        Seed forwarded to all three directory iterators.

    Yields
    ------
    list
        ``[batch_1, batch_2, batch_3]`` where each item is element ``0`` of
        the batch returned by the corresponding iterator.
    """
    target_size = (input_shape[0], input_shape[1])
    streams = [
        datagen.flow_from_directory(directory, target_size,
                                    batch_size=batch_size, seed=seed, shuffle=False)
        for directory in (X1_dir, X2_dir, X3_dir)
    ]

    while True:
        # Advance the three iterators in order and keep only element 0 of each batch.
        yield [stream.next()[0] for stream in streams]
                

# Unshuffled, label-free stream over the test split, one image per batch, so
# the prediction order follows `test_data_gen.filenames`.
test_data_gen = datagen.flow_from_directory(
    directory=f"{data_path}/test",
    target_size=input_shape[:2],
    batch_size=1,
    class_mode=None,
    shuffle=False,
    seed=seed,
)
def number_of_data(data_path, dataset):
    """Count the classes and files of one dataset split on disk.

    For every split other than ``"test"`` the sub-directories are read from
    ``<data_path>/<dataset>/anchor``; for ``"test"`` they are read from
    ``<data_path>/<dataset>``.  Each sub-directory is listed one level deep.
    Entries named ``.ipynb_checkpoints`` and non-directory entries of the
    split directory are ignored.

    Parameters
    ----------
    data_path : str
        Root directory that contains the dataset splits.
    dataset : str
        Name of the split, e.g. ``"test"``.

    Returns
    -------
    tuple of (int, int)
        ``(class_num, img_number)``.  For ``"test"``, ``class_num`` is the
        number of distinct 5-character file-name prefixes found across all
        sub-directories; for any other split it is the number of
        sub-directories.  ``img_number`` is the total number of entries found
        inside the sub-directories.  An empty split yields ``(0, 0)``.
    """
    checkpoint_dir = ".ipynb_checkpoints"

    if dataset != "test":
        dataset_path = data_path + "/" + dataset + "/anchor"
    else:
        dataset_path = data_path + "/" + dataset

    # Filter by name instead of assuming the checkpoint folder sorts first, and
    # skip stray files so os.listdir below is only called on directories.
    class_dirs = sorted(
        name for name in os.listdir(dataset_path)
        if name != checkpoint_dir and os.path.isdir(os.path.join(dataset_path, name))
    )

    img_number = 0
    test_classes = set()
    for class_dir in class_dirs:
        files = [
            name for name in os.listdir(os.path.join(dataset_path, class_dir))
            if name != checkpoint_dir
        ]
        img_number += len(files)
        # Accumulate prefixes over all sub-directories rather than keeping only
        # the count of the last one visited.
        test_classes.update(name[:5] for name in files)

    class_num = len(test_classes) if dataset == "test" else len(class_dirs)
    return class_num, img_number

# Count the files of the test split; the class count is not needed here.
# A named throwaway is used because assigning to `_` stops IPython from
# updating its last-output shortcut for the rest of the session.
_n_test_classes, n_test_files = number_of_data(data_path, "test")

# One predict() step per batch.  The batch size is read from the generator
# instead of being repeated as a literal, and ceil division keeps a final
# partial batch; with the current batch size of 1 this equals the file count.
test_step_size = -(-n_test_files // test_data_gen.batch_size)

# Rewind the generator so prediction starts from its first batch.
test_data_gen.reset()


Found 10600 images belonging to 1 classes.


In [4]:
# Load the saved model stored as "<names>.h5" under the model directory.
base_model = load_model(f"{model_save_path}/{names}.h5")

# Run the model over the test generator; each output row is labelled with
# the corresponding generator file name.
embedd_test_data = pd.DataFrame(
    base_model.predict(test_data_gen, steps=test_step_size, verbose=1),
    index=test_data_gen.filenames,
)



In [5]:
# pass through the reference data to the model and write their embedding into a file
# Reference class ids: the first five characters of every non-hidden file
# name in the reference directory, de-duplicated.
# Sorted into a list so the class order (and with it the column order of the
# saved embeddings) is the same on every run; a bare set of strings iterates
# in an order that can change between interpreter sessions.
img_list = sorted({img[:5] for img in os.listdir(reference_data_path) if not img.startswith(".")})
    
def image_preprocess(path, target_size=(1024, 768)):
    """Load one image file as a single-item RGB batch.

    Parameters
    ----------
    path : str
        Image file to load.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to, using nearest-neighbour
        interpolation.  Defaults to ``(1024, 768)``.

    Returns
    -------
    numpy.ndarray
        The image array with a leading batch axis of length 1, i.e.
        ``(1, height, width, 3)`` under the default channels-last format.
    """
    # `grayscale=False` is omitted: it is redundant with color_mode="rgb" and
    # newer Keras releases no longer accept that argument.
    img = load_img(path, color_mode="rgb", target_size=target_size, interpolation="nearest")
    # Prepend the batch axis instead of reshaping to hard-coded dimensions.
    return np.expand_dims(img_to_array(img), axis=0)

# Freeze the iteration order once so the embedding rows and their labels are
# guaranteed to line up, whatever container type `img_list` is.
class_ids = list(img_list)

# Embed the reference image "<class id>.jpg" of every class with the model.
emb = [
    base_model(
        resnet.preprocess_input(
            # os.path.join is correct whether or not reference_data_path
            # ends with a path separator.
            image_preprocess(os.path.join(reference_data_path, class_id + ".jpg"))
        )
    ).numpy().reshape(-1)  # flatten the single-image output without hard-coding its width
    for class_id in class_ids
]

# Transpose so there is one column per reference class, then persist.
embedding = pd.DataFrame(emb, index=class_ids).T
embedding.to_csv(reference_embedding_save_path)


In [6]:
# calculate the cosine similarity between each testing data & each reference embedding
from sklearn.metrics import top_k_accuracy_score
def cal_cos_sim(embedd_test_data, embedd_database):
    """Cosine similarity of every test embedding against every reference embedding.

    Parameters
    ----------
    embedd_test_data : pandas.DataFrame
        One row per test image.  The index holds the image path in the form
        ``<dataset><sep><class>_<form>.<ext>`` where ``<sep>`` may be a forward
        slash or a backslash; the columns are the embedding dimensions.
    embedd_database : pandas.DataFrame
        One column per reference class, one row per embedding dimension.  Its
        row labels must match the column labels of ``embedd_test_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name (index name ``"Filename"``).  Columns are
        ``"Classes"`` (class id taken from the file name, i.e. the text before
        the first underscore), ``"Classes_index"`` (position of that class
        among the reference columns) and then one similarity column per
        reference class, in the column order of ``embedd_database``.

    Raises
    ------
    ValueError
        If a test image belongs to a class that has no reference column.
    """
    dot_products = embedd_test_data @ embedd_database
    test_norms = np.sqrt((embedd_test_data * embedd_test_data).sum(axis=1))
    reference_norms = np.sqrt((embedd_database * embedd_database).sum())

    # Rows of the dot product follow the test index and columns follow the
    # reference columns, so a positional outer product of the norms lines up.
    cos_sim = dot_products / np.outer(test_norms, reference_norms)
    class_columns = cos_sim.columns.tolist()

    # Normalise Windows separators first so paths produced on any OS work, then
    # keep the last path component as the file name.
    paths = cos_sim.index.to_series().astype(str).str.replace("\\", "/", regex=False)
    filenames = paths.str.rsplit("/", n=1).str[-1]
    # `n` is passed by keyword: pandas 2.x rejects it as a positional argument.
    classes = filenames.str.split("_", n=1).str[0]

    # Build the label -> column position lookup once (first occurrence wins)
    # instead of rebuilding and scanning a list for every row.
    column_position = {}
    for position, label in enumerate(class_columns):
        column_position.setdefault(label, position)

    missing = classes[~classes.isin(list(column_position))].unique().tolist()
    if missing:
        raise ValueError(
            "No reference embedding for test class(es): %s" % ", ".join(map(str, missing))
        )

    cos_sim.insert(0, "Classes_index", classes.map(column_position).values)
    cos_sim.insert(0, "Classes", classes.values)
    cos_sim.index = pd.Index(filenames.values, name="Filename")
    return cos_sim
    
# Cosine similarity of every test embedding against every reference class
# embedding, together with each test image's class label and class index.
cos_sim = cal_cos_sim(embedd_test_data, embedding)

In [7]:
#save_path = result_save_path + "/" + model_name + ".csv"
#cos_sim.to_csv(save_path)

In [8]:
# Top-k accuracy based on the cosine similarity: a test image counts as
# correct when its true class is among the k highest-scoring reference classes.
y_true = cos_sim["Classes_index"].values
# Columns 0-1 are "Classes" / "Classes_index"; the rest are per-class scores.
y_score = cos_sim.iloc[:, 2:].values
# Pass the full label set explicitly; otherwise scikit-learn infers it from
# y_true and raises when some reference class has no test image.
all_labels = np.arange(y_score.shape[1])

Top1Acc, Top3Acc, Top5Acc = [
    top_k_accuracy_score(y_true, y_score, k=k, labels=all_labels) for k in (1, 3, 5)
]

print("Top1 Accuracy : %6.3f, Top3 Accuracy: %6.3f, Top5 Accuracy: %6.3f." % (Top1Acc, Top3Acc, Top5Acc))

Top1 Accuracy :  0.394, Top3 Accuracy:  0.643, Top5 Accuracy:  0.778.


In [None]:
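# Top-k accuracies recorded from earlier runs, kept as comments so this cell
# does not raise when executed. The number in parentheses is presumably the
# setting that was varied between runs; its name is not recorded here (TODO confirm).
# Top1 Accuracy :  0.334, Top3 Accuracy:  0.565, Top5 Accuracy:  0.711. (0.5)
# Top1 Accuracy :  0.392, Top3 Accuracy:  0.642, Top5 Accuracy:  0.765. (1)
# Top1 Accuracy :  0.394, Top3 Accuracy:  0.643, Top5 Accuracy:  0.778. (2)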