In [24]:
#
# This code is for converting images to embeddings, using pretrained and trained
# models
#

from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
from keras import Model
import numpy as np


from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
!ls 
!ls datasets/

datasets  drive  extracted.txt	feature-dump.npz  sample_data  true-classes.npz
caltech256  caltech256.py  datautils.py  __init__.py


In [0]:
!tar -xvf drive/My\ Drive/256_ObjectCategories.tar > extracted.txt

In [7]:
!mkdir -p datasets
!cp drive/My\ Drive/__init__.py datasets/
!cp drive/My\ Drive/caltech256.py datasets/
!cp drive/My\ Drive/datautils.py datasets/
!mv 256_ObjectCategories/ datasets/caltech256
!ls datasets/
!ls

caltech256  caltech256.py  datautils.py  __init__.py
datasets  drive  extracted.txt	sample_data


In [0]:
import os
image_types = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")
def list_images(basePath, contains=None):
    # return the set of files that are valid
    return list_files(basePath, validExts=image_types, contains=contains)
def list_files(basePath, validExts=None, contains=None):
    # loop over the directory structure
    for (rootDir, dirNames, filenames) in os.walk(basePath):
        # loop over the filenames in the current directory
        for filename in filenames:
            # if the contains string is not none and the filename does not contain
            # the supplied string, then ignore the file
            if contains is not None and filename.find(contains) == -1:
                continue

            # determine the file extension of the current file
            ext = filename[filename.rfind("."):].lower()

            # check to see if the file is an image and should be processed
            if validExts is None or ext.endswith(validExts):
                # construct the path to the image and yield it
                imagePath = os.path.join(rootDir, filename)
                yield imagePath

def get_images(QUERY):
    '''
    Returns training, querying dataset as a list of lists, where inner list is
    of categories. Category starts from index 1 and not 0. This just gives list
    of paths.
    '''
    labels           = []
    dataset          = []
    image_paths      = list(list_images("./datasets/caltech256"))
    stats = {}
    for (i, image_path) in enumerate(image_paths):
        label = int(image_path.split(os.path.sep)[-1].split("_")[0])
        if(label not in stats):
            stats[label] = 1
        else:
            stats[label] += 1
        dataset.append([image_path, label])
    dataset = sorted(dataset, key = lambda x: x[1])
    training_images = [[]]
    query_images    = [[]]
    dataset_ptr     = 0
    for cat in range(1, 258): # 257 labels
        num_queries = int(stats[cat] * QUERY)

        start = dataset_ptr
        mid   = dataset_ptr + num_queries
        end   = dataset_ptr + stats[cat]

        query_images.append(dataset[start: mid])
        training_images.append(dataset[mid: end])

        dataset_ptr += stats[cat]


    for cat in range(1, 258):
        #print(len(query_images[cat]), len(training_images[cat]), stats[cat])
        assert(len(query_images[cat]) + len(training_images[cat]) == stats[cat])
    print("Splitting done")

    return training_images, query_images

In [27]:
## get dataset splits
import os
training_paths, query_paths = get_images(0.10)
print("DONE")


## training code, get model
## pretrained model
model = ResNet50(weights='imagenet')

##
training_feature_vectors = []
query_feature_vectors = []

PATH = 0
CAT  = 1
intermediate_layer_model = Model(inputs=model.input, \
                                 outputs=model.get_layer('avg_pool').output)
for category in range(1, 2):
    ctr  = 0
    print("Starting category #", category)
    for im in training_paths[category]:
        img = image.load_img(im[0], target_size=(224, 224))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        emb = intermediate_layer_model.predict(img_data)
        np_emb = np.array(emb).flatten()
        training_feature_vectors.append(np_emb)
        ctr += 1
    for im in query_paths[category]:
        img = image.load_img(im[0], target_size=(224, 224))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        emb = intermediate_layer_model.predict(img_data)
        np_emb = np.array(emb).flatten()
        query_feature_vectors.append(np_emb)
        ctr += 1
    print("# images in cat", category, "=", ctr)

np.savez("feature-dump.npz", emb_training = training_feature_vectors, emb_query = query_feature_vectors)

Splitting done
DONE
Starting category # 1
# images in cat 1 = 98


In [28]:
print(query_paths[1])

[['./datasets/caltech256/001.ak47/001_0073.jpg', 1], ['./datasets/caltech256/001.ak47/001_0049.jpg', 1], ['./datasets/caltech256/001.ak47/001_0082.jpg', 1], ['./datasets/caltech256/001.ak47/001_0012.jpg', 1], ['./datasets/caltech256/001.ak47/001_0046.jpg', 1], ['./datasets/caltech256/001.ak47/001_0095.jpg', 1], ['./datasets/caltech256/001.ak47/001_0010.jpg', 1], ['./datasets/caltech256/001.ak47/001_0067.jpg', 1], ['./datasets/caltech256/001.ak47/001_0094.jpg', 1]]


In [29]:
print(len(training_feature_vectors))
print(len(training_feature_vectors[0]))
print(training_paths[1][0])
from keras.preprocessing import image
img = image.load_img(training_paths[1][0][0], target_size=(224, 224))

print("here")
img_data = image.img_to_array(img)
img_data = np.expand_dims(img_data, axis=0)
img_data = preprocess_input(img_data)
emb = intermediate_layer_model.predict(img_data)
np_emb = np.array(emb).flatten()
print(np_emb[0], np_emb[2047])
print(training_feature_vectors[0][0], training_feature_vectors[0][2047])

np.savez("true-classes.npz", tr_paths = training_paths, qr_paths = query_paths)
f=np.load("true-classes.npz")
print(f.files)
p = f['qr_paths']
print(len(p))

89
2048
['./datasets/caltech256/001.ak47/001_0044.jpg', 1]
here
0.45707077 6.4322433
0.45707077 6.4322433
['tr_paths', 'qr_paths']


ValueError: ignored

In [41]:
!ls
!cp true-classes.npz drive/My\ Drive/

datasets  drive  extracted.txt	feature-dump.npz  sample_data  true-classes.npz


In [0]:
!ls
!mkdir drive/My\ Drive/dumps
!cp feature-dump.npz drive/My\ Drive/dumps/resnet50-pretrained-np-dump.npz
!cp feature-dump.npz drive/My\ Drive/

datasets  drive  extracted.txt	feature-dump.npz  sample_data
mkdir: cannot create directory ‘drive/My Drive/dumps’: File exists


In [0]:
!ls drive/My\ Drive/dumps

resnet50-pretrained-np-dump.npz
