In [None]:
import numpy as np
import keras
import pickle as pk
import matplotlib.pyplot as plt
import os
from keras.applications.resnet50 import ResNet50
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input
from keras.layers import UpSampling2D, Input
from keras.models import Model

In [None]:
# Define model: the 32x32x3 input images are upsampled by a factor of 7 to
# 224x224 before being fed to the ImageNet-pretrained ResNet50.
inp = Input(shape=(32, 32, 3), name='image_input')
x = UpSampling2D(size=(7, 7))(inp)
resnet = ResNet50(weights='imagenet', include_top=True)

# Use the output of the layer just before the final classifier as embedding.
# The extractor is built as a new Model instead of popping entries from
# `resnet.layers` and rewiring `resnet.outputs` in place: that kind of surgery
# does not reliably change the computation graph across Keras versions and can
# silently leave the classification head attached.
feature_extractor = Model(inputs=resnet.input, outputs=resnet.layers[-2].output)
feature_extractor.summary()
output = feature_extractor(x)

model = Model(inputs=inp, outputs=output)
model.summary()

In [None]:
# Read one pickled data batch from disk
def load_batch(data_dir, batch_id):
    """Load the batch file ``data_batch_<batch_id>`` found in ``data_dir``.

    Returns a tuple ``(feats, lbls)``:
      * ``feats`` is the batch's ``'data'`` entry reshaped to
        ``(n, 3, 32, 32)`` and then transposed to channels-last
        ``(n, 32, 32, 3)``;
      * ``lbls`` is the batch's ``'labels'`` entry, returned unchanged.
    """
    batch_path = os.path.join(data_dir, 'data_batch_%i' % batch_id)
    with open(batch_path, mode='rb') as file:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        batch = pk.load(file, encoding='latin1')
    raw = batch['data']
    n_images = len(raw)
    channels_first = raw.reshape((n_images, 3, 32, 32))
    return channels_first.transpose(0, 2, 3, 1), batch['labels']

# Read batch 1 into memory and list the class names, indexed by label id
data_dir = '../data/'
cifar_dir = os.path.join(data_dir, 'cifar10')
feats, labels = load_batch(cifar_dir, 1)
label_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

In [None]:
# Show one example image together with its label id and class name
sample_id = 19
sample_img, sample_lbl = feats[sample_id], labels[sample_id]
class_name = label_names[sample_lbl]
print('Label Id: {} - Class: {}'.format(sample_lbl, class_name))
plt.imshow(sample_img)

In [None]:
# Preprocess the sample image for ResNet50. The input is re-derived from
# `feats` as a float32 copy, so re-running this cell never applies the
# preprocessing twice and the dataset array is never modified in place.
sample_img = preprocess_input(feats[sample_id].astype('float32'))

In [None]:
# Leading subset of the batch used for the embedding experiments
sample_size = 3000
sample_feats, sample_labels = feats[:sample_size], labels[:sample_size]

In [None]:
import time
import json
# Switches for the expensive steps: `comp_embeddings` runs the network on the
# first `sample_size` images, `dump_embeddings` additionally caches the result
# as JSON under ./embeddings/.
comp_embeddings = False
dump_embeddings = False
if comp_embeddings:
    # The ImageNet weights expect images passed through `preprocess_input`.
    # Casting to float32 creates a copy, so `feats` itself is left untouched.
    net_input = preprocess_input(feats[:sample_size].astype('float32'))
    start = time.time()
    embeddings = model.predict(net_input)
    print(time.time()-start)

if dump_embeddings:
    # Only embeddings computed in this run may be written: otherwise
    # `embeddings` is either undefined or a leftover from another cell.
    if not comp_embeddings:
        raise ValueError('dump_embeddings=True requires comp_embeddings=True')
    os.makedirs('embeddings', exist_ok=True)
    with open(f'embeddings/embeddings_{sample_size}.json', 'w') as outfile:
        json.dump(embeddings.tolist(), outfile)


In [None]:
def load_from_json(file: str):
    """Parse and return the JSON document stored at ``embeddings/<file>``.

    The file is opened in a ``with`` block so the handle is always closed,
    including when parsing fails.
    """
    with open(f'embeddings/{file}') as infile:
        return json.load(infile)

In [None]:
from sklearn.preprocessing import scale
def split_set(embeddings, labels, sample_size, seed=None):
    """Standardise the embeddings and split them 60/20/20 into train/dev/test.

    Parameters
    ----------
    embeddings : 2-D array-like, one row per sample.
    labels : sequence of labels aligned with the rows of ``embeddings``.
    sample_size : number of leading samples to shuffle and split.
    seed : optional int. When given, the shuffle is reproducible; when
        ``None`` the global numpy random state is used, as before.

    Returns
    -------
    ``[feats_train, feats_dev, feats_test], [labels_train, labels_dev, labels_test]``
    where every entry is a numpy array.
    """
    # NOTE(review): `scale` uses statistics of the whole set, so dev/test
    # information leaks into the training features. Consider fitting the
    # scaler on the training rows only.
    n_embeddings = scale(embeddings)
    labels = np.asarray(labels)

    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = rng.permutation(sample_size)
    train_end, dev_end = int(0.6 * sample_size), int(0.8 * sample_size)
    parts = (indices[:train_end], indices[train_end:dev_end], indices[dev_end:])

    feats_split = [n_embeddings[idx, :] for idx in parts]
    labels_split = [labels[idx] for idx in parts]
    return feats_split, labels_split

In [None]:
from sklearn.neighbors import KNeighborsClassifier
def train_knn_and_test(train, dev, labels_train, labels_dev, n_neighbors=10):
    """Fit a k-nearest-neighbours classifier and score it on train and dev.

    Parameters
    ----------
    train, dev : feature matrices for the training and development sets.
    labels_train, labels_dev : the matching labels.
    n_neighbors : number of neighbours used by the classifier (default 10).

    Returns
    -------
    ``(knn, train_score, dev_score)``: the fitted classifier and its
    ``score`` on the training and development sets.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(train, labels_train)
    return knn, knn.score(train, labels_train), knn.score(dev, labels_dev)

In [None]:
from sklearn.decomposition import PCA
def train_pca(train_set, percentage=90, step=50):
    """Find the smallest PCA (in multiples of ``step``) explaining enough variance.

    PCAs with ``step``, ``2*step``, ... components are fitted on ``train_set``
    until the cumulative explained variance exceeds ``percentage`` percent.

    The search is bounded by ``min(n_samples, n_features)`` of ``train_set``
    itself, the largest component count a PCA can be asked for, instead of a
    notebook-level global. If no multiple of ``step`` reaches the threshold,
    the PCA with that maximum number of components is returned, so the caller
    can always unpack the result.

    Returns
    -------
    ``(n_components, pca)``: the number of components and the fitted PCA.

    Raises
    ------
    ValueError
        If ``train_set`` is not a non-empty 2-D array.
    """
    train_set = np.asarray(train_set)
    if train_set.ndim != 2 or min(train_set.shape) == 0:
        raise ValueError('train_set must be a non-empty 2-D array')

    max_components = min(train_set.shape)
    # Every multiple of `step` below the maximum, then the maximum as fallback
    candidates = list(range(step, max_components, step)) + [max_components]
    for n_components in candidates:
        pca = PCA(n_components=n_components)
        pca.fit(train_set)
        reached = np.sum(100 * pca.explained_variance_ratio_) > percentage
        if reached or n_components == max_components:
            return n_components, pca

In [None]:
import pandas as pd

In [None]:
from os import listdir

# Evaluate every cached embedding file: a k-NN baseline on the scaled
# embeddings, then the same classifier on their PCA projection.
# Files are visited in sorted order so the objects left in the namespace after
# the loop (`knn`, `pca`, `feats_dev`, ...) do not depend on directory order.
for file in sorted(listdir('./embeddings')):
    # Only dumps named `<prefix>_<sample_size>.json` are processed
    if not file.endswith('.json'):
        continue
    sample_size = int(file[file.index('_')+1:file.index('.')])
    embeddings = load_from_json(file)

    # Split the data
    [feats_train, feats_dev, feats_test], [labels_train, labels_dev, labels_test] = split_set(
        embeddings, labels[:sample_size], sample_size)

    # Train knn on the scaled embeddings (row PCA=0)
    knn, knn_train_score, knn_dev_score = train_knn_and_test(
        feats_train, feats_dev, labels_train, labels_dev)
    results = [[0, knn_train_score, knn_dev_score]]

    # Fit pca
    i, pca = train_pca(feats_train)
    print(f'Trained pca with {i} components')

    # Train knn on the PCA projection (row PCA=1)
    knn, knn_train_score, knn_dev_score = train_knn_and_test(
        pca.transform(feats_train), pca.transform(feats_dev), labels_train, labels_dev)
    results.append([1, knn_train_score, knn_dev_score])

    # Build the summary table once instead of growing an empty frame row by row
    df = pd.DataFrame(results, columns=['PCA', 'Train accuracy', 'Dev accuracy'])
    print(df)

In [None]:
from sklearn.metrics import confusion_matrix
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones, in sorted label-id order:
#['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
confusion_matrix(labels_dev, knn.predict(pca.transform(feats_dev)))

In [None]:
# Measure how long the model takes to embed the first 100 images
nb_samples = 100
start = time.time()
embeddings2 = model.predict(feats[:nb_samples])
elapsed = time.time() - start
print(elapsed)