# <center>Feature extraction</center>

In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential, backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow.keras as keras
import tensorflow_hub as hub
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
%matplotlib inline
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

# Report the library versions and GPU visibility this notebook was run with.
print("TF version:", tf.__version__)
print("Hub version:", hub.__version__)
gpu_status = "available" if tf.test.is_gpu_available() else "NOT AVAILABLE"
print("GPU is", gpu_status)



TF version: 1.12.0
Hub version: 0.4.0
GPU is NOT AVAILABLE


## Feature extraction
In this first part of the project, start by extracting a set of high-level features for each
image in the data set. To achieve this, you can use e.g. the **Inception v3** or **MobileNet v2**
ConvNets which respectively extract 2048 and 1280 high-level features.

**Suggestion**: consider storing the extracted high-level features, e.g. in npz files, for
quickly reloading them into each of the following notebooks.

**Note**: All your models should be trained on the training set, and the fine tuning of your
hyperparameters should be validated on the validation set. The final test set should only
be used for the final comparison to test the accuracies of your models on a new dataset.

In [2]:
# Start from a clean slate: delete any previous extraction, then unpack the
# archive into ./data (-qq silences unzip's output).
!rm -rf data
!unzip -qq swissroads.zip -d data

There are only 280 images in the train directory; this sample size is too small to train our models.

As a solution, we apply data augmentation, to the training set only, using the Keras ImageDataGenerator: four augmented variants of each image are generated and added to the 280 originals, giving 1400 training images (5 times the original set).

The following numpy arrays will be generated and stored in data.npz :
    - X_train : a numpy array of shape (1400, 299, 299, 3) which stores the training images.
    - X_train_features : a numpy array of shape (1400, 2048) which stores the inception_v3 feature vectors for the training images.
    - y_train_1h : a numpy array of shape (1400, 6) which stores the one-hot representation of the target values.
    - y_train : a numpy array of shape (1400,) which stores the target values as integer class indices.

Same thing for validation and test sets.

In [3]:
## initialize the TensorFlow Hub Inception v3 module
# (alternative extractor, currently disabled: MobileNet v2)
#module_url = 'https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2'
module_url = 'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1'
# This instance is created in the default graph; in the visible cells it is only
# used to look up the input image size the module expects. Extraction itself
# uses the separate copy built inside img_graph below.
module = hub.Module(module_url)
targetsize = hub.get_expected_image_size(module)

## create a dedicated TensorFlow graph for feature extraction
img_graph = tf.Graph()
with img_graph.as_default():
    feature_extractor = hub.Module(module_url)
    # batch of 3-channel images at the module's expected height/width; batch size is left open
    input_imgs = tf.placeholder(dtype=tf.float32, shape=[None, targetsize[0], targetsize[1], 3])
    imgs_features = feature_extractor(input_imgs)
    # single op that runs both the variable initializer and the table initializer
    init_op = tf.group([ tf.global_variables_initializer(), tf.tables_initializer() ])
# freeze the graph so that no further ops can be added to it
img_graph.finalize()

## compute the feature vectors for one array of images X
def get_features(X):
    """Return the extractor output for the image batch ``X``.

    Every call opens its own session on ``img_graph``, runs ``init_op`` and
    then evaluates ``imgs_features`` with ``X`` fed into the ``input_imgs``
    placeholder. The session is closed when the ``with`` block exits.
    """
    feed = {input_imgs: X}
    with tf.Session(graph=img_graph) as session:
        session.run(init_op)
        return session.run(imgs_features, feed_dict=feed)

## process a generator to extract features from all files in train, validation or test set
def process_generator(g, train=False, train_passes=4):
    """Run a Keras batch iterator through the feature extractor.

    Args:
        g: batch iterator that yields ``(X, y)`` pairs and supports ``len()``
            (batches per pass), e.g. the result of ``flow_from_directory``.
        train: when true, draw ``train_passes`` full passes over ``g`` instead
            of a single one, so several variants of each image are collected.
        train_passes: number of passes used when ``train`` is true. The
            default of 4 is the value that used to be hard-coded.

    Returns:
        ``(features, data, y_1h)``: the concatenated extractor outputs, the
        image batches themselves and the labels as yielded by ``g``, all in
        the order in which the batches were drawn.

    Raises:
        ValueError: if ``g`` reports zero batches.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    passes = train_passes if train else 1
    n_batches = len(g) * passes
    if n_batches <= 0:
        raise ValueError('generator yields no batches; nothing to process')

    features = []
    data = []
    y_1h = []
    # One session for the whole run: init_op is executed once here instead of
    # once per batch, which avoids re-initializing the module for every batch.
    with tf.Session(graph=img_graph) as sess:
        sess.run(init_op)
        # The iterator loops forever, so draw exactly the batches we need.
        # (No surplus batch is fetched and thrown away, so the iterator and the
        # global NumPy RNG are advanced only by the batches actually used.)
        for _ in range(n_batches):
            X, y = next(g)
            data.append(X)
            y_1h.append(y)
            features.append(sess.run(imgs_features, feed_dict={input_imgs: X}))
    return np.concatenate(features), np.concatenate(data), np.concatenate(y_1h)
# generate data.npz which contains images arrays and features arrays.
def save_features_from_images(data_dir='data', out_path='data.npz', batch_size=140):
    """Extract images, features and labels for all splits and save them.

    Reads the ``train``, ``valid`` and ``test`` sub-directories of
    ``data_dir``. The training split is processed twice: once with rescaling
    only, and once through an augmenting generator via
    ``process_generator(..., True)``. Both training parts are concatenated
    and shuffled with one shared permutation before saving.

    Args:
        data_dir: directory containing the ``train``/``valid``/``test`` folders.
        out_path: file the arrays are written to with ``np.savez``.
        batch_size: batch size passed to ``flow_from_directory``.

    Raises:
        ValueError: if the validation or test split does not map class names
            to the same indices as the training split (the one-hot columns
            would otherwise mean different things per split).

    Stored keys: ``X_*``, ``X_*_features``, ``y_*_1h`` and ``y_*`` (argmax of
    the one-hot labels) for train/valid/test, ``class_indices`` (class names
    ordered by class index) and ``*_filenames``.

    NOTE: ``train_filenames`` holds the file list of the un-augmented training
    iterator in directory order. It is NOT aligned with the rows of
    ``X_train``, which also contain the augmented samples and are shuffled.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    rescale_only = ImageDataGenerator(rescale=1/255)
    augmenting = ImageDataGenerator(rescale=1/255, rotation_range=40,
                                    width_shift_range=0.2, height_shift_range=0.2,
                                    shear_range=0.2, zoom_range=0.2,
                                    horizontal_flip=True, fill_mode='nearest')

    def flow(generator, subset):
        # shuffle=False keeps batches in file order, matching `.filenames`.
        return generator.flow_from_directory(os.path.join(data_dir, subset),
                                             batch_size=batch_size,
                                             target_size=targetsize,
                                             shuffle=False)

    trainset1 = flow(rescale_only, 'train')
    trainset2 = flow(augmenting, 'train')
    validset = flow(rescale_only, 'valid')
    testset = flow(rescale_only, 'test')

    # Fail early, before the expensive extraction, if the label columns of the
    # splits would not be comparable.
    for subset_name, subset in (('valid', validset), ('test', testset)):
        if subset.class_indices != trainset1.class_indices:
            raise ValueError('class folders of the %s set do not match the training set'
                             % subset_name)

    X_test_features, X_test, y_test_1h = process_generator(testset)
    print(X_test_features.shape, X_test.shape, y_test_1h.shape)
    X_valid_features, X_valid, y_valid_1h = process_generator(validset)
    print(X_valid_features.shape, X_valid.shape, y_valid_1h.shape)
    # original train images
    X_train_features1, X_train1, y_train_1h1 = process_generator(trainset1)
    print(X_train_features1.shape, X_train1.shape, y_train_1h1.shape)
    # augmented train images
    X_train_features2, X_train2, y_train_1h2 = process_generator(trainset2, True)
    print(X_train_features2.shape, X_train2.shape, y_train_1h2.shape)

    # combine the two sets and shuffle them with one shared permutation so
    # that images, features and labels stay aligned row by row
    idx = list(range(X_train_features1.shape[0] + X_train_features2.shape[0]))
    np.random.shuffle(idx)
    X_train_features = np.concatenate([X_train_features1, X_train_features2])[idx]
    X_train = np.concatenate([X_train1, X_train2])[idx]
    y_train_1h = np.concatenate([y_train_1h1, y_train_1h2])[idx]

    # class names ordered explicitly by their index (position i <-> label i)
    class_names = sorted(trainset1.class_indices, key=trainset1.class_indices.get)

    np.savez(out_path,
             X_train=X_train, X_train_features=X_train_features,
             y_train_1h=y_train_1h, y_train=np.argmax(y_train_1h, axis=1),
             X_valid=X_valid, X_valid_features=X_valid_features,
             y_valid_1h=y_valid_1h, y_valid=np.argmax(y_valid_1h, axis=1),
             X_test=X_test, X_test_features=X_test_features,
             y_test_1h=y_test_1h, y_test=np.argmax(y_test_1h, axis=1),
             class_indices=class_names,
             train_filenames=trainset1.filenames,
             valid_filenames=validset.filenames,
             test_filenames=testset.filenames)
## read every stored array back from data.npz
def load_data():
    """Load all arrays from ``data.npz`` in the current directory.

    Returns a 16-tuple in this order: for each of train, valid and test the
    images, the feature vectors, the one-hot labels, the integer labels and
    the file names; then ``class_indices`` as the last element.

    Raises ``KeyError`` (from the archive lookup) if a key is missing.
    """
    returned_keys = (
        'X_train', 'X_train_features', 'y_train_1h', 'y_train', 'train_filenames',
        'X_valid', 'X_valid_features', 'y_valid_1h', 'y_valid', 'valid_filenames',
        'X_test', 'X_test_features', 'y_test_1h', 'y_test', 'test_filenames',
        'class_indices',
    )
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only
    # load archives from a trusted source.
    with np.load('data.npz', allow_pickle=True) as archive:
        # Materialize every entry while the archive is still open.
        return tuple(archive[key] for key in returned_keys)

# Empty results dictionary; nothing writes to it in the visible cells —
# presumably filled by later evaluation code (TODO confirm).
test_results={}

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


We will load all images from the data directory, generate their feature vectors, and store everything in the data.npz file.

In [4]:
# Runs the full extraction for the test, valid and train splits and writes data.npz.
save_features_from_images()

Found 280 images belonging to 6 classes.
Found 280 images belonging to 6 classes.
Found 139 images belonging to 6 classes.
Found 50 images belonging to 6 classes.
(50, 2048) (50, 299, 299, 3) (50, 6)
(139, 2048) (139, 299, 299, 3) (139, 6)
(280, 2048) (280, 299, 299, 3) (280, 6)
(1120, 2048) (1120, 299, 299, 3) (1120, 6)


We will test data loading here.

In [5]:
# Reload every array from data.npz; the unpacking order mirrors load_data()'s return order.
(X_train, X_train_features, y_train_1h, y_train, train_filenames,
 X_valid, X_valid_features, y_valid_1h, y_valid, valid_filenames,
 X_test, X_test_features, y_test_1h, y_test, test_filenames,
 class_indices) = load_data()

In [6]:
# Sanity check: shapes of the reloaded image arrays for each split.
print ('Images [ Train : %s, Valid : %s, Test %s ]'%(X_train.shape, X_valid.shape,X_test.shape))

Images [ Train : (1400, 299, 299, 3), Valid : (139, 299, 299, 3), Test (50, 299, 299, 3) ]


In [7]:
# Sanity check: shapes of the reloaded feature arrays for each split.
print ('Features [ Train : %s, Valid : %s, Test %s ]'%(X_train_features.shape, X_valid_features.shape,X_test_features.shape))

Features [ Train : (1400, 2048), Valid : (139, 2048), Test (50, 2048) ]
