# Feature extraction and post-extraction EDA

## Choice of model
Both [Inception v3](https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1) and [MobileNet v2](https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2) are up to the task: their TF Hub implementations have been trained on the ILSVRC-2012-CLS "ImageNet" data set, and they have the [same signature for feature vectors](https://www.tensorflow.org/hub/common_signatures/images#feature-vector).

The MobileNet v2 is optimized for mobile applications. Since I'm not building a mobile application, I chose the Inception v3 model. This model has more parameters and expects slightly larger input images (299×299 instead of 224×224 pixels).

In [1]:
# Install the FutureWarning filter before the TensorFlow imports below, so it
# is already active while those modules are loaded.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import tensorflow as tf
import tensorflow_hub as hub

# Print the library versions this notebook is running with.
print('Tensorflow version: {}, TF hub version: {}'.format(tf.__version__, hub.__version__))



Tensorflow version: 1.12.0, TF hub version: 0.4.0


## Create a network
Create a network by loading the chosen pre-trained model from TF Hub.

In [2]:
# Dedicated graph holding the feature-extraction network; everything created
# inside the `with` block below is added to this graph.
img_graph = tf.Graph()

with img_graph.as_default():
    # pretrained network Inception v3
    module_url = 'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1'
    feature_extractor = hub.Module(module_url)
    
    # expected size of input images, as reported by the module itself
    height, width = hub.get_expected_image_size(feature_extractor)
    
    # placeholder for input: a batch of any length of height x width images
    # with 3 channels, as float32
    input_imgs = tf.placeholder(dtype=tf.float32, shape=[None, height, width, 3])
    
    # node that represents extracted high-level features
    imgs_features = feature_extractor(input_imgs)
    
    # initializers required by TensorFlow Hub, grouped into a single op so
    # that one sess.run(init_op) runs both
    init_op = tf.group(
        [tf.global_variables_initializer(), tf.tables_initializer()]
    )

img_graph.finalize() # make graph read-only

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


### Implement a function to load data from point clouds

In [5]:
import numpy as np
import os

TF_SIZE = 299  # side length, in pixels, of the square images fed to the network

def convert_images(data, size=TF_SIZE):
    """
    Scale and reshape flattened images, one image at a time.

    Every row of `data` is divided by 255 (presumably mapping 0-255 pixel
    values to the 0-1 range) and reshaped to (size, size, 3). The conversion
    is lazy: an image is only converted when the returned generator reaches
    it, so the converted copy of the whole dataset is never held in memory.
    The generator can be iterated only once.

    @param data numpy.ndarray holding one flattened image of size*size*3 values per row
    @param size int side length of the square images, defaults to TF_SIZE
    @returns generator of numpy.ndarray images of shape (size, size, 3)
    """
    return ((flat_img / 255).reshape(size, size, 3) for flat_img in data)

def load_point_cloud(dataset_name):
    """
    Load point cloud data

    Reads data/<dataset_name>/point_clouds.npz and passes the 'elevation',
    'plan' and 'section' arrays through convert_images.
    @param dataset_name string 'train', 'test' or 'validate'
    @returns numpy.ndarray metadata, followed by the elevation, plan and section
        images, each as the one-shot iterable returned by convert_images
    """
    archive_path = os.path.join('data', dataset_name, 'point_clouds.npz')

    # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code - only load archives from a trusted source.
    with np.load(archive_path, allow_pickle=True) as data:
        # Read only the arrays that are used; an .npz member is decompressed
        # when it is accessed, so any other member is left untouched.
        metadata = data['metadata']
        raw_elevation = data['elevation']
        raw_plan = data['plan']
        raw_section = data['section']

    elevation = convert_images(raw_elevation)
    plan = convert_images(raw_plan)
    section = convert_images(raw_section)
    return metadata, elevation, plan, section

### Implement a function to load textures

In [4]:
def load_textures(dataset_name):
    """
    Read the texture archive of one dataset

    The archive is expected at data/<dataset_name>/textures.npz.
    @param dataset_name string 'train', 'test' or 'validate'
    @returns numpy.ndarray metadata, numpy.ndarray textures exactly as stored
        in the archive (presumably flattened 299*299 3-channel images)
    """
    archive_path = os.path.join('data', dataset_name, 'textures.npz')
    with np.load(archive_path, allow_pickle=True) as archive:
        # copy every array out of the archive before it is closed
        contents = {key: value for key, value in archive.items()}

    return contents['metadata'], contents['texture']

In [79]:
# Load the textures of the validation dataset; the metadata returned alongside
# them is discarded
_, textures = load_textures('validate')

### Extract features
Extract and save the features from point cloud data for each of the train, validate and test datasets.

In [9]:
PROD_DIR = 'production' # interim production files

# Session bound to the feature-extraction graph; extract_features runs on it.
# NOTE(review): the session is never closed in the visible code - consider
# calling sess.close() once all features have been extracted.
sess = tf.Session(graph=img_graph)
sess.run(init_op)  # run the grouped initializers once, before any extraction

def extract_features(images):
    """
    Compute the high-level features of every image.

    Each image is sent through the network on its own, as a batch of one.
    @param images iterable of images (e.g. a generator)
    @return list holding one `imgs_features` result per image, in input order
    """
    def run_single(image):
        # wrap the image in a list so it is fed as a batch of one
        return sess.run(imgs_features, feed_dict={input_imgs: [image]})

    return [run_single(image) for image in images]

def save_dataset_features(dataset_name):
    """
    Saves features extracted from a dataset as npz file

    Loads the point cloud images of the dataset, extracts the features of the
    elevation, plan and section images and writes them, together with the
    metadata, to PROD_DIR/features_<dataset_name>.npz (compressed). PROD_DIR
    is created when it does not exist yet.
    @param dataset_name 'train', 'validate' or 'test'
    """
    # Create the output directory before the extraction starts, so a missing
    # directory cannot fail the save after all features have been computed.
    if not os.path.isdir(PROD_DIR):
        os.makedirs(PROD_DIR)

    metadata, elevation, plan, section = load_point_cloud(dataset_name)
    elevation_features = extract_features(elevation)
    plan_features = extract_features(plan)
    section_features = extract_features(section)

    output_path = os.path.join(PROD_DIR, 'features_{}.npz'.format(dataset_name))
    np.savez_compressed(
        output_path,
        metadata=metadata,
        elevation_features=elevation_features,
        plan_features=plan_features,
        section_features=section_features,
    )

# Extract and store the features of every dataset split, one after another
for _split in ('train', 'validate', 'test'):
    save_dataset_features(_split)
