# Feature extraction

Suppose we have the following directory structure. Data from this experiment was shown to the model during training. Images are saved as NPY files:

    /data/parsed/
        Experiment 001/
            Day 1/
                Sample A/
                    Replicate 1/
                        Class A/
                            A__32e88e1ac3a8f44bf8f77371155553b9.npy
                            A__3dc56a0c446942aa0da170acfa922091.npy
                        Class B/
                            B__8068ef7dcddd89da4ca9740bd2ccb31e.npy
        Experiment 002/
            Day 1/
                Sample A/
                    Replicate 1/
                        Class A/
                            A__8348deaa70dfc95c46bd02984d28b873.npy
                        Class B/
                            B__c1ecbca7bd98c01c1d3293b64cd6739a.npy
                            B__c56cfb8e7e7121dd822e47c67d07e2d4.npy
        ...

# User's settings

In [None]:
# Root folder that is searched recursively for the parsed .npy images.
input_dir = '/Data/STEP1_Parsing/'
# Either a trained-model file or a folder; when it is a folder, the
# '*_categories.h5' file with the highest ctime inside it is used.
modellocation = '/Data/STEP2_Model_training/'
# Folder that receives the exported features, metadata and sprites image.
output_dir = '/Data/STEP3c_Extraction/'
# Name of the model layer handed to `model.extract`.
feature_extraction_layer = 'res5a_relu'
frame = 48 # frame size of each single-cell image; also passed as `sprites_dim` to the projection

# Sub-sampling hyperparameter
n_samples = None # sub-sampling for over-representing classes; NOTE(review): not referenced by any code visible in this notebook

# Executable

In [None]:
%matplotlib inline

import os, glob, re
import matplotlib.pyplot
import pandas
import skimage.io
import numpy

import deepometry.model
import deepometry.utils
import deepometry.visualize

import keras.backend
import tensorflow

In [None]:
def save_metadata(file, metadata_bkpr, metadata_depth):
    """
    Write per-image metadata to a tab-separated file with a header row.

    Parameters
    ----------
    file : str
        Path of the TSV file to create (an existing file is overwritten).
    metadata_bkpr : sequence of sequences
        One record per image. The first ``len(metadata_depth)`` fields of
        every record are written as one row.
    metadata_depth : sequence of str
        Names from which the column titles are derived, e.g. directory names
        such as ``'Experiment 001'``. Only the leading token of each name is
        kept (``'Experiment 001'`` -> ``'Experiment'``).

    Raises
    ------
    IndexError
        If a record holds fewer fields than there are column titles.
    """

    # Raw string so that \s and \d reach the regex engine untouched instead
    # of being parsed as (invalid) string escapes. A name is cut at
    # whitespace, or at one of , . _ - unless that character sits between
    # two digits (so "1.5" stays in one piece).
    title_separator = re.compile(r'\s|(?<!\d)[,._-]|[,._-](?!\d)')
    titles = [title_separator.split(name)[0] for name in metadata_depth]
    n_columns = len(titles)

    with open(file, 'w') as f:
        f.write('\t'.join(titles) + '\n')
        for record in metadata_bkpr:
            # Index explicitly (rather than slice) so that a record that is
            # too short raises instead of silently producing a ragged row.
            f.write('\t'.join(str(record[k]) for k in range(n_columns)) + '\n')
            
            
def split_all(path):
    """
    Decompose a path into the list of its individual components.

    An absolute path keeps its root as the first element
    ('/a/b' -> ['/', 'a', 'b']); a relative path starts with its first
    name ('a/b' -> ['a', 'b']).
    """

    # Components are peeled off from the right, so they are gathered in
    # reverse order and flipped once at the end.
    components = []
    remaining = path
    while True:
        head, tail = os.path.split(remaining)
        if head == remaining:
            # Nothing more to strip: reached the root of an absolute path
            components.append(head)
            break
        if tail == remaining:
            # Nothing more to strip: reached the first name of a relative path
            components.append(tail)
            break
        components.append(tail)
        remaining = head
    components.reverse()
    return components

In [None]:
# Build the TensorFlow session configuration used by this notebook.
configuration = tensorflow.ConfigProto()
# Let the process grow its GPU memory use on demand instead of reserving
# it all at start-up.
configuration.gpu_options.allow_growth = True
# Optional: restrict TensorFlow to specific GPU(s), e.g. device "0".
# configuration.gpu_options.visible_device_list = "0"

# Create a session with that configuration and register it as the session
# Keras runs on.
session = tensorflow.Session(config=configuration)
keras.backend.set_session(session)

In [None]:
# Recursively collect every .npy image file below `input_dir`.
# Matching on the file extension (instead of testing whether '.npy' occurs
# anywhere in the full path) skips folders or files that merely contain
# '.npy' in their name; sorting makes the sample order reproducible.
pathnames_of_interest = sorted(
    pathname
    for root, _dirnames, _filenames in os.walk(input_dir)
    for pathname in glob.glob(os.path.join(root, '*.npy'))
    if os.path.isfile(pathname)
)
if not pathnames_of_interest:
    raise ValueError('No .npy files found under {!r}'.format(input_dir))

# The second return value is not needed here.
# NOTE(review): `_load` is a private deepometry helper; `metadata_bkpr` is
# later written out row by row, so it presumably holds one record per image.
x, _, metadata_bkpr = deepometry.utils._load(pathnames_of_interest)

if os.path.isdir(modellocation):
    # only files with correct naming, i.e. ***_categories.h5, are accepted
    candidate_models = glob.glob(os.path.join(modellocation, '*_categories.h5'))
    if not candidate_models:
        raise ValueError('No "*_categories.h5" model file found in {!r}'.format(modellocation))
    # When several models are present, take the one with the highest ctime.
    modellocation = max(candidate_models, key=os.path.getctime)

# The number of output units is encoded in the model file name:
# model_<units>-..._categories.h5
model_name_match = re.search(r'model_([0-9]+)-.*_categories\.h5', os.path.basename(modellocation))
if model_name_match is None:
    raise ValueError(
        'Model file name {!r} does not follow the pattern '
        '"model_<units>-..._categories.h5"'.format(os.path.basename(modellocation))
    )
units = int(model_name_match.group(1))

In [None]:
# Rebuild the network for the per-image shape of `x` and the number of
# units parsed from the model file name.
model = deepometry.model.Model(shape=x.shape[1:], units=units)

model.compile()

# Extract the features
# The trained-model location is held in `modellocation`; the name
# `trained_model_location` is not defined anywhere in this notebook.
features = model.extract(x, feature_extraction_layer, modellocation, batch_size=32, standardize=True, verbose=1)

# Outputs with more than two axes are averaged over axes 1 and 2 in a single
# pass.
# NOTE(review): presumably these are the spatial axes of a convolutional
# feature map (n, height, width, channels), leaving one vector per image;
# confirm for the chosen layer.
if len(features.shape) > 2:
    features = numpy.mean(features, axis=(1, 2))

Once the data is loaded, we can redefine the model and extract the embedded features of `x`, as done in the cell above. The features are then saved twice: as a tab-delimited TXT file using NumPy and as a TSV file using pandas. We omit the column headers and the data frame indexes from the exported data.

In [None]:
# Make sure the destination folder is there before anything is written to it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Both exports share the same path stem and differ only in their extension
features_stem = os.path.join(output_dir, 'features_extracted_by_' + feature_extraction_layer)

# Tab-delimited .TXT export, to be used on http://projector.tensorflow.org
numpy.savetxt(features_stem + '.txt', features, delimiter='\t')

# .TSV export without header row or index column, to be used in local Tensorboard
features_df = pandas.DataFrame(data=features)
features_df.to_csv(features_stem + '.tsv', sep="\t", header=False, index=False)

Additional metadata can also be exported as a TSV. In this example, we export a multi-column TSV whose header row is derived from the names of the directories that contain each image (e.g. Experiment, Day, Sample, Replicate, Class). Single-column metadata should exclude the column header (`header=False`). Metadata containing more than one column should include the column header (`header=True`).

In [None]:
# Save labels, to be used as "metadata" on http://projector.tensorflow.org
metadata_path = os.path.join(output_dir, 'metadata.tsv')

# The (up to five) folder names directly above the first image file supply
# the column titles of the metadata table.
first_image_parts = split_all(pathnames_of_interest[0])
save_metadata(metadata_path, metadata_bkpr, metadata_depth=first_image_parts[-6:-1])

# Visualize

Visualize the embedded feature space using TensorBoard or the web app http://projector.tensorflow.org. Use the outputs from `extract` to populate the features, metadata, and sprites parameters for `deepometry.visualize.make_projection`.

When using TensorBoard to visualize an embedding, you can display a "sprite" image for each data point. The sprite image is an NxN grid of image data. Use `deepometry.visualize.images_to_sprite` to create this grid. The `sprites_dim` parameter tells TensorBoard how big (in pixels) each sprite is. For example, if a single sprite is 48x48 pixels, then `sprites_dim=48`.

To launch TensorBoard, copy the output of the cell below and run it in your terminal.

In [None]:
# Assemble the images in `x` into a single sprites image.
sprites = deepometry.visualize.images_to_sprite(x)

# Write the sprites image into the output folder.
sprites_path = os.path.join(output_dir, 'sprites.png')
skimage.io.imsave(sprites_path, sprites)

In [None]:
# Point at the exact feature file exported for the selected layer. Globbing
# for '*feature*.tsv' and taking the first hit could silently pick up an
# export of a different layer that is still present in `output_dir`.
features_file = os.path.abspath(os.path.join(output_dir, 'features_extracted_by_' + feature_extraction_layer + '.tsv'))
if not os.path.isfile(features_file):
    raise IOError('Feature file {!r} not found; run the feature export cell first'.format(features_file))
metadata_file = os.path.abspath(os.path.join(output_dir, 'metadata.tsv'))
sprites_file = os.path.abspath(os.path.join(output_dir, 'sprites.png'))
# Edge length (in pixels) of a single sprite, i.e. of one single-cell image.
sprites_dim = frame

log_directory = deepometry.visualize.make_projection(
    features_file,
    metadata=metadata_file,
    sprites=sprites_file,
    sprites_dim=sprites_dim
)

# Copy and execute this command on the terminal
print("tensorboard --logdir {:s}".format(log_directory))