# Prediction (with unannotated test set)

After training, the model can be used as a classifier on an unknown/unannotated dataset.

Suppose we have the following directory structure. Data from this experiment was not shown to the model during training. Images are saved as NPY files:

    /data/parsed/
        Experiment 003/
            Day 1/
                Sample A/
                    Replicate 1/
                        Unknown class/
                            B__3618e715e62a229aa78a7e373b49b888.npy
                            B__3cf53cea7f4db1cfd101e06c366c9868.npy
                            B__84949e1eba7802b00d4a1755fa9af15e.npy
                            B__852a1edbf5729fe8721e9e5404a8ad20.npy
        ...

# User's settings

In [None]:
# --- Locations -------------------------------------------------------------
# Folder holding the parsed images that should be classified.
input_dir = '/Data/STEP1_Preprocessing'
# Location handed to the model at prediction time.
# NOTE(review): this is a relative path while the other two settings are
# absolute -- confirm whether a leading '/' is intended.
modellocation = 'Data/STEP2_Model_training'
# Folder that receives the prediction results.
output_dir = '/Data/STEP3_Evaluation'

# --- Hyperparameters -------------------------------------------------------
# Sub-sampling size for over-represented classes; None means no sub-sampling
# value is set.
n_samples = None

Recall how many classes were present during the training session. It is crucial to retrieve the list of possible classification targets from **the model training session** so that the categories are reconstructed correctly: the training materials contain every category the model has been exposed to, whereas a testing dataset may be missing one or more of them.

In [None]:
import glob, os, re
from itertools import groupby

# Directory that was used as input for the model training session; the folder
# names found below it are listed so the class labels can be copied from the
# output of this cell.
input_dir_for_model_training = '/Data/STEP1_Preprocessing'

# Every directory yielded by walking the training input directory.
all_subdirs = [x[0] for x in os.walk(input_dir_for_model_training)]

# Unique, lower-cased folder names; the first walked entry (the root itself)
# is skipped.
list1 = sorted(set(os.path.basename(i.lower()) for i in all_subdirs[1:]))


def keyf(text):
    """Return the part of *text* that precedes its first separator.

    A separator is any whitespace character, or one of ``, . _ -`` unless it
    sits directly between two digits (so ``'1.5'`` is kept whole).
    """
    # Raw string: '\s' and '\d' are regex escapes, not Python string escapes.
    return re.split(r'\s|(?<!\d)[,._-]|[,._-](?!\d)', text)[0]


def group_by_prefix(names):
    """Group *names* by their ``keyf`` prefix.

    Returns a sorted list of groups, each group being a sorted list of names.
    """
    # groupby only merges adjacent items, so the input must be ordered by the
    # same key that is used for grouping; plain lexicographic order is not
    # enough (e.g. 'a b', 'a!c', 'a-b' would split the 'a' group in two).
    ordered = sorted(names, key=keyf)
    return sorted(
        sorted(items) for prefix, items in groupby(ordered, key=keyf)
    )


# Last expression of the cell: displayed as the cell output.
group_by_prefix(list1)

In [None]:
# Class labels to predict: paste one of the lists printed by the class
# discovery cell above.
labels_of_interest = [
    'class_crenateddisc_',
    'class_crenateddiscoid',
    'class_crenatedsphere',
    'class_crenatedspheroid',
    'class_side',
    'class_smoothdisc',
    'class_smoothsphere',
]

# Executable

In [None]:
%matplotlib inline

import keras
import matplotlib.pyplot as plt
import numpy
import pandas
import seaborn
import sklearn.metrics
import tensorflow

import deepometry.model
import deepometry.utils

In [None]:
# Build the TensorFlow session that Keras will use.
# allow_growth lets TensorFlow claim GPU memory on demand rather than
# reserving it all at start-up.
# To pin the session to specific GPUs, additionally pass e.g.
# visible_device_list="0" to GPUOptions.
configuration = tensorflow.ConfigProto(
    gpu_options=tensorflow.GPUOptions(allow_growth=True)
)
session = tensorflow.Session(config=configuration)

# Register the session with the Keras backend.
keras.backend.set_session(session)

In [None]:
# Collect the pathnames of the images to classify for the chosen labels.
# The n_samples value from the user's settings cell is passed through (it was
# previously hard-coded to None here, so changing the setting had no effect).
pathnames_of_interest = deepometry.utils.collect_pathnames(input_dir, labels_of_interest, n_samples=n_samples)

In [None]:
# _load returns three values; only the first one (the data fed to the model)
# is needed for prediction, the other two are discarded.
x, _, _ = deepometry.utils._load(pathnames_of_interest, labels_of_interest)

# Number of output units of the network = number of distinct labels.
units = len(set(labels_of_interest))

In [None]:
# Build a model matching the shape of one input item (x.shape without its
# first axis) and with one output unit per label.
model = deepometry.model.Model(units=units, shape=x.shape[1:])
model.compile()

# modellocation is passed as the second argument -- presumably the place the
# trained model is read from; confirm against deepometry.model.Model.predict.
scores = model.predict(x, modellocation, verbose=1, batch_size=32)

# Reduce the last axis to the index of its largest entry, i.e. one numeric
# class per input item.
predicted = numpy.argmax(scores, axis=-1)

In [None]:
# Tabulate the predictions: the numeric class index and the matching label
# name looked up in labels_of_interest.
predicted_classes = pandas.DataFrame()
predicted_classes['numeric_class'] = predicted
predicted_classes['label'] = [labels_of_interest[i] for i in predicted]

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
predicted_classes.to_csv(os.path.join(output_dir, 'predicted.csv'), index=True, index_label='ID')

# Simple count plot: number of predicted items per label.
plt.figure(figsize=(9, 6))
seaborn.countplot(x="label", data=predicted_classes)
plt.xticks(rotation=60)