# Injecting extra samples in pipeline simple protocols

Sometimes our experiments go beyond "simple" database protocols.
Sometimes we just want to analyze the impact of some extra samples in our experiments without writing a whole dataset interface for that.

This notebook shows how to "inject" samples that don't belong to any protocol into an existing protocol.
We'll showcase how to inject samples to perform score normalization.

## Preparing the database

We'll showcase how to perform this injection using the MEDS dataset.

In [8]:
# No Dask client by default; `compute_scores` falls back to the local
# "single-threaded" scheduler while this stays `None`.
dask_client = None

# Root directory passed to the score writer and the checkpoint wrappers, and
# used to build the per-group score paths.
# "YOUR_TEMP" looks like a placeholder — presumably to be replaced by a real path.
OUTPUT_PATH = "YOUR_TEMP"
# Base directory that the relative paths of the injected images are joined to.
PATH_INJECTED_DATA = "/idiap/temp/parzul/db_gen_output/database_neutralized/image/00000/"


##### CHANGE YOUR DATABASE HERE
from bob.bio.face.database import MEDSDatabase

# MEDS database instantiated with the "verification_fold1" protocol.
database = MEDSDatabase(protocol="verification_fold1")

# Fetching the keys: concatenate the `references` attribute of the first probe
# sample set of the "eval" group with that of the "dev" group. The injected
# z-probes are later built with this combined list as their `references`.
# NOTE(review): assumes the first probe of each group lists every reference of
# that group — confirm against the database interface.
# Alternative kept for reference (reuse the list from the database's z-probes):
#references = database.zprobes()[0].references
references = database.probes(group="eval")[0].references + database.probes(group="dev")[0].references


# Loading samples that will be injected

Here we'll inject samples for znorm and tnorm

In [2]:
# PATH
import os
import functools
import bob.io.base
# Fetching real data
#treferences = database.treferences()
#zprobes = database.zprobes()

# Eye annotations attached to every injected sample; the same fixed values are
# reused for all injected images.
# NOTE(review): presumably (y, x) pixel coordinates — confirm.
eyes_annotations={'leye': (61, 120),
                  'reye': (61, 63)}


# Image paths (relative to PATH_INJECTED_DATA) turned into the `treferences`
# sample sets, i.e. the samples injected for T-norm.
treferences_lst = ["0/0_ethnicity_0.png",
                   "0/0_ethnicity_7.png"]

# Image paths (relative to PATH_INJECTED_DATA) turned into the `zprobes`
# sample sets, i.e. the samples injected for Z-norm.
zprobes_lst = ["1/1_ethnicity_0.png",
               "1/1_ethnicity_7.png"]

from bob.pipelines import Sample, DelayedSample, SampleSet

# Converting every element in a list in a sample set
def list_to_sampleset(lst, base_path, eyes_annotations, references):
    """Wrap every relative path of ``lst`` in its own single-sample ``SampleSet``.

    Parameters
    ----------
    lst
        Iterable of file paths relative to ``base_path``.
    base_path
        Directory that each entry of ``lst`` is joined to.
    eyes_annotations
        Annotations attached, unchanged, to every created sample.
    references
        Value stored as the ``references`` attribute of every sample set.

    Returns
    -------
    list
        One ``SampleSet`` per entry of ``lst``, in the same order. Both the
        sample and its set use the relative path as ``key`` and the entry's
        position (as a string) as ``reference_id``.
    """

    def _as_sampleset(position, relative_path):
        identity = str(position)
        # The file is not read here: only a partial of the loader is stored.
        loader = functools.partial(
            bob.io.base.load, os.path.join(base_path, relative_path)
        )
        delayed = DelayedSample(
            loader,
            key=relative_path,
            reference_id=identity,
            annotations=eyes_annotations,
        )
        return SampleSet(
            samples=[delayed],
            key=relative_path,
            reference_id=identity,
            references=references,
        )

    return [
        _as_sampleset(position, relative_path)
        for position, relative_path in enumerate(lst)
    ]


# Injected T-norm references: built without a `references` list (None).
treferences = list_to_sampleset(treferences_lst, PATH_INJECTED_DATA,eyes_annotations, references=None)
# Injected Z-norm probes: they carry the `references` list collected from the
# database probes.
zprobes = list_to_sampleset(zprobes_lst, PATH_INJECTED_DATA, eyes_annotations, references=references)



## Preparing the pipeline

Here we use the ArcFace model from InsightFace (https://github.com/deepinsight/insightface).
Feel free to swap it for another model from `bob.bio.face.embeddings`.

In [3]:
import os
from bob.bio.base.pipelines import checkpoint_pipeline_simple
from bob.bio.base.pipelines import dask_pipeline_simple
from bob.bio.base.pipelines import ZTNormPipeline, ZTNormCheckpointWrapper
from bob.bio.base.pipelines import CSVScoreWriter

from bob.bio.face.embeddings.mxnet import arcface_insightFace_lresnet100
# Baseline pipeline: ArcFace (InsightFace LResNet100), configured with the
# annotation type declared by the database and no fixed positions.
pipeline = arcface_insightFace_lresnet100(annotation_type=database.annotation_type,
                                          fixed_positions=None,
                                          memory_demanding=False)


## SCORE WRITER
# Here we want the pipeline to write scores using METADATA, hence the CSV
# score writer; it is given the path OUTPUT_PATH/./tmp.
pipeline.score_writer = CSVScoreWriter(os.path.join(OUTPUT_PATH, "./tmp"))


# Aggregating with checkpoint: wrap the pipeline with checkpointing rooted at
# OUTPUT_PATH.
pipeline = checkpoint_pipeline_simple(pipeline, OUTPUT_PATH)


# One-line alternative kept for reference:
#pipeline = dask_pipeline_simple(ZTNormCheckpointWrapper(ZTNormPipeline(pipeline), OUTPUT_PATH))
# AGGREGATING WITH ZTNORM
# The pipeline is wrapped for ZT-norm first; its solver is then replaced by a
# checkpointing wrapper pointed at OUTPUT_PATH/normed-scores.
pipeline = ZTNormPipeline(pipeline)
pipeline.ztnorm_solver = ZTNormCheckpointWrapper(
    pipeline.ztnorm_solver, os.path.join(OUTPUT_PATH, "normed-scores")
)
# Finally wrap everything for Dask execution with a partition size of 200.
pipeline = dask_pipeline_simple(pipeline, partition_size=200)

# Display the steps of the resulting transformer.
print(pipeline.transformer)

Pipeline(steps=[('ToDaskBag', ToDaskBag(partition_size=200)),
                ('cropper',
                 DaskWrapper(estimator=CheckpointWrapper(estimator=SampleWrapper(estimator=FaceCrop(annotator=MTCNN(),
                                                                                                    cropped_image_size=(112,
                                                                                                                        112),
                                                                                                    cropped_positions={'leye': (55,
                                                                                                                                81),
                                                                                                                       'reye': (55,
                                                                                                                                42)}),
        

## Setting the DASK client (optional step; do it if you want to use the grid)

**HERE MAKE ABSOLUTELY SURE THAT YOU RUN `SETSHELL grid` BEFORE STARTING THE NOTEBOOK**



In [4]:
from dask.distributed import Client
from bob.pipelines.distributed.sge import SGEMultipleQueuesCluster

# Create an SGE-backed cluster and connect a Dask client to it. Once this cell
# has run, `dask_client` is no longer None and `compute_scores` uses it as the
# scheduler.
# NOTE(review): `min_jobs=1` presumably keeps at least one job alive — confirm.
cluster = SGEMultipleQueuesCluster(min_jobs=1)
dask_client = Client(cluster)

As an example, we consider 10 samples from this database and extract features for these samples:

## Running the PipelineSimple

In [5]:
import os
def post_process_scores(pipeline, scores, path):
    """Write ``scores`` with ``pipeline`` and post-process the written result.

    ``scores`` is handed to ``pipeline.write_scores``; whatever that returns
    is passed, together with ``path``, to ``pipeline.post_process``, whose
    return value is returned unchanged.
    """
    return pipeline.post_process(pipeline.write_scores(scores), path)

def _build_filename(score_file_name, suffix):
    """Return ``suffix`` joined onto ``score_file_name`` with ``os.path.join``."""
    path_parts = (score_file_name, suffix)
    return os.path.join(*path_parts)

from dask.delayed import Delayed
import dask.bag
def compute_scores(result, dask_client):
    """Compute ``result`` when it is a Dask ``Delayed`` or ``Bag``.

    Any other kind of ``result`` is returned untouched. When ``dask_client``
    is given it is used as the scheduler; otherwise a notice is printed and
    the "single-threaded" scheduler is used.
    """
    if not isinstance(result, (Delayed, dask.bag.Bag)):
        return result

    if dask_client is None:
        print("`dask_client` not set. Your pipeline will run locally")
        scheduler = "single-threaded"
    else:
        scheduler = dask_client
    return result.compute(scheduler=scheduler)

background_model_samples = database.background_model_samples()

# Sub-folder names of the score sets, in the order they are written:
# raw, Z-norm, T-norm, S-norm and finally ZT-norm.
score_kinds = (
    "raw_scores",
    "z_normed_scores",
    "t_normed_scores",
    "s_normed_scores",
    "zt_normed_scores",
)

for group in ("dev", "eval"):
    # Every group gets its own output folder under OUTPUT_PATH.
    score_file_name = os.path.join(OUTPUT_PATH, f"scores-{group}")
    biometric_references = database.references(group=group)
    probes = database.probes(group=group)

    # The injected `zprobes` and `treferences` are passed next to the samples
    # coming from the database protocol.
    (
        raw_scores,
        z_normed_scores,
        t_normed_scores,
        zt_normed_scores,
        s_normed_scores,
    ) = pipeline(
        background_model_samples,
        biometric_references,
        probes,
        zprobes,
        treferences,
        score_all_vs_all=True,
    )

    scores_by_kind = {
        "raw_scores": raw_scores,
        "z_normed_scores": z_normed_scores,
        "t_normed_scores": t_normed_scores,
        "s_normed_scores": s_normed_scores,
        "zt_normed_scores": zt_normed_scores,
    }

    # Post-process and compute one score set at a time.
    for kind in score_kinds:
        post_processed = post_process_scores(
            pipeline,
            scores_by_kind[kind],
            _build_filename(score_file_name, kind),
        )
        _ = compute_scores(post_processed, dask_client)



In the following cells, we convert the extracted features to `numpy.array` and check the size of features.

In [6]:
# KILL THE SGE WORKERS
# Setting up the Dask client is an optional step: when it was skipped,
# `dask_client` is still None and there is nothing to shut down.
if dask_client is not None:
    dask_client.shutdown()