# Input Validation

This Python notebook lets you quickly validate the inputs for a Joint Call Set.
To run it:

Set the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) that contains the list of samples to process, then run all of the cells.

The notebook will validate that:
- the sample set that you have listed is found
- there are no duplicate samples in the sample set
- there are no empty sample names in the sample set
- each sample has a corresponding reblocked_gvcf index

To determine which columns hold the reblocked_gvcf and its index, the notebook looks for column names from two predefined lists (see `reblocked_gvcf_fields` and `reblocked_gvcf_index_fields` below). It expects to find exactly one name from each list in the sample table; finding none, or more than one, is reported as an error. If you are using column names that are NOT in those lists, they will need to be added to the lists.


In [33]:
# Name of the sample_set (in the current workspace) whose samples will be validated.
sample_set_id = "gvs_demo_10"


In [34]:
import os
import os.path
import math

from firecloud import api as fapi
from tqdm import tqdm


In [35]:
def get_field_name(possible_field_names, attribute_names, type_string):
    """Find the single column of the 'sample' datatable that holds a given kind of data.

    Args:
        possible_field_names: candidate column names to look for.
        attribute_names: column names actually present in the 'sample' datatable.
        type_string: human-readable description of what the column holds
            (e.g. "reblocked gvcf"); used in the error messages.

    Returns:
        A tuple ``(field_name, error_seen)``. ``field_name`` is the one candidate
        that is present in ``attribute_names``, or ``None`` when no candidate or
        more than one candidate is present. ``error_seen`` is ``True`` in those
        two failure cases, and an ERROR line is printed for each.
    """
    # Preserve the candidates' declared order (and drop repeats) so that the
    # error output is the same from run to run.
    field_names_found = [
        candidate
        for candidate in dict.fromkeys(possible_field_names)
        if candidate in attribute_names
    ]

    if len(field_names_found) == 1:
        return field_names_found[0], False

    if field_names_found:
        # Ambiguous: report using type_string so the message is also correct
        # when this is called for the index column.
        print(f"ERROR: There are multiple columns in the 'sample' datatable {field_names_found} that potentially contain the {type_string}")
    else:
        print(f"ERROR: No column for {type_string} in the 'sample' datatable")
    return None, True


In [36]:
# Validate the inputs for the sample_set named by `sample_set_id`.
# Each check prints an ERROR line and sets `errors_seen`; later stages are
# skipped once an earlier stage has failed, and a summary is printed at the end.

# The workspace to validate is taken from the notebook environment.
ws_project = os.environ['WORKSPACE_NAMESPACE']
ws_name = os.environ['WORKSPACE_NAME']

print("Validating inputs for: ")
print("Project: " + ws_project)
print("Workspace: " + ws_name)

errors_seen = False

# Defined up front so that the final "FYI" report works (and cannot pick up a
# stale value from an earlier run) even when validation stops before the
# columns have been resolved.
gvcf_field = None
gvcf_index_field = None

# This is a list of all of the *possible* field names for reblocked gvcfs and their corresponding indices
reblocked_gvcf_fields = ['reblocked_gvcf',
                         'reblocked_gvcf_path',
                         'hg38_reblocked_gvcf']
reblocked_gvcf_index_fields = [
                         'reblocked_gvcf_index',
                         'reblocked_gvcf_index_path',
                         'hg38_reblocked_gvcf_index']

etype = 'sample'

# Fetched once; reused below for the sample table's column names and row count.
entity_types = fapi.list_entity_types(ws_project, ws_name).json()
if (("sample" not in entity_types) or ("sample_set" not in entity_types)):
    errors_seen = True
    print(f"ERROR: Not all expected entities (sample, sample_set) were found in workspace")

if (not errors_seen):
    sample_set = fapi.get_entity(ws_project, ws_name, "sample_set", sample_set_id).json()
    if ("attributes" not in sample_set):
        errors_seen = True
        # Fall back to the whole payload if the response carries no "message".
        error_message = sample_set.get("message", sample_set)
        print(f"ERROR: Looking up {sample_set_id}: {error_message}")

if (not errors_seen):
    samples_in_sample_set = set()
    samples_dupes = set()

    for entity in sample_set['attributes']['samples']['items']:
        sample_id = entity['entityName']

        if sample_id in samples_in_sample_set:
            samples_dupes.add(sample_id)
        else:
            samples_in_sample_set.add(sample_id)

    # Are there any empty sample_ids?
    if ('' in samples_in_sample_set):
        errors_seen = True
        # Drop the empty name so it is not counted or looked up below.
        samples_in_sample_set.discard('')
        print("ERROR: sample_id set to an empty string.")

    # Are all the sample names unique?
    if (len(samples_dupes) > 0):
        errors_seen = True
        print(f"ERROR: Found {str(len(samples_dupes))}  duplicate sample_ids: ")
        print(sorted(samples_dupes))

    print(f"Found {str(len(samples_in_sample_set))} samples in sample_set '{sample_set_id}'")

if (not errors_seen):
    # Determine if there are more than one data columns for reblocked_gvcf and its index
    attribute_names = entity_types[etype]["attributeNames"]

    # Inspect sample table - determine possible names for reblocked_gvcfs.
    gvcf_field, error_seen = get_field_name(reblocked_gvcf_fields, attribute_names, "reblocked gvcf")
    if (error_seen):
        errors_seen = True

    # Inspect sample table - determine possible names for reblocked_gvcf indices.
    gvcf_index_field, error_seen = get_field_name(reblocked_gvcf_index_fields, attribute_names, "reblocked gvcf index")
    if (error_seen):
        errors_seen = True

if (not errors_seen):
    entity_count = entity_types[etype]["count"]

    page_size = 1000
    num_pages = int(math.ceil(float(entity_count) / page_size))

    # Samples of the set that were actually encountered in the sample table.
    samples_checked = set()

    # get entities by page where each page has page_size # of rows using API call
    for page in tqdm(range(1, num_pages + 1)):
        page_of_entities = fapi.get_entities_query(ws_project, ws_name, etype, page=page,
                                                   page_size=page_size).json()

        for entity in page_of_entities['results']:
            sample_id = entity['name']

            # Only samples that belong to the sample set are validated.
            if (sample_id not in samples_in_sample_set):
                continue
            samples_checked.add(sample_id)

            reblocked_gvcf = entity['attributes'].get(gvcf_field)
            reblocked_gvcf_index = entity['attributes'].get(gvcf_index_field)

            if (reblocked_gvcf is None) or (reblocked_gvcf_index is None):
                errors_seen = True
                if (reblocked_gvcf is None):
                    print(f"ERROR: reblocked_gvcf not found for sample_id: {sample_id}")
                if (reblocked_gvcf_index is None):
                    print(f"ERROR: reblocked_gvcf_index not found for sample_id: {sample_id}")
                continue

            # The index is expected to be named after the gvcf file:
            # "<name>.gz" -> "<name>.gz.tbi", "<name>.vcf" -> "<name>.vcf.idx".
            reblocked_gvcf_name = os.path.basename(reblocked_gvcf)
            gvcf_extension = os.path.splitext(reblocked_gvcf_name)[1]
            if (gvcf_extension == ".gz"):
                expected_reblocked_gvcf_index_name = reblocked_gvcf_name + ".tbi"
            elif (gvcf_extension == ".vcf"):
                expected_reblocked_gvcf_index_name = reblocked_gvcf_name + ".idx"
            else:
                errors_seen = True
                print(f"ERROR: Unrecognized extension \"{gvcf_extension}\" for {gvcf_field}: {reblocked_gvcf_name}")
                continue

            reblocked_gvcf_index_name = os.path.basename(reblocked_gvcf_index)
            if (reblocked_gvcf_index_name != expected_reblocked_gvcf_index_name):
                errors_seen = True
                print(f"ERROR: The {gvcf_index_field} found \"{reblocked_gvcf_index_name}\" does not match that expected: \"{expected_reblocked_gvcf_index_name}\"")

    # A sample that never showed up in the sample table has not been validated;
    # report it rather than silently counting it as a success.
    samples_not_found = samples_in_sample_set - samples_checked
    if (len(samples_not_found) > 0):
        errors_seen = True
        print(f"ERROR: {len(samples_not_found)} samples in sample_set '{sample_set_id}' were not found in the 'sample' datatable: ")
        print(sorted(samples_not_found))


if (errors_seen):
    print("\nErrors were seen - The inputs have not been validated\n")
else:
    print("Successfully Validated GVS Inputs")
    print("Validated that all samples have non-empty, and unique names")
    print("Validated that reblocked gvcfs and indices were found in the data model")
    print("Validated that all samples' reblocked gvcfs have corresponding indices\n")

if (gvcf_field is not None):
    print(f"FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: {gvcf_field}")
if (gvcf_index_field is not None):
    print(f"FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: {gvcf_index_field}")




Validating inputs for: 
Project: warp-pipelines
Workspace: ggrant - GVS Quickstart V2 copy
Found 10 samples in sample_set 'gvs_demo_10'


100%|██████████| 1/1 [00:00<00:00, 14.19it/s]

Successfully Validated GVS Inputs
Validated that all samples have non-empty, and unique names
Validated that reblocked gvcfs and indices were found in the data model
Validated that all samples' reblocked gvcfs have corresponding indices

FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: hg38_reblocked_gvcf
FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: hg38_reblocked_gvcf_index



