In [1]:
# Version History
#   1.0.0: 09/15/2022 2:06pm - Nate Calvanese - First version created
# Only the latest entry is printed, so the cell output records which version was run.
print(
    "Version 1.0.1: 09/16/2022 3:10pm - Nate Calvanese - "
    "Shifted from transform to mapping compatibility"
)


Version 1.0.1: 09/16/2022 3:10pm - Nate Calvanese - Shifted from transform to mapping compatibility


In [2]:
#!pip install --upgrade import_ipynb

In [1]:
## Imports and environment variables
# imports
import import_ipynb
import pandas as pd
import json
import re
import os
from google.cloud import storage
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq

# Configure pandas display
# Each (option, value) pair is applied in order; the settings are global to this kernel.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

# workspace environment variables
# All three are required: a missing variable raises KeyError right here.
ws_name, ws_project, ws_bucket = (
    os.environ[env_var]
    for env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket value with a leading gs:// removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

print(
    f"workspace name = {ws_name}",
    f"workspace project = {ws_project}",
    f"workspace bucket = {ws_bucket}",
    f"workspace bucket name = {ws_bucket_name}",
    sep="\n",
)

importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.7: 09/26/2022 4:35pm - Nate Calvanese - Added schema reconciliation functionality to TDR dataset patching
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.3: 09/23/2022 11:53am - Nate Calvanese - Appended source workspace to file names
importing Jupyter notebook from build_file_inventory.ipynb
Version 1.0.3: 10/03/2022 12:15pm - Nate Calvanese - Flattened target file path to help with ingest performance
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.1: 9/16/2022 10:57am - Nate Calvanese - Fixed bug in file_inventory table creation
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.5: 09/21/2022 11:58am - Nate Calvanese - Made multi-column array agg return array with distinct values
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.2: 09/14/2022 10:23am -- Made output directory and validation schema more configurable
workspac

In [2]:
## Inputs:

# Mapping specification to evaluate
mapping_target = "anvil"
mapping_target_spec = "cmg_ext_1"


def _file_path_ref(column):
    """Return the file-reference entry used for ``column``.

    Every entry configured in this notebook shares the same settings and differs
    only by its source column; the derived field is named ``<column>_file_id``.
    """
    return {
        "column": column,
        "method": "file_path_match",
        "match_multiple_files": True,
        "match_regex": None,
        "create_new_field": True,
        "new_field_name": column + "_file_id",
    }


# Any known data_file_refs, so file ref fields can be properly evaluated
# Keys are source file names; values list the file-reference columns of that file.
data_file_refs = {
    "ws_sequencing.tsv": [
        _file_path_ref(column)
        for column in (
            "sequencing_id",
            "seq_filename",
            "capture_region_bed_file",
            "file_id",
            "cram",
        )
    ],
    "ws_sample.tsv": [
        _file_path_ref(column)
        for column in (
            "crai",
            "cram",
            "bai",
            "bam",
            "csi",
            "tbi",
            "vcf",
            "cram_id",
        )
    ],
}

In [3]:
# Read the workspace_schemas.csv file from the workspace bucket into a dataframe
ws_schema_filepath = ws_bucket + '/ingest_pipeline/resources/mapping_compatibility/workspace_schemas.csv'
df = pd.read_csv(ws_schema_filepath)

# Keep only the rows of the workspace under evaluation and convert them to a list of record dicts.
# To evaluate every workspace in the file instead, use: int_dict = df.to_dict(orient="records")
is_target_workspace = df["workspace_name"] == 'anvil_cmg_uwash_gru'
int_dict = df.loc[is_target_workspace].to_dict(orient="records")

# Derived pipeline tables: Should be added to every source workspace schema
# Both use the same {"name": ..., "columns": [{"name": ...}, ...]} layout as the workspace tables.
file_inv_dict = {
    "name": "ws_file_inventory",
    "columns": [
        {"name": column_name}
        for column_name in (
            "file_id",
            "name",
            "path",
            "uri",
            "content_type",
            "full_extension",
            "size_in_bytes",
            "crc32c",
            "md5_hash",
            "file_ref",
        )
    ],
}
ws_attr_dict = {
    "name": "ws_workspace_attributes",
    "columns": [{"name": column_name} for column_name in ("attribute", "value")],
}

# Re-organize workspace dict into TDR-like schema to enable use of existing functions
def _append_file_ref_columns(table, file_refs):
    """Append the derived file-reference columns configured for ``table``.

    Args:
        table: table dict of the form {"name": ..., "columns": [{"name": ...}, ...]};
            modified in place.
        file_refs: dict keyed by file name whose part before the first "." is compared
            with the table name; each value is a list of entries with "column",
            "create_new_field" and "new_field_name" keys.
    """
    column_names = {column["name"] for column in table["columns"]}
    for file_name, ref_entries in file_refs.items():
        if file_name.split(".")[0] != table["name"]:
            continue
        for entry in ref_entries:
            # Only add the derived field when its source column exists in this table
            if entry["column"] in column_names and entry["create_new_field"] == True:
                table["columns"].append({"name": entry["new_field_name"]})
                # Track the new name so a later entry may reference it as its source column
                column_names.add(entry["new_field_name"])


def _build_workspace_schemas(records, file_refs, derived_tables):
    """Group flat workspace/table/column records into one schema per workspace.

    Records are grouped by workspace and table name in first-seen order, so the
    input does not need to be sorted. Each table starts with its entity id column
    (``<table_name>_id``), followed by one column per record, followed by any
    derived file-reference columns. Derived file-reference columns are added to
    every table, including the last table of each workspace.

    Args:
        records: list of dicts with "workspace_name", "table_name" and "column_name" keys.
        file_refs: file-reference configuration (see ``_append_file_ref_columns``).
        derived_tables: table dicts appended, as-is, after each workspace's own tables.

    Returns:
        dict mapping workspace name to {"tables": [table dict, ...]}.
    """
    # workspace name -> {table name -> table dict}; dicts keep first-seen order
    tables_by_workspace = {}
    for record in records:
        workspace_tables = tables_by_workspace.setdefault(record["workspace_name"], {})
        table_name = "ws_" + record["table_name"]
        if table_name not in workspace_tables:
            # A new table always begins with its entity id column
            workspace_tables[table_name] = {
                "name": table_name,
                "columns": [{"name": record["table_name"] + "_id"}],
            }
        workspace_tables[table_name]["columns"].append({"name": record["column_name"]})

    schemas = {}
    for workspace_name, workspace_tables in tables_by_workspace.items():
        table_list = []
        for table in workspace_tables.values():
            _append_file_ref_columns(table, file_refs)
            table_list.append(table)
        table_list.extend(derived_tables)
        schemas[workspace_name] = {"tables": table_list}
    return schemas


workspace_dict = _build_workspace_schemas(int_dict, data_file_refs, [file_inv_dict, ws_attr_dict])
#print(json.dumps(workspace_dict))
                              
# Read in mapping specification and target schema
# Both objects are required by the steps below (they index "entities" and "tables"), so a
# failed download is reported and then re-raised rather than continuing with the empty
# placeholders, which would only fail later with a less helpful KeyError.
target_schema_dict = {}
mapping_spec = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    # download_as_bytes replaces the deprecated download_as_string alias; json.loads accepts bytes
    target_schema_dict = json.loads(blob.download_as_bytes())
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    raise
#print(json.dumps(target_schema_dict))
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
    blob_string = blob.download_as_text()
    # Substitute placeholder tokens so the specification parses and can be evaluated
    blob_string = blob_string.replace("$DATASET_NAME", "Dataset").replace("$PROJECT_NAME", "Project") #UPDATE WITH REAL PARAMETERS
    mapping_spec = json.loads(blob_string)
except Exception as e:
    print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))
    raise
#print(json.dumps(mapping_spec))

# Loop through workspaces and evaluate mapping compatibility
# detail_dict layout: {workspace: {"entities": [{entity name: [per-record-set result, ...]}, ...]}}
detail_dict = {}
for ws_key, ws_schema in workspace_dict.items():
    entity_list = []
    for entity in mapping_spec["entities"]:
        # Use the last target table whose name matches the entity; entities without one are skipped
        matching_tables = [tbl for tbl in target_schema_dict["tables"] if tbl["name"] == entity["name"]]
        target_table = matching_tables[-1] if matching_tables else {}
        if not target_table:
            continue
        record_set_list = []
        for record_set in entity["record_sets"]:
            record_set_name = record_set["name"]
            can_run = bool(bmq.validate_record_set(record_set, ws_schema, target_table))
            attributes = record_set["attributes"]
            valid_attr_count = 0
            invalid_attrs_set = set()
            for attribute in attributes:
                if bmq.validate_attribute(attribute, ws_schema, target_table):
                    valid_attr_count += 1
                else:
                    invalid_attrs_set.add(target_table["name"] + "." + attribute["name"])
            record_set_list.append({
                "record_set": record_set_name,
                "can_run": can_run,
                "total_attrs": len(attributes),
                "valid_attrs": valid_attr_count,
                "invalid_attrs_set": list(invalid_attrs_set),
            })
        entity_list.append({entity["name"]: record_set_list})
    detail_dict[ws_key] = {"entities": entity_list}
#print(json.dumps(detail_dict))

# Collect target tables and columns not in mapping specification
# Entity (table) names and fully qualified "<entity>.<attribute>" names covered by the specification
entity_table_list = [entity["name"] for entity in mapping_spec["entities"]]
entity_column_list = [
    entity["name"] + "." + attribute["name"]
    for entity in mapping_spec["entities"]
    for record_set in entity["record_sets"]
    for attribute in record_set["attributes"]
]

# Target tables that no entity in the specification maps to
missing_table_set = {
    table_entry["name"]
    for table_entry in target_schema_dict["tables"]
    if table_entry["name"] not in entity_table_list
}

# Columns of mapped target tables that no attribute in the specification populates
missing_column_set = {
    table_entry["name"] + "." + column_entry["name"]
    for table_entry in target_schema_dict["tables"]
    if table_entry["name"] in entity_table_list
    for column_entry in table_entry["columns"]
    if table_entry["name"] + "." + column_entry["name"] not in entity_column_list
}

# Summarize mapping compatibility
def _summarize_workspace(ws_key, ws_detail):
    """Collapse the per-record-set results of one workspace into a single results row.

    Args:
        ws_key: workspace name.
        ws_detail: dict of the form {"entities": [{entity name: [record set result, ...]}, ...]},
            where each record set result has "can_run", "total_attrs", "valid_attrs" and
            "invalid_attrs_set" keys.

    Returns:
        list: [workspace, can_run_count, can_run_entities, can_fully_run_count,
        can_fully_run_entities, cnt_valid_attrs, perc_valid_attrs, invalid_attr_list].
        perc_valid_attrs is 0.0 when no attributes were evaluated.
    """
    can_run_set = set()
    can_run_fully_set = set()
    sum_valid_attrs = 0
    sum_total_attrs = 0
    invalid_attrs_list = []
    for entities in ws_detail["entities"]:
        max_valid_attrs = 0
        max_total_attrs = 0
        for key, val in entities.items():
            # Start empty so an entity without record sets contributes nothing instead of failing
            invalid_attrs_set = set()
            for record_sets in val:
                if record_sets["can_run"]:
                    can_run_set.add(key)
                # "Fully" is judged on attribute counts only, independently of can_run
                if record_sets["total_attrs"] == record_sets["valid_attrs"]:
                    can_run_fully_set.add(key)
                # An entity is scored by the largest counts seen across its record sets
                max_valid_attrs = max(max_valid_attrs, record_sets["valid_attrs"])
                max_total_attrs = max(max_total_attrs, record_sets["total_attrs"])
                invalid_attrs_set.update(record_sets["invalid_attrs_set"])
            sum_valid_attrs += max_valid_attrs
            sum_total_attrs += max_total_attrs
            # Sorted so the reported order does not depend on set iteration order
            invalid_attrs_list.extend(sorted(invalid_attrs_set))
    # Guard against a workspace for which no attributes were evaluated
    percent_valid_attrs = round(sum_valid_attrs / sum_total_attrs, 2) if sum_total_attrs else 0.0
    return [
        ws_key,
        len(can_run_set),
        can_run_set,
        len(can_run_fully_set),
        can_run_fully_set,
        sum_valid_attrs,
        percent_valid_attrs,
        invalid_attrs_list,
    ]


results_list = [_summarize_workspace(ws_key, value) for ws_key, value in detail_dict.items()]

results_df = pd.DataFrame(results_list, columns = ['workspace', 'can_run_count', 'can_run_entities', 'can_fully_run_count', 'can_fully_run_entities', 'cnt_valid_attrs', 'perc_valid_attrs', 'invalid_attr_list'])

# Sort results dataframe and write out to tsv
sorted_df = results_df.sort_values(['can_run_count', 'perc_valid_attrs'], ascending=[False, False], ignore_index=True)
output_file = "mapping_compatibility_results.tsv"
destination_dir = "ingest_pipeline/resources/mapping_compatibility/output"
sorted_df.to_csv(output_file, index=False, sep="\t")
# Upload through the workspace bucket handle created earlier in this cell instead of shelling
# out to gsutil: the former `2> stdout` redirect left a stray file named "stdout" in the
# working directory and hid any upload failure.
try:
    bucket.blob(f"{destination_dir}/{output_file}").upload_from_filename(output_file)
except Exception as e:
    # Best effort: report the failure but still show the results below
    print("Error uploading {} to {}/{}/. Error: {}".format(output_file, ws_bucket, destination_dir, e))
finally:
    # Remove the local scratch copy whether or not the upload succeeded
    if os.path.exists(output_file):
        os.remove(output_file)

# Output results to the user
banner_rule = "------------------------------------------------------------------------------------------------------"
print(banner_rule)
print("Mapping Compatibility Results for Mapping Target Specification: {}/{}".format(mapping_target, mapping_target_spec))
print(banner_rule)
# Each section: heading, tab-indented comma-separated names, then blank lines
for heading, missing_names in (
    ("Target tables not included in specification:", missing_table_set),
    ("Target fields not included in specification:", missing_column_set),
):
    print(heading)
    print("\t" + ", ".join(missing_names))
    print("\n")
print("Workspace evaluation against specification:")
display(sorted_df)


------------------------------------------------------------------------------------------------------
Mapping Compatibility Results for Mapping Target Specification: anvil/cmg_ext_1
------------------------------------------------------------------------------------------------------
Target tables not included in specification:
	assayactivity, antibody, variantcallingactivity, alignmentactivity


Target fields not included in specification:
	donor.source_datarepo_row_ids, diagnosis.diagnosis_age_upper_bound, biosample.diagnosis_id, sequencingactivity.source_datarepo_row_ids, diagnosis.diagnosis_age_lower_bound, biosample.apriori_cell_type, project.funded_by, file.reference_assembly, biosample.donor_age_at_collection_lower_bound, activity.source_datarepo_row_ids, diagnosis.source_datarepo_row_ids, project.principal_investigator, donor.genetic_ancestry, sequencingactivity.assay_type, diagnosis.phenopacket, biosample.donor_age_at_collection_unit, biosample.donor_age_at_collection_upper_b

Unnamed: 0,workspace,can_run_count,can_run_entities,can_fully_run_count,can_fully_run_entities,cnt_valid_attrs,perc_valid_attrs,invalid_attr_list
0,anvil_cmg_uwash_gru,7,"{sequencingactivity, diagnosis, dataset, project, biosample, file, donor}",6,"{sequencingactivity, dataset, project, biosample, file, donor}",35,0.9,"[diagnosis.onset_age_lower_bound, diagnosis.onset_age_upper_bound, diagnosis.onset_age_unit, activity.generated_file_id]"


In [5]:
# Print detailed results for specific workspace
# detail_dict only holds the workspaces that were evaluated, so a workspace that is not in it
# is reported instead of raising KeyError.
detail_workspace = "anvil_cmg_uwash_ds-hfa"
if detail_workspace in detail_dict:
    print(json.dumps(detail_dict[detail_workspace], indent=2))
else:
    print("Workspace '{}' not found in detail_dict. Evaluated workspaces: {}".format(detail_workspace, ", ".join(sorted(detail_dict))))

{
  "entities": [
    {
      "donor": [
        {
          "record_set": "default",
          "can_run": true,
          "total_attrs": 5,
          "valid_attrs": 3,
          "invalid_attrs_set": [
            "donor.reported_ethnicity",
            "donor.diagnosis_id"
          ]
        }
      ]
    },
    {
      "biosample": [
        {
          "record_set": "default",
          "can_run": false,
          "total_attrs": 5,
          "valid_attrs": 2,
          "invalid_attrs_set": [
            "biosample.biosample_type",
            "biosample.anatomical_site",
            "biosample.biosample_id"
          ]
        }
      ]
    },
    {
      "diagnosis": [
        {
          "record_set": "default",
          "can_run": false,
          "total_attrs": 6,
          "valid_attrs": 1,
          "invalid_attrs_set": [
            "diagnosis.onset_age_unit",
            "diagnosis.disease",
            "diagnosis.onset_age_lower_bound",
            "diagnosis.diagnosis_id

In [8]:
# Print workspace dict for specific workspace
# workspace_dict only holds the workspaces that were loaded, so a workspace that is not in it
# is reported instead of raising KeyError.
schema_workspace = "anvil_gtex_bcm_gru_corsivs"
if schema_workspace in workspace_dict:
    print(json.dumps(workspace_dict[schema_workspace], indent=2))
else:
    print("Workspace '{}' not found in workspace_dict. Loaded workspaces: {}".format(schema_workspace, ", ".join(sorted(workspace_dict))))

{
  "tables": [
    {
      "name": "ws_sample",
      "columns": [
        {
          "name": "sample_id"
        },
        {
          "name": "dbgap_sample_id"
        },
        {
          "name": "sample_provider"
        },
        {
          "name": "subject_id"
        },
        {
          "name": "submitter_id"
        }
      ]
    },
    {
      "name": "ws_sequencing",
      "columns": [
        {
          "name": "sequencing_id"
        },
        {
          "name": "alignment_method"
        },
        {
          "name": "analyte_type"
        },
        {
          "name": "capture_region_bed_file"
        },
        {
          "name": "data_processing_pipeline"
        },
        {
          "name": "date_data_generation"
        },
        {
          "name": "exome_capture_platform"
        },
        {
          "name": "fastq"
        },
        {
          "name": "functional_equivalence_standard"
        },
        {
          "name": "library_prep_kit_m