In [None]:
# Version History
# Earlier entries are kept as commented-out prints; only the most recent entry is printed when the cell runs.
# NOTE(review): "Version 1.0.4" with the same timestamp appears both in the last commented-out entry
# and in the active print below -- confirm whether the active line was meant to be a new version number.
#print("Version 1.0.0: 09/15/2022 2:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/16/2022 3:10pm - Nate Calvanese - Shifted from transform to mapping compatibility")
#print("Version 1.0.2: 10/14/2022 7:40pm - Nate Calvanese - Added compatibility evaluation and support for multiple mapping specs")
#print("Version 1.0.3: 10/18/2022 1:33pm - Nate Calvanese - Encoded column names to match mapping specifications")
#print("Version 1.0.4: 10/20/2022 11:50am - Nate Calvanese - Added ability to pull schemas for workspaces missing from workspace_schemas.csv")
print("Version 1.0.4: 10/20/2022 11:50am - Nate Calvanese - Improved algorithm and added a compatibility score")


In [None]:
#!pip install --upgrade import_ipynb

# Main Script

## Imports and Helpers

In [1]:
## Imports and environment variables
# imports
import import_ipynb
import pandas as pd
import json
import re
import os
from google.cloud import storage
from firecloud import api as fapi
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
import data_repo_client

# Pandas display settings: no truncation of rows, columns or cell contents,
# a 1000-character wide console, centered column headers and 3-digit precision.
_pandas_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    "display.max_colwidth": None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for _option_name, _option_value in _pandas_display_options.items():
    pd.set_option(_option_name, _option_value)

# Workspace settings read from the notebook environment (a missing variable raises KeyError)
ws_name, ws_project, ws_bucket = (
    os.environ[_env_var]
    for _env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket URL with its leading gs:// scheme stripped
ws_bucket_name = re.compile('^gs://').sub('', ws_bucket)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.45: 10/18/2024 2:09pm - Nate Calvanese - Fixed performance bug with find_and_add_fileref_fields function.
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.4: 10/18/2024 2:19pm - Nate Calvanese - Updated get_objects_list function to not use fuzzy matching for full file paths
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.17: 9/19/2024 3:51pm - Nate Calvanese - Tweaked VOCAB_MAP function to trim whitespace when joining non-array values to the vocab table
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.8: 9/20/2024 9:06pm -- Added high-priority flags

## Parameters

In [2]:
## Inputs:

# Mapping specification to evaluate
# mapping_target names the folder under ingest_pipeline/mapping/ that holds the target schema object;
# each entry of mapping_target_spec_list names a sub-folder whose mapping_specification.json is
# evaluated against every dataset listed below.
mapping_target = "anvil"
mapping_target_spec_list = ["cmg_ext_2", "cmg_ext_3", "cmg_ext_4", "cmg_ext_5", "gtex_ext_2", "gtex_ext_3", "gregor_1"]

# Any known data_file_refs, so file ref fields can be properly evaluated
# NOTE(review): this dict is not read by the Script Execution cell in the portion of the notebook
# reviewed here (only the fileref utility cell uses the name, after re-assigning it) -- confirm it is still needed.
data_file_refs = {   
}

# Dataset IDs to evaluate
# Each ID is passed to the TDR retrieve_dataset call to pull that dataset's schema.
dataset_id_list = [
    'e293ce2d-af17-4fb0-a84b-47078830c898',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    '65793118-3c88-4185-9172-2354850e6056',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    'ecd0e3b1-a177-4487-8e33-0084688cf148',
    'b2b217c2-4b68-4820-bf9d-e2927bfe8706',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    'ac48514d-0b01-4a92-b164-821fa3e05d7a',
]

## Script Execution

In [3]:
# Establish API clients (created once and reused for every dataset)
api_client = utils.refresh_tdr_api_client()
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through the dataset_ids to evaluate and pull out the existing TDR schemas.
# Resulting shape per dataset:
#   {"tables": [{"name": <table>, "columns": [{"name": <column>}, ...]}, ...],
#    "relationships": [[<from_table>, <to_table>], ...]}
input_datasets_dict = {}
for dataset_id in dataset_id_list:

    # Retrieve source schema
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema = response["schema"]

        # Keep source tables only: any table with "anvil_" in its name is skipped
        dataset_table_list = [
            {"name": table["name"], "columns": [{"name": column["name"]} for column in table["columns"]]}
            for table in schema["tables"]
            if "anvil_" not in table["name"]
        ]

        # Presumably a dataset without relationships can come back with this key missing or None
        # (TODO confirm against the TDR client); treat both as "no relationships" rather than failing.
        dataset_rels_list = [
            [relationship["_from"]["table"], relationship["to"]["table"]]
            for relationship in (schema.get("relationships") or [])
            if "anvil_" not in relationship["_from"]["table"]
        ]

        # Register the dataset only after both parts were built, so a failure part-way through
        # never leaves an entry that has "tables" but no "relationships" key for later steps to trip on.
        input_datasets_dict[dataset_id] = {
            "tables": dataset_table_list,
            "relationships": dataset_rels_list,
        }
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
#print(input_datasets_dict)

# Read in target schema from the workspace bucket.
# On failure the error is printed and target_schema_dict stays an empty dict.
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
target_schema_dict = {}
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    # download_as_text replaces the deprecated download_as_string and matches how the
    # mapping specification files are read further down in this cell.
    target_schema_dict = json.loads(blob.download_as_text(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
#print(json.dumps(target_schema_dict))

# Loop through mapping specifications for evaluation.
# spec_dict[<spec>] = {"detail_dict": {<dataset_id>: {"entities": [...]}},
#                      "missing_table_set": set(), "missing_column_set": set()}
spec_dict = {}
for mapping_target_spec in mapping_target_spec_list:

    # Read in mapping specification, reusing the bucket handle opened for the target schema read
    mapping_spec = {}
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", "Dataset").replace("$PROJECT_NAME", "Project") #UPDATE WITH REAL PARAMETERS
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))
    #print(json.dumps(mapping_spec))

    # A specification that failed to load (or carries no entities) cannot be evaluated;
    # skip it instead of failing with a KeyError and losing the remaining specifications.
    if not isinstance(mapping_spec, dict) or "entities" not in mapping_spec:
        print("Skipping mapping specification {}: no entities available to evaluate.".format(mapping_target_spec))
        continue

    # Index target tables by name; if a name repeats, the last table wins
    target_tables_by_name = {table["name"]: table for table in target_schema_dict["tables"]}

    # Loop through datasets and evaluate mapping compatibility
    detail_dict = {}
    for ds_key, source_schema in input_datasets_dict.items():
        entity_list = []
        for entity in mapping_spec["entities"]:
            target_table = target_tables_by_name.get(entity["name"])
            if not target_table:
                # Entities without a matching target table are left out of the results
                continue
            # The attribute named "<entity without anvil_ prefix>_id" identifies the record set's base table
            id_attr_name = target_table["name"].replace("anvil_", "") + "_id"
            record_set_list = []
            for record_set in entity["record_sets"]:
                can_run = bool(bmq.validate_record_set(record_set, source_schema, target_table))
                valid_attr_count = 0
                invalid_attrs_set = set()
                base_table = "Unspecified"
                for attribute in record_set["attributes"]:
                    if attribute["name"] == id_attr_name:
                        try:
                            # First source field is expected to look like "<table>.<column>"
                            base_table = attribute["source"]["fields"][0].split(".")[0]
                        except (KeyError, IndexError, TypeError, AttributeError):
                            # No usable source field on the id attribute: keep the current base_table
                            pass
                    if bmq.validate_attribute(attribute, source_schema, target_table):
                        valid_attr_count += 1
                    else:
                        invalid_attrs_set.add(target_table["name"] + "." + attribute["name"])
                record_set_list.append({
                    "record_set": record_set["name"],
                    "can_run": can_run,
                    "total_attrs": len(record_set["attributes"]),
                    "valid_attrs": valid_attr_count,
                    "invalid_attrs_set": list(invalid_attrs_set),
                    "base_table": base_table,
                })
            entity_list.append({entity["name"]: record_set_list})
        detail_dict[ds_key] = {"entities": entity_list}
    #print(json.dumps(detail_dict))

    # Collect target tables and columns not in mapping specification
    entity_table_set = {entity["name"] for entity in mapping_spec["entities"]}
    entity_column_set = {
        entity["name"] + "." + attribute["name"]
        for entity in mapping_spec["entities"]
        for record_set in entity["record_sets"]
        for attribute in record_set["attributes"]
    }
    missing_table_set = set()
    missing_column_set = set()
    for table_entry in target_schema_dict["tables"]:
        if table_entry["name"] not in entity_table_set:
            missing_table_set.add(table_entry["name"])
            continue
        for column_entry in table_entry["columns"]:
            column_name = table_entry["name"] + "." + column_entry["name"]
            if column_name not in entity_column_set:
                missing_column_set.add(column_name)

    spec_dict[mapping_target_spec] = {
        "detail_dict": detail_dict,
        "missing_table_set": missing_table_set,
        "missing_column_set": missing_column_set,
    }

# Summarize mapping compatibility
def _sources_have_ri(sources_a, sources_b, relationships):
    """Return True when some table in sources_a equals, or is directly related to, a table in sources_b.

    sources_a, sources_b: iterables of source (base) table names.
    relationships: list of [from_table, to_table] pairs taken from the dataset schema.

    Two entities built from the same base table count as related even when the dataset
    declares no relationships at all.
    """
    for table_a in sources_a:
        for table_b in sources_b:
            if table_a == table_b:
                return True
            if any(table_a in rel_pair and table_b in rel_pair for rel_pair in relationships):
                return True
    return False


# Output columns, in display order; every result row is built from this list so values cannot drift out of position
results_columns = ['dataset_id', 'mapping_spec', 'compatibility_score', 'can_run_count', 'can_run_entities', 'can_fully_run_count', 'can_fully_run_entities', 'cnt_valid_attrs', 'perc_valid_attrs', 'invalid_attr_list', 'seqactivity_source', 'activity_source', 'biosample_source', 'donor_source', 'diagnosis_source', 'seqactivity_biosample_ri', 'activity_biosample_ri', 'biosample_donor_ri', 'diagnosis_donor_ri']

# Target entities whose base (source) tables are tracked for the referential-integrity checks
tracked_source_entities = ["anvil_sequencingactivity", "anvil_activity", "anvil_biosample", "anvil_donor", "anvil_diagnosis"]

results_list = []
for spec_key, spec_val in spec_dict.items():
    for ds_key, value in spec_val["detail_dict"].items():
        can_run_set = set()
        can_run_fully_set = set()
        sum_valid_attrs = 0
        sum_total_attrs = 0
        invalid_attrs_list = []
        sources = {entity_name: set() for entity_name in tracked_source_entities}
        for entities in value["entities"]:
            max_valid_attrs = 0
            max_total_attrs = 0
            for key, val in entities.items():
                # Union of invalid attributes over every record set of the entity
                # (an entity with no record sets simply contributes nothing)
                invalid_attrs_set = set()
                for record_sets in val:
                    if record_sets["can_run"]:
                        can_run_set.add(key)
                        if key in sources:
                            sources[key].add(record_sets["base_table"])
                    if record_sets["total_attrs"] == record_sets["valid_attrs"]:
                        can_run_fully_set.add(key)
                    # Each entity is counted by its best record set
                    max_valid_attrs = max(max_valid_attrs, record_sets["valid_attrs"])
                    max_total_attrs = max(max_total_attrs, record_sets["total_attrs"])
                    invalid_attrs_set.update(record_sets["invalid_attrs_set"])
                sum_valid_attrs += max_valid_attrs
                sum_total_attrs += max_total_attrs
                # Sorted so the reported list is stable from run to run
                invalid_attrs_list.extend(sorted(invalid_attrs_set))
        # Guard against specs/datasets that produced no attributes at all
        percent_valid_attrs = round(sum_valid_attrs/sum_total_attrs,2) if sum_total_attrs else 0.0

        # Check RI between fields
        relationships = input_datasets_dict[ds_key].get("relationships", [])
        seqactivity_biosample_ri = _sources_have_ri(sources["anvil_sequencingactivity"], sources["anvil_biosample"], relationships)
        activity_biosample_ri = _sources_have_ri(sources["anvil_activity"], sources["anvil_biosample"], relationships)
        biosample_donor_ri = _sources_have_ri(sources["anvil_biosample"], sources["anvil_donor"], relationships)
        diagnosis_donor_ri = _sources_have_ri(sources["anvil_diagnosis"], sources["anvil_donor"], relationships)

        # Score compatibility (maximum 100):
        #   donor runnable                                   -> 25
        #   biosample runnable 5, plus linked to donor       -> 20
        #   diagnosis runnable 5, plus linked to donor       -> 20
        #   any activity runnable 5, plus linked to biosample -> 20
        compatibility_score = 0
        if "anvil_donor" in can_run_set:
            compatibility_score += 25
        if "anvil_biosample" in can_run_set:
            compatibility_score += 5
            if "anvil_biosample.donor_id" not in invalid_attrs_list and biosample_donor_ri:
                compatibility_score += 20
        if "anvil_diagnosis" in can_run_set:
            compatibility_score += 5
            if "anvil_diagnosis.donor_id" not in invalid_attrs_list and diagnosis_donor_ri:
                compatibility_score += 20
        if "anvil_activity" in can_run_set or "anvil_sequencingactivity" in can_run_set:
            compatibility_score += 5
            activity_linked = ("anvil_activity" in can_run_set and "anvil_activity.used_biosample_id" not in invalid_attrs_list and activity_biosample_ri)
            seqactivity_linked = ("anvil_sequencingactivity" in can_run_set and "anvil_sequencingactivity.used_biosample_id" not in invalid_attrs_list and seqactivity_biosample_ri)
            if activity_linked or seqactivity_linked:
                compatibility_score += 20

        result_row = {
            'dataset_id': ds_key,
            'mapping_spec': spec_key,
            'compatibility_score': compatibility_score,
            'can_run_count': len(can_run_set),
            'can_run_entities': can_run_set,
            'can_fully_run_count': len(can_run_fully_set),
            'can_fully_run_entities': can_run_fully_set,
            'cnt_valid_attrs': sum_valid_attrs,
            'perc_valid_attrs': percent_valid_attrs,
            'invalid_attr_list': invalid_attrs_list,
            'seqactivity_source': sources["anvil_sequencingactivity"],
            'activity_source': sources["anvil_activity"],
            'biosample_source': sources["anvil_biosample"],
            'donor_source': sources["anvil_donor"],
            'diagnosis_source': sources["anvil_diagnosis"],
            'seqactivity_biosample_ri': seqactivity_biosample_ri,
            'activity_biosample_ri': activity_biosample_ri,
            'biosample_donor_ri': biosample_donor_ri,
            'diagnosis_donor_ri': diagnosis_donor_ri,
        }
        # Rows stay plain lists (one value per column) as before
        results_list.append([result_row[column] for column in results_columns])

results_df = pd.DataFrame(results_list, columns = results_columns)

# Sort results dataframe, aggregate compatible mapping specs, and write both out to tsv in the workspace bucket
destination_dir = "ingest_pipeline/resources/mapping_compatibility/output"
# Minimum compatibility_score for a mapping spec to be listed as compatible with a dataset
min_compatible_score = 60

sorted_df = results_df.sort_values(['dataset_id', 'compatibility_score', 'can_run_count', 'perc_valid_attrs'], ascending=[True, False, False, False], ignore_index=True)

# One row per dataset; specs are listed best-first because the frame is sorted before grouping
agg_df = (
    results_df[results_df["compatibility_score"] >= min_compatible_score]
    .sort_values(["dataset_id", "can_fully_run_count", "can_run_count", "perc_valid_attrs", "mapping_spec"], ascending=[True, False, False, False, True])
    .groupby('dataset_id')
    .agg(compatible_mapping_specs=('mapping_spec', 'unique'))
    .reset_index()
)

# Upload through the storage client already used to read from this bucket. The previous
# `gsutil cp ... 2> stdout` wrote gsutil's error stream to a stray local file named "stdout",
# so a failed upload went unnoticed; failures are now reported in the notebook output.
for output_file, output_df in (
    ("mapping_compatibility_results.tsv", sorted_df),
    ("mapping_compatibility_aggregation.tsv", agg_df),
):
    try:
        output_blob = bucket.blob(f"{destination_dir}/{output_file}")
        output_blob.upload_from_string(
            output_df.to_csv(index=False, sep="\t"),
            content_type="text/tab-separated-values",
        )
    except Exception as e:
        print("Error writing {} to the workspace bucket. Error: {}".format(output_file, e))

# Report to the user: per-spec coverage gaps in the target schema, then the per-dataset evaluation table
divider = "------------------------------------------------------------------------------------------------------"
print(divider)
print("Mapping Compatibility Results for Mapping Target Specifications:")
print(divider)
for heading, missing_key in (
    ("Target tables not included in specifications:", "missing_table_set"),
    ("Target fields not included in specification:", "missing_column_set"),
):
    print(heading)
    for spec_key, spec_val in spec_dict.items():
        # One tab-indented line per spec, names in alphabetical order
        print("\t" + spec_key + ": " + ", ".join(sorted(spec_val[missing_key])))
    print("\n")
print("Dataset evaluation against specifications:")
display(sorted_df)

------------------------------------------------------------------------------------------------------
Mapping Compatibility Results for Mapping Target Specifications:
------------------------------------------------------------------------------------------------------
Target tables not included in specifications:
	cmg_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_4: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_5: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	gtex_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_sequencingactivity, anvil_variantcallingactivity
	gtex_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_sequencingactivity, anvil_variantcallingactivity
	gregor_1: 

Unnamed: 0,dataset_id,mapping_spec,compatibility_score,can_run_count,can_run_entities,can_fully_run_count,can_fully_run_entities,cnt_valid_attrs,perc_valid_attrs,invalid_attr_list,seqactivity_source,activity_source,biosample_source,donor_source,diagnosis_source,seqactivity_biosample_ri,activity_biosample_ri,biosample_donor_ri,diagnosis_donor_ri
0,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,cmg_ext_2,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",39,0.87,"[anvil_biosample.biosample_type, anvil_biosample.anatomical_site, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
1,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,cmg_ext_3,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",38,0.84,"[anvil_biosample.anatomical_site, anvil_biosample.biosample_type, anvil_biosample.donor_id, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
2,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,cmg_ext_4,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",38,0.84,"[anvil_biosample.anatomical_site, anvil_biosample.biosample_type, anvil_biosample.donor_id, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
3,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,cmg_ext_5,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",38,0.84,"[anvil_biosample.anatomical_site, anvil_biosample.biosample_type, anvil_biosample.donor_id, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
4,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,gtex_ext_2,75,6,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_activity, anvil_project}",4,"{anvil_file, anvil_dataset, anvil_project, anvil_activity}",28,0.78,"[anvil_donor.phenotypic_sex, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.generated_file_id]",{},{sample},{sample},{participant},{},False,True,True,False
5,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,gtex_ext_3,75,6,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_activity, anvil_project}",4,"{anvil_file, anvil_dataset, anvil_project, anvil_activity}",28,0.78,"[anvil_donor.phenotypic_sex, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.generated_file_id]",{},{sample},{sample},{participant},{},False,True,True,False
6,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,gregor_1,25,4,"{anvil_donor, anvil_file, anvil_dataset, anvil_project}",3,"{anvil_file, anvil_dataset, anvil_project}",24,0.59,"[anvil_donor.reported_ethnicity, anvil_donor.phenotypic_sex, anvil_diagnosis.donor_id, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_id, anvil_activity.activity_id, anvil_activity.used_biosample_id, anvil_activity.generated_file_id, anvil_variantcallingactivity.used_file_id, anvil_variantcallingactivity.generated_file_id, anvil_variantcallingactivity.variantcallingactivity_id]",{},{},{},{participant},{},False,False,False,False
7,15be288e-53e1-41cb-8d20-8ea87efb9258,cmg_ext_2,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",39,0.87,"[anvil_biosample.biosample_type, anvil_biosample.anatomical_site, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
8,15be288e-53e1-41cb-8d20-8ea87efb9258,cmg_ext_3,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",38,0.84,"[anvil_biosample.anatomical_site, anvil_biosample.biosample_type, anvil_biosample.donor_id, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True
9,15be288e-53e1-41cb-8d20-8ea87efb9258,cmg_ext_4,80,7,"{anvil_donor, anvil_dataset, anvil_file, anvil_biosample, anvil_diagnosis, anvil_activity, anvil_project}",5,"{anvil_donor, anvil_dataset, anvil_file, anvil_activity, anvil_project}",38,0.84,"[anvil_biosample.anatomical_site, anvil_biosample.biosample_type, anvil_biosample.donor_id, anvil_diagnosis.phenotype, anvil_sequencingactivity.used_biosample_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.sequencingactivity_id]",{},{sample},{sample},{subject},{subject},False,True,False,True


# Utility Scripts

In [None]:
## Print detailed results for specific dataset
# Results are keyed by dataset ID (not workspace name), and the bare detail_dict variable only
# holds the last mapping spec evaluated, so look the entry up through spec_dict instead.
# Change these two values to inspect a different spec / dataset combination.
detail_mapping_spec = mapping_target_spec_list[0]
detail_dataset_id = dataset_id_list[0]
print(json.dumps(spec_dict[detail_mapping_spec]["detail_dict"][detail_dataset_id], indent=2))

## Pull results for specific datasets

In [None]:
## Print workspace dict for specific workspace
# NOTE(review): workspace_dict is not defined anywhere in the portion of the notebook reviewed here,
# so this cell raises NameError on a fresh kernel. It looks like a leftover from a workspace-based
# version of the script -- confirm whether it should read from input_datasets_dict or be removed.
print(json.dumps(workspace_dict["anvil_gtex_bcm_gru_corsivs"], indent=2))

## Checking workspaces for fileref fields

In [None]:
## Checking for fileref fields
# Collects the file reference fields reported by utils.find_and_add_fileref_fields for each
# workspace listed below and prints them as one sorted list.
# NOTE: this cell re-assigns ws_project, ws_name and data_file_refs, replacing the values set in
# the imports and parameters cells for the rest of the kernel session.
ws_project = "anvil-datastorage"
data_file_refs = {}
ws_name_list = [
"1000G-high-coverage-2019"
]
file_ref_set = set()
for ws_name in ws_name_list:
    try:
        ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
        # Fall back to an empty bucket name when the attribute is missing or empty
        workspace_bucket = ws_attributes.get("bucketName") or ""
        file_ref_list, data_file_refs, remote_list = utils.find_and_add_fileref_fields(ws_project, ws_name, workspace_bucket, data_file_refs)
        file_ref_set.update(file_ref_list)
    except Exception as e:
        # Still move on to the next workspace, but say which one failed and why
        print("Error checking fileref fields for workspace {}. Error: {}".format(ws_name, e))
print(sorted(file_ref_set))



## Collecting schemas for workspaces

In [None]:
## Collecting the schema for a specific workspace
# Builds one (workspace_name, table_name, column_name) row, all lower-cased, for every column of
# every entity type in the listed workspaces. fapi comes from the imports cell at the top of the notebook.
ws_project = "anvil-datastorage"
ws_name_list = [
'ANVIL_CMG_BROAD_BRAIN_ENGLE_WES',
'ANVIL_CMG_Broad_Genitourinary_Sinclair_WES',
'ANVIL_CMG_Broad_Orphan_Jueppner_WES',
'AnVIL_CMG_Broad_Genitourinary_Hirschhorn_WES',
'AnVIL_CMG_Broad_Orphan_Scott_WES',
'AnVIL_CMG_Broad_Orphan_VCGS-White_WES',
'AnVIL_CMG_Broad_Stillbirth_Wilkins-Haug_WES',
]
schema_fields = []

# Loop through workspaces
for ws_name in ws_name_list:

    try:
        # Collect and record all entity types in workspace
        response_etypes = fapi.list_entity_types(ws_project, ws_name)
        if response_etypes.status_code != 200:
            # An error payload would not have the entity-type structure parsed below
            print("Error retrieving entity types for workspace {}. Error: {}".format(ws_name, response_etypes.text))
            continue
        dict_all_etypes = json.loads(response_etypes.text)

        # Loop through entity types and parse result to build schema
        for etype, etype_info in dict_all_etypes.items():
            # The id column plus every attribute, de-duplicated; sorted so row order is stable across runs
            column_set = {etype_info["idName"], *etype_info["attributeNames"]}
            for column in sorted(column_set):
                schema_fields.append([ws_name.lower(), etype.lower(), column.lower()])
    except Exception as e:
        # Keep going with the remaining workspaces, but report the failure instead of hiding it
        print("Error collecting schema for workspace {}. Error: {}".format(ws_name, e))

# Convert to dataframe and display
df = pd.DataFrame(schema_fields, columns = ["workspace_name", "table_name", "column_name"])
display(df)