In [None]:
# Version History
#print("Version 1.0.0: 09/15/2022 2:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/16/2022 3:10pm - Nate Calvanese - Shifted from transform to mapping compatibility")
#print("Version 1.0.2: 10/14/2022 7:40pm - Nate Calvanese - Added compatibility evaluation and support for multiple mapping specs")
#print("Version 1.0.3: 10/18/2022 1:33pm - Nate Calvanese - Encoded column names to match mapping specifications")
print("Version 1.0.4: 10/20/2022 11:50am - Nate Calvanese - Added ability to pull schemas for workspaces missing from workspace_schemas.csv")



In [None]:
#!pip install --upgrade import_ipynb

# Main Script

In [3]:
## Imports and environment variables
# imports
import import_ipynb
import pandas as pd
import json
import re
import os
from google.cloud import storage
from firecloud import api as fapi
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq

# Configure pandas display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# workspace environment variables
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

print(f"workspace name = {ws_name}")
print(f"workspace project = {ws_project}")
print(f"workspace bucket = {ws_bucket}")
print(f"workspace bucket name = {ws_bucket_name}")

workspace name = anvil_workspace_ingest_resources_dev
workspace project = dsp-data-ingest
workspace bucket = gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03
workspace bucket name = fc-2a9eefc3-0302-427f-9ac3-82f078741c03


In [4]:
## Inputs:

# Mapping specification to evaluate
mapping_target = "anvil"
mapping_target_spec_list = ["cmg_ext_2", "cmg_ext_3", "cmg_ext_4", "gtex_ext_2", "gtex_ext_3"]

# Any known data_file_refs, so file ref fields can be properly evaluated
data_file_refs = {   
    "sequencing": [{
        "column": "sequencing_id",
        "method": "file_path_match",
        "match_multiple_files": True, 
        "match_regex": None,
        "create_new_field": True,
        "new_field_name": "sequencing_id_file_id"
    }]
}

In [5]:
# Read the workspace_schemas.csv file into dataframe, clean column names, and convert to dict
ws_schema_filepath = ws_bucket + '/ingest_pipeline/resources/mapping_compatibility/workspace_schemas.csv'
df = pd.read_csv(ws_schema_filepath)
#int_dict = df[df["workspace_name"] == 'anvil_ccdg_asc_ndd_daly_talkowski_aleksic_asd_exome'].to_dict(orient="records")
int_dict = df.to_dict(orient="records")

# Derived pipeline tables: Should be added to every source workspace schema
file_inv_dict = {"name": "file_inventory",
                 "columns": [
                     {"name": "file_id"},
                     {"name": "name"},
                     {"name": "path"},
                     {"name": "uri"},
                     {"name": "content_type"},
                     {"name": "full_extension"},
                     {"name": "size_in_bytes"},
                     {"name": "crc32c"},
                     {"name": "md5_hash"},
                     {"name": "file_ref"}
                 ]}
ws_attr_dict = {"name": "workspace_attributes",
                 "columns": [
                     {"name": "attribute"},
                     {"name": "value"}
                 ]}

# Re-organize workspace dict into TDR-like schema to enable use of existing functions
workspace_dict = {}
for idx, record in enumerate(int_dict):
    if idx == 0:
        current_workspace = record["workspace_name"]
        previous_workspace = record["workspace_name"]
        current_table = record["table_name"]
        previous_table = record["table_name"]
        current_column = utils.encode_name(record["column_name"])
        table_list = []
        table_dict = {}
        table_dict["name"] = current_table
        table_dict["columns"] = []
        column_set = set()
        column_dict = {}
        entity_column = record["table_name"] + "_id"
        column_dict["name"] = entity_column
        column_set.add(entity_column)
        table_dict["columns"].append(column_dict)
        column_dict = {}
        column_dict["name"] = current_column
        column_set.add(current_column)
        table_dict["columns"].append(column_dict)
    else:
        current_workspace = record["workspace_name"]
        current_table = record["table_name"]
        current_column = utils.encode_name(record["column_name"])
        if current_workspace != previous_workspace:
            table_list.append(table_dict)
            table_list.append(file_inv_dict)
            table_list.append(ws_attr_dict)
            workspace_dict[previous_workspace] = {}
            workspace_dict[previous_workspace]["tables"] = table_list
            table_list = []
            table_dict = {}
            column_set = set()
            table_dict["name"] = current_table
            table_dict["columns"] = []
            column_dict = {}
            entity_column = record["table_name"] + "_id"
            column_dict["name"] = entity_column
            column_set.add(entity_column)
            table_dict["columns"].append(column_dict)
            column_dict = {}
            column_dict["name"] = current_column
            column_set.add(current_column)
            table_dict["columns"].append(column_dict)
        else:
            if current_table != previous_table:
                for key, value in data_file_refs.items():
                    if key.split(".")[0] == previous_table:
                        for entry in value:
                            if utils.encode_name(entry["column"]) in column_set:
                                if entry["create_new_field"] == True:
                                    column_dict = {}
                                    column_dict["name"] = utils.encode_name(entry["new_field_name"])
                                    column_set.add(utils.encode_name(entry["new_field_name"]))
                                    table_dict["columns"].append(column_dict)
                table_list.append(table_dict)
                table_dict = {}
                column_set = set()
                table_dict["name"] = current_table
                table_dict["columns"] = []
                column_dict = {}
                entity_column = record["table_name"] + "_id"
                column_dict["name"] = entity_column
                column_set.add(entity_column)
                table_dict["columns"].append(column_dict)
                column_dict = {}
                column_dict["name"] = current_column
                column_set.add(current_column)
                table_dict["columns"].append(column_dict)
                previous_table = current_table
            else:
                column_dict = {}
                column_dict["name"] = current_column
                column_set.add(current_column)
                table_dict["columns"].append(column_dict)
        previous_workspace = current_workspace
        previous_table = current_table
    if idx == len(int_dict)-1:
        for key, value in data_file_refs.items():
            if key.split(".")[0] == previous_table:
                for entry in value:
                    if utils.encode_name(entry["column"]) in column_set:
                        if entry["create_new_field"] == True:
                            column_dict = {}
                            column_dict["name"] = utils.encode_name(entry["new_field_name"])
                            column_set.add(utils.encode_name(entry["new_field_name"]))
                            table_dict["columns"].append(column_dict)
        table_list.append(table_dict)
        table_list.append(file_inv_dict)
        table_list.append(ws_attr_dict)
        workspace_dict[previous_workspace] = {}
        workspace_dict[previous_workspace]["tables"] = table_list
#print(json.dumps(workspace_dict))
                              
# Read in target schema
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
target_schema_dict = {}
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
#print(json.dumps(target_schema_dict))

# Loop through mapping specifications for evaluation
spec_dict = {}
for mapping_target_spec in mapping_target_spec_list:


    # Read in mapping specification
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    mapping_spec = {}
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", "Dataset").replace("$PROJECT_NAME", "Project") #UPDATE WITH REAL PARAMETERS
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))
    #print(json.dumps(mapping_spec))

    # Loop through workspaces and evaluate mapping compatibility 
    detail_dict = {}
    for ws_key in workspace_dict:
        entity_list = []
        for entity in mapping_spec["entities"]:
            entity_dict = {}
            record_set_list = []
            target_table = {}
            for table in target_schema_dict["tables"]:
                if table["name"] == entity["name"]:
                    target_table = table
            if target_table:
                for record_set in entity["record_sets"]:
                    record_set_dict = {}
                    record_set_dict["record_set"] = record_set["name"]
                    if bmq.validate_record_set(record_set, workspace_dict[ws_key], target_table):
                        record_set_dict["can_run"] = True
                    else:
                        record_set_dict["can_run"] = False
                    record_set_dict["total_attrs"] = len(record_set["attributes"])
                    valid_attr_count = 0
                    invalid_attrs_set = set()
                    for attribute in record_set["attributes"]:
                        if bmq.validate_attribute(attribute, workspace_dict[ws_key], target_table):
                            valid_attr_count += 1
                        else:
                            invalid_attrs_set.add(target_table["name"] + "." + attribute["name"])
                    record_set_dict["valid_attrs"] = valid_attr_count
                    record_set_dict["invalid_attrs_set"] = list(invalid_attrs_set)
                    record_set_list.append(record_set_dict)
                entity_dict[entity["name"]] = record_set_list
                entity_list.append(entity_dict)
        detail_dict[ws_key] = {}
        detail_dict[ws_key]["entities"] = entity_list
    #print(json.dumps(detail_dict))
    spec_dict[mapping_target_spec] = {}
    spec_dict[mapping_target_spec]["detail_dict"] = detail_dict

    # Collect target tables and columns not in mapping specification
    missing_table_set = set()
    missing_column_set = set()
    entity_table_list = [val["name"] for val in mapping_spec["entities"]]
    entity_column_list = []
    for entity in mapping_spec["entities"]:
        entity_name = entity["name"]
        for record_set in entity["record_sets"]:
            for attribute in record_set["attributes"]:
                attribute_name = entity_name + "." + attribute["name"]
                entity_column_list.append(attribute_name)
    for table_entry in target_schema_dict["tables"]:
        if table_entry["name"] not in entity_table_list:
            missing_table_set.add(table_entry["name"])
        else:
            for column_entry in table_entry["columns"]:
                column_name = table_entry["name"] + "." + column_entry["name"]
                if column_name not in entity_column_list:
                    missing_column_set.add(column_name)
    spec_dict[mapping_target_spec]["missing_table_set"] = missing_table_set
    spec_dict[mapping_target_spec]["missing_column_set"] = missing_column_set

# Summarize mapping compatibility
results_list = []
for spec_key, spec_val in spec_dict.items():
    for ws_key, value in spec_val["detail_dict"].items():
        workspace_results_list = []
        can_run_set = set()
        can_run_fully_set = set()
        sum_valid_attrs = 0
        sum_total_attrs = 0
        invalid_attrs_list = []
        for entities in value["entities"]:
            max_valid_attrs = 0
            max_total_attrs = 0
            for key, val in entities.items():
                invalid_attrs_set = set(val[0]["invalid_attrs_set"])
                for record_sets in val:
                    if record_sets["can_run"] == True:
                        can_run_set.add(key)
                    if record_sets["total_attrs"] == record_sets["valid_attrs"]:
                        can_run_fully_set.add(key)
                    if record_sets["valid_attrs"] > max_valid_attrs:
                        max_valid_attrs = record_sets["valid_attrs"]
                    if record_sets["total_attrs"] > max_total_attrs:
                        max_total_attrs = record_sets["total_attrs"]
                    invalid_attrs_set = invalid_attrs_set.union(set(record_sets["invalid_attrs_set"]))
                sum_valid_attrs += max_valid_attrs
                sum_total_attrs += max_total_attrs
                invalid_attrs_list.extend(list(invalid_attrs_set))
        percent_valid_attrs = round(sum_valid_attrs/sum_total_attrs,2)
        if "anvil_donor" in can_run_set and ("anvil_biosample" in can_run_set and "anvil_biosample.donor_id" not in invalid_attrs_list) and (("anvil_activity" in can_run_set and "anvil_activity.used_biosample_id" not in invalid_attrs_list) or ("anvil_sequencingactivity" in can_run_set and "anvil_sequencingactivity.used_biosample_id" not in invalid_attrs_list)):
            compatible = "Y"
        else:
            compatible = "N"
        workspace_results_list.append(ws_key)
        workspace_results_list.append(spec_key)
        workspace_results_list.append(compatible)
        workspace_results_list.append(len(can_run_set))
        workspace_results_list.append(can_run_set)
        workspace_results_list.append(len(can_run_fully_set))
        workspace_results_list.append(can_run_fully_set)
        workspace_results_list.append(sum_valid_attrs)
        workspace_results_list.append(percent_valid_attrs)
        workspace_results_list.append(invalid_attrs_list)
        results_list.append(workspace_results_list)

results_df = pd.DataFrame(results_list, columns = ['workspace', 'mapping_spec', 'compatible', 'can_run_count', 'can_run_entities', 'can_fully_run_count', 'can_fully_run_entities', 'cnt_valid_attrs', 'perc_valid_attrs', 'invalid_attr_list'])

# Sort results dataframe and write out to tsv
destination_dir = "ingest_pipeline/resources/mapping_compatibility/output"
sorted_df = results_df.sort_values(['mapping_spec', 'compatible', 'can_run_count', 'perc_valid_attrs'], ascending=[True, False, False, False], ignore_index=True)
output_file = "mapping_compatibility_results.tsv"
sorted_df.to_csv(output_file, index=False, sep="\t")
!gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
!rm $output_file

# Aggregate compatible mapping specs and write out to tsv
agg_df = results_df[results_df["compatible"] == "Y"].sort_values(["can_fully_run_count", "can_run_count", "perc_valid_attrs", "mapping_spec"], ascending=[False, False, False, True]).groupby('workspace').agg(compatible_mapping_specs=('mapping_spec', 'unique')).reset_index()
output_file = "mapping_compatibility_aggregation.tsv"
agg_df.to_csv(output_file, index=False, sep="\t")
!gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
!rm $output_file

# Output results to the user
print("------------------------------------------------------------------------------------------------------")
print("Mapping Compatibility Results for Mapping Target Specifications:")
print("------------------------------------------------------------------------------------------------------")
print("Target tables not included in specifications:")
for spec_key, spec_val in spec_dict.items():
    print("\t" + spec_key + ": " + ", ".join(sorted(list(spec_val["missing_table_set"]))))
print("\n")
print("Target fields not included in specification:")
for spec_key, spec_val in spec_dict.items():
    print("\t" + spec_key + ": " + ", ".join(sorted(list(spec_val["missing_column_set"]))))
print("\n")
print("Workspace evaluation against specifications:")
display(sorted_df)


------------------------------------------------------------------------------------------------------
Mapping Compatibility Results for Mapping Target Specifications:
------------------------------------------------------------------------------------------------------
Target tables not included in specifications:
	cmg_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_4: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	gtex_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_diagnosis, anvil_sequencingactivity, anvil_variantcallingactivity
	gtex_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_diagnosis, anvil_sequencingactivity, anvil_variantcallingactivity


Target fields not included in specification:
	cmg_ext_2: anvil_activity.source

Unnamed: 0,workspace,mapping_spec,compatible,can_run_count,can_run_entities,can_fully_run_count,can_fully_run_entities,cnt_valid_attrs,perc_valid_attrs,invalid_attr_list
0,anvil_cmg_broad_brain_engle_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
1,anvil_cmg_broad_genitourinary_sinclair_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
2,anvil_cmg_broad_orphan_jueppner_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
3,anvil_cmg_broad_genitourinary_hirschhorn_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
4,anvil_cmg_broad_orphan_scott_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
5,anvil_cmg_broad_orphan_vcgs-white_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
6,anvil_cmg_broad_stillbirth_wilkins-haug_wes,cmg_ext_2,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
7,anvil_cmg_broad_brain_engle_wes,cmg_ext_3,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
8,anvil_cmg_broad_genitourinary_sinclair_wes,cmg_ext_3,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"
9,anvil_cmg_broad_orphan_jueppner_wes,cmg_ext_3,Y,6,"{anvil_project, anvil_sequencingactivity, anvil_dataset, anvil_donor, anvil_biosample, anvil_file}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",31,0.7,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_upper_bound, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_lower_bound, anvil_diagnosis.onset_age_unit, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.disease, anvil_activity.generated_file_id]"


# Utility Scripts

## Pull results for specific workspaces

In [None]:
## Print detailed results for specific workspace
print(json.dumps(detail_dict["anvil_cmg_uwash_ds-hfa"], indent=2))

In [None]:
## Print workspace dict for specific workspace
print(json.dumps(workspace_dict["anvil_gtex_bcm_gru_corsivs"], indent=2))

## Checking workspaces for fileref fields

In [None]:
## Checking for fileref fields
ws_project = "anvil-datastorage"
data_file_refs = {}
ws_name_list = [
"1000G-high-coverage-2019"
]
file_ref_set = set()
for ws_name in ws_name_list:
    file_ref_list = []
    try:
        ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
        workspace_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else "" 
        file_ref_list, data_file_refs, remote_list = utils.find_and_add_fileref_fields(ws_project, ws_name, workspace_bucket, data_file_refs)
        for entry in file_ref_list:
            file_ref_set.add(entry)
    except:
        continue
print(sorted(list(file_ref_set)))



## Collecting schemas for workspaces

In [2]:
## Collecting the schema for a specific workspace
from firecloud import api as fapi
ws_project = "anvil-datastorage"
ws_name_list = [
'ANVIL_CMG_BROAD_BRAIN_ENGLE_WES',
'ANVIL_CMG_Broad_Genitourinary_Sinclair_WES',
'ANVIL_CMG_Broad_Orphan_Jueppner_WES',
'AnVIL_CMG_Broad_Genitourinary_Hirschhorn_WES',
'AnVIL_CMG_Broad_Orphan_Scott_WES',
'AnVIL_CMG_Broad_Orphan_VCGS-White_WES',
'AnVIL_CMG_Broad_Stillbirth_Wilkins-Haug_WES',
]
schema_fields = []

# Loop through workspaces
for ws_name in ws_name_list:

    try:
        # Collect and record all entity types in workspace
        response_etypes = fapi.list_entity_types(ws_project, ws_name)
        dict_all_etypes = json.loads(response_etypes.text)
        etypes_list = [key for key in dict_all_etypes.keys()]

        # Loop through entity types and parse result to build schema
        if etypes_list:
            for etype in etypes_list:
                column_set = set()
                column_set.add(dict_all_etypes[etype]["idName"])
                for attr_key in dict_all_etypes[etype]["attributeNames"]:
                    column_set.add(attr_key)
                for column in column_set:
                    column_entry = []
                    column_entry = [ws_name.lower(), etype.lower(), column.lower()]
                    schema_fields.append(column_entry)
    except:
        pass

# Convert to dataframe and display
df = pd.DataFrame(schema_fields, columns = ["workspace_name", "table_name", "column_name"])
display(df)

Unnamed: 0,workspace_name,table_name,column_name
0,anvil_cmg_broad_brain_engle_wes,subject,19-disease_description
1,anvil_cmg_broad_brain_engle_wes,subject,01-subject_id
2,anvil_cmg_broad_brain_engle_wes,subject,subject_id
3,anvil_cmg_broad_brain_engle_wes,subject,22-age_of_onset
4,anvil_cmg_broad_brain_engle_wes,subject,07-multiple_datasets
5,anvil_cmg_broad_brain_engle_wes,subject,20-affected_status
6,anvil_cmg_broad_brain_engle_wes,subject,11-twin_id
7,anvil_cmg_broad_brain_engle_wes,subject,14-ancestry
8,anvil_cmg_broad_brain_engle_wes,subject,02-prior_testing
9,anvil_cmg_broad_brain_engle_wes,subject,12-proband_relationship
